def test_get_global_nodelist_frm_wrklst_file():
    """Check that a ``node_list`` file in the run directory overrides env variables.

    An empty ``node_list`` file must raise ``ResourcesException``; a populated
    one must be returned verbatim, both with and without an ``EnvResources``
    object that points at a populated environment variable.

    The ``node_list`` file is removed in a ``finally`` clause so that a failed
    assertion cannot leave it behind for other tests that resolve the node
    list from the current working directory.
    """
    # node_list file should override env variables
    os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = "20-22,137-139,1234"  # Should not be this
    exp_out = ['knl-0019', 'knl-0021', 'knl-0022', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-2345']  # Should be this

    try:
        # An empty node_list file is an error rather than "no nodes".
        open('node_list', 'w').close()
        try:
            _ = Resources.get_global_nodelist(rundir=os.getcwd())
        except ResourcesException as e:
            assert e.args[0] == 'Error. global_nodelist is empty'
        else:
            assert 0, "Expected ResourcesException for an empty node_list file"

        with open('node_list', 'w') as f:
            for node in exp_out:
                f.write(node + '\n')

        # Do not specify env vars.
        global_nodelist1 = Resources.get_global_nodelist(rundir=os.getcwd())
        assert global_nodelist1 == exp_out, "global_nodelist returned does not match expected"

        # Specify env vars - should ignore
        env_resources = EnvResources(nodelist_env_slurm="THIS_ENV_VARIABLE_IS_DEF_NOT_SET",
                                     nodelist_env_cobalt="LIBE_RESOURCES_TEST_NODE_LIST",
                                     nodelist_env_lsf="THIS_ENV_VARIABLE_IS_DEF_NOT_SET",
                                     nodelist_env_lsf_shortform="THIS_ENV_VARIABLE_IS_DEF_NOT_SET")
        global_nodelist2 = Resources.get_global_nodelist(rundir=os.getcwd(), env_resources=env_resources)
        assert global_nodelist2 == exp_out, "global_nodelist returned does not match expected"
    finally:
        # Clean up even when an assertion above fails.
        if os.path.exists('node_list'):
            os.remove('node_list')
def test_get_local_nodelist_central_mode_remove_libE_proc():
    """Check central-mode node splitting when this host is listed as a libE node.

    The current hostname is written into the middle of ``node_list`` and then
    registered through ``add_comm_info(libE_nodes=[mynode])``; the expected
    per-worker lists below therefore never contain it.

    The ``node_list`` file is removed in a ``finally`` clause so that a failed
    assertion cannot leave it behind for other tests.
    """
    mynode = socket.gethostname()
    nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']

    # (num_workers, expected local nodelist for workerID 1..num_workers)
    expected_by_worker_count = [
        (8, [['knl-0020'], ['knl-0021'], ['knl-0022'], ['knl-0036'],
             ['knl-0137'], ['knl-0138'], ['knl-0139'], ['knl-1234']]),
        (4, [['knl-0020', 'knl-0021'], ['knl-0022', 'knl-0036'],
             ['knl-0137', 'knl-0138'], ['knl-0139', 'knl-1234']]),
        (1, [['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036',
              'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']]),
        # Test the best_split algorithm
        (3, [['knl-0020', 'knl-0021', 'knl-0022'],
             ['knl-0036', 'knl-0137', 'knl-0138'],
             ['knl-0139', 'knl-1234']]),
    ]

    try:
        with open('node_list', 'w') as f:
            for i, node in enumerate(nodelist_in):
                f.write(node + '\n')
                if i == 3:
                    # Insert this host part-way through the list.
                    f.write(mynode + '\n')

        resources = Resources(central_mode=True)
        resources.add_comm_info(libE_nodes=[mynode])

        # Spoof current process as each worker and check nodelist.
        for num_workers, exp_out in expected_by_worker_count:
            for wrk in range(num_workers):
                workerID = wrk + 1
                local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
                assert local_nodelist == exp_out[wrk], "local_nodelist returned does not match expected"
    finally:
        # Clean up even when an assertion above fails.
        if os.path.exists('node_list'):
            os.remove('node_list')
def test_remove_libE_nodes():
    """Check that ``Resources.remove_nodes`` strips every occurrence of this host."""
    hostname = socket.gethostname()
    expected = ['knl-0019', 'knl-0021', 'knl-0022', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-2345']
    mismatch_msg = "nodelist returned does not match expected"

    # Host placed at the head of the list.
    with_host_first = [hostname] + expected
    assert Resources.remove_nodes(with_host_first, hostname) == expected, mismatch_msg

    # Host placed twice in the middle and once at the end.
    with_host_scattered = []
    for position, node in enumerate(expected):
        with_host_scattered.append(node)
        if position in (1, 4, 6):
            with_host_scattered.append(hostname)
    assert Resources.remove_nodes(with_host_scattered, hostname) == expected, mismatch_msg
def test_get_global_nodelist_standalone():
    """Check the fallback node list when no scheduler variable supplies one.

    Every ``EnvResources`` variable name points at a placeholder that is
    expected to be unset, and the result must be a one-element list holding
    this machine's hostname.
    """
    hostname = socket.gethostname()
    unset_var = "THIS_ENV_VARIABLE_IS_DEF_NOT_SET"
    env_resources = EnvResources(
        nodelist_env_slurm=unset_var,
        nodelist_env_cobalt=unset_var,
        nodelist_env_lsf=unset_var,
        nodelist_env_lsf_shortform=unset_var,
    )
    nodes = Resources.get_global_nodelist(rundir=os.getcwd(), env_resources=env_resources)
    assert nodes == [hostname], "global_nodelist returned does not match expected"
def test_get_global_nodelist_frm_lsf_shortform():
    """Check node-list extraction from an LSF short-form environment variable.

    The value ``'batch5 1 g06n02 42 h21n18 42'`` must yield only
    ``g06n02`` and ``h21n18``.
    """
    test_var = "LIBE_RESOURCES_TEST_NODE_LIST"
    unset_var = "THIS_ENV_VARIABLE_IS_DEF_NOT_SET"
    os.environ[test_var] = 'batch5 1 g06n02 42 h21n18 42'

    env_resources = EnvResources(
        nodelist_env_slurm=unset_var,
        nodelist_env_cobalt=unset_var,
        nodelist_env_lsf=unset_var,
        nodelist_env_lsf_shortform=test_var,
    )
    nodes = Resources.get_global_nodelist(rundir=os.getcwd(), env_resources=env_resources)
    assert nodes == ['g06n02', 'h21n18'], "global_nodelist returned does not match expected"
def test_get_global_nodelist_frm_cobalt():
    """Check node-list expansion from a Cobalt-style environment variable.

    The value ``"20-22,137-139,1234"`` must expand to the individual node
    numbers as strings, in order.
    """
    test_var = "LIBE_RESOURCES_TEST_NODE_LIST"
    unset_var = "THIS_ENV_VARIABLE_IS_DEF_NOT_SET"
    os.environ[test_var] = "20-22,137-139,1234"

    env_resources = EnvResources(
        nodelist_env_slurm=unset_var,
        nodelist_env_cobalt=test_var,
        nodelist_env_lsf=unset_var,
        nodelist_env_lsf_shortform=unset_var,
    )
    nodes = Resources.get_global_nodelist(rundir=os.getcwd(), env_resources=env_resources)
    expected = ['20', '21', '22', '137', '138', '139', '1234']
    assert nodes == expected, "global_nodelist returned does not match expected"
def test_get_local_nodelist_distrib_mode():
    """Check distributed-mode local node lists for workers whose share includes this host.

    The current hostname is written into ``node_list`` after the fourth entry,
    and ``Resources`` is built with ``central_mode=False``. Each case below
    names a worker count, a worker ID and the node list that worker must get.

    The ``node_list`` file is removed in a ``finally`` clause so that a failed
    assertion cannot leave it behind for other tests.
    """
    mynode = socket.gethostname()
    exp_node = mynode
    nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139']

    # (num_workers, workerID, expected local nodelist)
    # NOTE: an earlier check that a workerID outside this host's share fails
    # was removed upstream ("This should now work"), so no failure case is
    # exercised here.
    cases = [
        (8, 5, [exp_node]),
        (1, 1, ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', exp_node, 'knl-0137', 'knl-0138', 'knl-0139']),
        (4, 3, [exp_node, 'knl-0137']),
        # Sub-node workers
        (16, 9, [exp_node]),
        (16, 10, [exp_node]),
    ]

    try:
        with open('node_list', 'w') as f:
            for i, node in enumerate(nodelist_in):
                f.write(node + '\n')
                if i == 3:
                    # Insert this host part-way through the list.
                    f.write(mynode + '\n')

        resources = Resources(central_mode=False)

        # Spoof current process as each worker and check nodelist.
        for num_workers, workerID, exp_out in cases:
            local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
            assert local_nodelist == exp_out, "local_nodelist returned does not match expected"
    finally:
        # Clean up even when an assertion above fails.
        if os.path.exists('node_list'):
            os.remove('node_list')
def test_get_local_nodelist_central_mode():
    """Check central-mode node splitting for several worker counts.

    The node list comes from a Slurm-style environment variable; each worker
    count below is paired with the list every worker (ID 1..N) must receive.
    """
    os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = "knl-[0020-0022,0036,0137-0139,1234]"
    resources = Resources(nodelist_env_slurm="LIBE_RESOURCES_TEST_NODE_LIST", central_mode=True)

    # Worker count paired with the expected per-worker node lists.
    splits = (
        (8, [['knl-0020'], ['knl-0021'], ['knl-0022'], ['knl-0036'],
             ['knl-0137'], ['knl-0138'], ['knl-0139'], ['knl-1234']]),
        (4, [['knl-0020', 'knl-0021'], ['knl-0022', 'knl-0036'],
             ['knl-0137', 'knl-0138'], ['knl-0139', 'knl-1234']]),
        (1, [['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036',
              'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']]),
        # Uneven division exercises the best_split algorithm.
        (3, [['knl-0020', 'knl-0021', 'knl-0022'],
             ['knl-0036', 'knl-0137', 'knl-0138'],
             ['knl-0139', 'knl-1234']]),
    )

    # Pretend to be each worker in turn and compare its node list.
    for worker_count, per_worker in splits:
        for worker_id, wanted in enumerate(per_worker, start=1):
            got = WorkerResources.get_local_nodelist(worker_count, worker_id, resources)
            assert got == wanted, "local_nodelist returned does not match expected"
def test_get_local_nodelist_distrib_mode_host_not_in_list():
    """Check distributed mode when the node list does not name the current host.

    With four workers over the eight listed nodes, worker 2 must receive
    ``knl-0022`` and ``knl-0036``.
    """
    os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = "knl-[0020-0022,0036,0137-0139,1234]"
    resources = Resources(nodelist_env_slurm="LIBE_RESOURCES_TEST_NODE_LIST", central_mode=False)

    # Running distributed mode without the current host in the list is expected to work.
    worker_count, worker_id = 4, 2
    wanted = ['knl-0022', 'knl-0036']
    got = WorkerResources.get_local_nodelist(worker_count, worker_id, resources)
    assert got == wanted, "local_nodelist returned does not match expected"
def test_worker_resources():
    """Check the attributes of ``WorkerResources`` objects for three layouts.

    Covers one worker per node, several nodes per worker, and several workers
    per node, all in central mode over the same eight-node list.
    """
    os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = "knl-[0020-0022,0036,0137-0139,1234]"
    resources = Resources(nodelist_env_slurm="LIBE_RESOURCES_TEST_NODE_LIST", central_mode=True)

    single_nodes = [['knl-0020'], ['knl-0021'], ['knl-0022'], ['knl-0036'],
                    ['knl-0137'], ['knl-0138'], ['knl-0139'], ['knl-1234']]
    node_groups = [['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036'],
                   ['knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']]

    # (worker count, expected node lists, nodes per worker, workers per node)
    scenarios = (
        (8, single_nodes, 1, 1),    # One worker per node
        (2, node_groups, 4, 1),     # Multiple nodes per worker
        (16, single_nodes, 1, 2),   # Multiple workers per node
    )

    for worker_count, nodelists, node_count, per_node in scenarios:
        comm = Fake_comm(worker_count)
        for idx in range(worker_count):
            worker_id = idx + 1
            worker = WorkerResources(worker_id, comm, resources)
            assert worker.num_workers == worker_count, 'worker.num_workers does not match'
            assert worker.workerID == worker_id, 'worker.workerID does not match'
            # Workers sharing a node map onto the same expected entry.
            assert worker.local_nodelist == nodelists[idx // per_node], 'worker.local_nodelist does not match'
            assert worker.local_node_count == node_count, 'worker.local_node_count does not match'
            assert worker.workers_per_node == per_node, 'worker.workers_per_node does not match'
def test_get_local_nodelist_distrib_mode_uneven_split():
    """Check distributed-mode splitting when adding this host makes the list uneven.

    The current hostname is written into ``node_list`` after the fifth entry,
    giving nine nodes for two workers; worker 2 must receive the trailing
    four, which include this host.

    The ``node_list`` file is removed in a ``finally`` clause so that a failed
    assertion cannot leave it behind for other tests.
    """
    mynode = socket.gethostname()
    exp_node = mynode
    nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']

    try:
        with open('node_list', 'w') as f:
            for i, node in enumerate(nodelist_in):
                f.write(node + '\n')
                if i == 4:
                    # Insert this host part-way through the list.
                    f.write(mynode + '\n')

        resources = Resources(central_mode=False)
        num_workers = 2

        # May not be at head of list - should perhaps be warning or enforced
        workerID = 2
        exp_out = ['knl-0137', exp_node, 'knl-0138', 'knl-0139']
        local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
        assert local_nodelist == exp_out, "local_nodelist returned does not match expected"
    finally:
        # Clean up even when an assertion above fails.
        if os.path.exists('node_list'):
            os.remove('node_list')
def set_worker_info(self, comm, workerid=None):
    """Record the worker ID on this executor and set up its worker resources.

    Stores ``workerid`` as ``self.workerID``, creates a default ``Resources``
    object when ``self.resources`` is falsy, then calls
    ``set_worker_resources`` on it with the worker ID and ``comm``.
    """
    self.workerID = workerid
    # Fall back to a default Resources object when none is attached yet.
    if self.resources:
        worker_pool = self.resources
    else:
        worker_pool = self.resources = Resources()
    worker_pool.set_worker_resources(self.workerID, comm)
class SerialExecutor(Executor):
    """Executor whose run line is just the application path plus its arguments.

    ``submit`` adds no launcher prefix to the run line and hands it to the
    retry logic borrowed from ``MPIExecutor`` with ``subgroup_launch=False``.
    """

    def __init__(self):
        super().__init__()
        # Retry settings; presumably read by MPIExecutor._launch_with_retries
        # through ``self`` - confirm against that method.
        self.max_launch_attempts = 5
        self.fail_time = 2
        self.retry_delay_incr = 5

    def _launch_with_retries(self, task, runline, subgroup_launch=False, wait_on_run=False):
        """Launch ``task`` through ``MPIExecutor``'s retry logic, bound to this executor.

        Storing ``MPIExecutor._launch_with_retries`` as an instance attribute
        leaves it an unbound plain function, so calling it through ``self``
        would pass ``task`` in the ``self`` position. Delegating from a real
        method supplies ``self`` explicitly.
        """
        return MPIExecutor._launch_with_retries(self, task, runline,
                                                subgroup_launch=subgroup_launch,
                                                wait_on_run=wait_on_run)

    def add_comm_info(self, libE_nodes, serial_setup):
        """Adds comm-specific information to executor.

        Forwards ``libE_nodes`` to ``self.resources.add_comm_info``.
        ``serial_setup`` is accepted but currently unused (the serial setup
        step is disabled).
        """
        self.resources.add_comm_info(libE_nodes=libE_nodes)

    def set_worker_info(self, comm, workerid=None):
        """Sets info for this executor

        Stores ``workerid`` as ``self.workerID``, creates a default
        ``Resources`` object if none is attached, and sets the worker
        resources on it for this worker ID and ``comm``.
        """
        self.workerID = workerid
        if not self.resources:
            self.resources = Resources()
        self.resources.set_worker_resources(self.workerID, comm)

    def submit(self, calc_type=None, app_name=None, app_args=None, stdout=None,
               stderr=None, dry_run=False, wait_on_run=False):
        """Create a task for an application and launch it (or dry-run it).

        The application is looked up by ``app_name`` if given, otherwise by
        ``calc_type``. The run line is the application's ``full_path`` followed
        by ``app_args``, each split on whitespace.

        Parameters
        ----------
        calc_type: used to select the default application when ``app_name``
            is None.
        app_name: name of the application to fetch via ``get_app``.
        app_args: optional argument string appended to the run line.
        stdout, stderr: passed through to ``Task``.
        dry_run: when true, the task is marked complete without launching.
        wait_on_run: passed through to the launch-with-retries call.

        Returns
        -------
        The created ``Task``.

        Raises
        ------
        ExecutorException
            If neither ``app_name`` nor ``calc_type`` is set.
        """
        if app_name is not None:
            app = self.get_app(app_name)
        elif calc_type is not None:
            app = self.default_app(calc_type)
        else:
            raise ExecutorException("Either app_name or calc_type must be set")

        default_workdir = os.getcwd()
        task = Task(app, app_args, default_workdir, stdout, stderr, self.workerID)

        runline = task.app.full_path.split()
        if task.app_args is not None:
            runline.extend(task.app_args.split())

        task.runline = ' '.join(runline)  # Allow to be queried
        if dry_run:
            task.dry_run = True
            logger.info('Test (No submit) Runline: {}'.format(' '.join(runline)))
            task._set_complete(dry_run=True)
        else:
            # Launch Task
            self._launch_with_retries(task, runline, subgroup_launch=False,
                                      wait_on_run=wait_on_run)

            if not task.timer.timing:
                task.timer.start()
                task.submit_time = task.timer.tstart  # Time not date - may not need if using timer.

            # NOTE(review): block nesting was reconstructed from unindented
            # source; confirm that dry-run tasks are meant to stay out of
            # list_of_tasks.
            self.list_of_tasks.append(task)

        return task