def test_abbrev_nodenames_nochange_cobalt():
    env_resources = EnvResources()

    # Test Cobalt abbrev
    exp_names = ['21', '22', '137', '138', '1234', '11234']
    env_resources.schedular = 'Cobalt'
    abbrev_names = env_resources.abbrev_nodenames(exp_names)
    assert abbrev_names == exp_names, "Abbreviated names returned do not match expected"
    del env_resources
def test_abbrev_nodenames_nochange_slurm():
    env_resources = EnvResources()

    # Test Slurm abbrev
    exp_names = ['knl-0019', 'knl-0021', 'knl-0022', 'knl-0137',
                 'knl-0138', 'knl-0139', 'knl-2345']
    env_resources.schedular = 'Slurm'
    abbrev_names = env_resources.abbrev_nodenames(exp_names)
    assert abbrev_names == exp_names, "Abbreviated names returned do not match expected"
    del env_resources
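
# The tests above can be run standalone; a minimal sketch of the usual runner
# convention for a unit-test file like this one (extend the list of calls as
# further tests are added):
if __name__ == '__main__':
    test_abbrev_nodenames_nochange_cobalt()
    test_abbrev_nodenames_nochange_slurm()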
class Resources:
    """Provide system resources to libEnsemble and the job controller.

    This is initialised when the job_controller is created with
    auto_resources set to True.

    **Object Attributes:**

    These are set on initialisation.

    :ivar string top_level_dir: Directory in which searches are made for a worker_list file.
    :ivar boolean central_mode: If true, then running in central mode; else distributed.
    :ivar EnvResources env_resources: An object storing environment variables used by resources.
    :ivar list global_nodelist: A list of all nodes available for running user applications.
    :ivar int logical_cores_avail_per_node: Logical cores (including SMT threads) available on a node.
    :ivar int physical_cores_avail_per_node: Physical cores available on a node.
    :ivar WorkerResources worker_resources: An object that can contain worker-specific resources.
    """

    def __init__(self, top_level_dir=None,
                 central_mode=False,
                 launcher=None,
                 nodelist_env_slurm=None,
                 nodelist_env_cobalt=None,
                 nodelist_env_lsf=None,
                 nodelist_env_lsf_shortform=None):
        """Initialise a new Resources instance

        Works out the compute resources available for the current allocation,
        including the node list and cores/hardware threads available within
        nodes.

        Parameters
        ----------

        top_level_dir: string, optional
            Directory libEnsemble runs in (default is current working directory).

        central_mode: boolean, optional
            If true, then running in central mode; else distributed.
            Central mode means libE processes (manager and workers) are
            grouped together and do not share nodes with applications.
            Distributed mode means workers share nodes with applications.

        launcher: String, optional
            The name of the job launcher, such as mpirun or aprun. This may
            be used to obtain intra-node information by launching a probing
            job onto the compute nodes. If not present, the local node will
            be used to obtain this information.

        nodelist_env_slurm: String, optional
            The environment variable giving a node list in Slurm format
            (Default: Uses SLURM_NODELIST). Note: This is only queried if
            a worker_list file is not provided and auto_resources=True.

        nodelist_env_cobalt: String, optional
            The environment variable giving a node list in Cobalt format
            (Default: Uses COBALT_PARTNAME). Note: This is only queried if
            a worker_list file is not provided and auto_resources=True.

        nodelist_env_lsf: String, optional
            The environment variable giving a node list in LSF format
            (Default: Uses LSB_HOSTS). Note: This is only queried if
            a worker_list file is not provided and auto_resources=True.

        nodelist_env_lsf_shortform: String, optional
            The environment variable giving a node list in LSF short-form
            format (Default: Uses LSB_MCPU_HOSTS). Note: This is only
            queried if a worker_list file is not provided and
            auto_resources=True.
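
        Example
        -------

        A minimal sketch of direct construction (values illustrative; in
        normal use this object is created by the job controller when
        auto_resources is set to True)::

            resources = Resources(central_mode=True, launcher='mpirun')
            print(resources.global_nodelist)
            print(resources.physical_cores_avail_per_node)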
""" self.top_level_dir = top_level_dir or os.getcwd() self.central_mode = central_mode if self.central_mode: logger.debug('Running in central mode') self.env_resources = EnvResources( nodelist_env_slurm=nodelist_env_slurm, nodelist_env_cobalt=nodelist_env_cobalt, nodelist_env_lsf=nodelist_env_lsf, nodelist_env_lsf_shortform=nodelist_env_lsf_shortform) # This is global nodelist avail to workers - may change to global_worker_nodelist self.global_nodelist = Resources.get_global_nodelist( rundir=self.top_level_dir, env_resources=self.env_resources) remote_detect = False if socket.gethostname() not in self.global_nodelist: remote_detect = True cores_info = node_resources.get_sub_node_resources( launcher=launcher, remote_mode=remote_detect, env_resources=self.env_resources) self.logical_cores_avail_per_node = cores_info[0] self.physical_cores_avail_per_node = cores_info[1] self.libE_nodes = None self.worker_resources = None def add_comm_info(self, libE_nodes): """Add comms specific information to resources Removes libEnsemble nodes from nodelist if in central_mode. """ self.libE_nodes = self.env_resources.abbrev_nodenames(libE_nodes) libE_nodes_in_list = list( filter(lambda x: x in self.libE_nodes, self.global_nodelist)) if libE_nodes_in_list: if self.central_mode and len(self.global_nodelist) > 1: self.global_nodelist = Resources.remove_nodes( self.global_nodelist, self.libE_nodes) if not self.global_nodelist: logger.warning( "Warning. Node-list for sub-jobs is empty. Remove central_mode or add nodes" ) def set_worker_resources(self, workerid, comm): self.worker_resources = WorkerResources(workerid, comm, self) @staticmethod def get_MPI_variant(): """Returns MPI base implementation Returns ------- mpi_variant: string: MPI variant 'aprun' or 'jsrun' or 'mpich' or 'openmpi' """ try: subprocess.check_call(['aprun', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) return 'aprun' except OSError: pass try: subprocess.check_call(['jsrun', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) return 'jsrun' except OSError: pass try: # Explore mpi4py.MPI.get_vendor() and mpi4py.MPI.Get_library_version() for mpi4py try_mpich = subprocess.Popen(['mpirun', '-npernode'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stdout, _ = try_mpich.communicate() if 'unrecognized argument npernode' in stdout.decode(): return 'mpich' return 'openmpi' except Exception: pass try: subprocess.check_call(['srun', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) return 'srun' except OSError: pass # --------------------------------------------------------------------------- # This is for central mode where libE nodes will not share with app nodes @staticmethod def remove_nodes(global_nodelist_in, remove_list): """Any nodes in remove_list are removed from the global nodelist""" global_nodelist = list( filter(lambda x: x not in remove_list, global_nodelist_in)) return global_nodelist @staticmethod def best_split(a, n): """Create the most even split of list a into n parts and return list of lists""" k, m = divmod(len(a), n) return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)) @staticmethod def get_global_nodelist(rundir=None, env_resources=None): """ Return the list of nodes available to all libEnsemble workers If a worker_list file exists this is used, otherwise the environment is interrogated for a node list. If a dedicated manager node is used, then a worker_list file is recommended. In central mode, any node with a libE worker is removed from the list. 
""" top_level_dir = rundir or os.getcwd() worker_list_file = os.path.join(top_level_dir, 'worker_list') global_nodelist = [] if os.path.isfile(worker_list_file): logger.debug( "worker_list found - getting nodelist from worker_list") with open(worker_list_file, 'r') as f: for line in f: global_nodelist.append(line.rstrip()) else: logger.debug( "No worker_list found - searching for nodelist in environment") if env_resources: global_nodelist = env_resources.get_nodelist() if not global_nodelist: # Assume a standalone machine logger.info( "Can not find nodelist from environment. Assuming standalone" ) global_nodelist.append(socket.gethostname()) if global_nodelist: return global_nodelist raise ResourcesException("Error. global_nodelist is empty")