def test_abbrev_nodenames_nochange_cobalt():
    """Check that abbrev_nodenames returns these Cobalt-style names unchanged."""
    # Test Cobalt abbrev
    expected = ['21', '22', '137', '138', '1234', '11234']

    resources = EnvResources()
    resources.schedular = 'Cobalt'
    result = resources.abbrev_nodenames(expected)

    assert result == expected, "Abbreviated names returned do not match expected"
    del resources
def test_abbrev_nodenames_nochange_slurm():
    """Check that abbrev_nodenames returns these suffix-free Slurm names unchanged."""
    # Test Slurm abbrev
    expected = [
        'knl-0019',
        'knl-0021',
        'knl-0022',
        'knl-0137',
        'knl-0138',
        'knl-0139',
        'knl-2345',
    ]

    resources = EnvResources()
    resources.schedular = 'Slurm'
    result = resources.abbrev_nodenames(expected)

    assert result == expected, "Abbreviated names returned do not match expected"
    del resources
def test_abbrev_nodenames_slurm():
    """Check that abbrev_nodenames reduces suffixed Slurm names to the expected short names."""
    # Test Slurm abbrev
    suffixed = [
        'knl-0019.some.suffix',
        'knl-0021.some.suffix',
        'knl-0022.diff_suffix',
    ]
    expected = ['knl-0019', 'knl-0021', 'knl-0022']

    resources = EnvResources()
    resources.schedular = 'Slurm'
    result = resources.abbrev_nodenames(suffixed)

    assert result == expected, "Abbreviated names returned do not match expected"
    del resources
class Resources:
    """Provides system resources to libEnsemble and executor.

    This is initialized when the executor is created with auto_resources set to true.

    **Object Attributes:**

    These are set on initialization.

    :ivar string top_level_dir: Directory where searches for node_list file
    :ivar boolean central_mode: If true, then running in central mode; otherwise distributed
    :ivar boolean allow_oversubscribe: Value of the ``allow_oversubscribe`` constructor argument
    :ivar EnvResources env_resources: An object storing environment variables used by resources
    :ivar list global_nodelist: A list of all nodes available for running user applications
    :ivar string launcher: Value of the ``launcher`` constructor argument
    :ivar int logical_cores_avail_per_node: Logical cores (including SMT threads) available on a node
    :ivar int physical_cores_avail_per_node: Physical cores available on a node
    :ivar list libE_nodes: Abbreviated libEnsemble node names; None until add_comm_info is called
    :ivar WorkerResources worker_resources: An object that can contain worker specific resources
    """

    DEFAULT_NODEFILE = 'node_list'

    def __init__(self, top_level_dir=None,
                 central_mode=False,
                 allow_oversubscribe=False,
                 launcher=None,
                 cores_on_node=None,
                 node_file=None,
                 nodelist_env_slurm=None,
                 nodelist_env_cobalt=None,
                 nodelist_env_lsf=None,
                 nodelist_env_lsf_shortform=None):
        """Initializes a new Resources instance

        Determines the compute resources available for current allocation, including
        node list and cores/hardware threads available within nodes.

        Parameters
        ----------

        top_level_dir: string, optional
            Directory libEnsemble runs in (default is current working directory)

        central_mode: boolean, optional
            If true, then running in central mode, otherwise distributed.
            Central mode means libE processes (manager and workers) are grouped together and
            do not share nodes with applications. Distributed mode means Workers share nodes
            with applications.

        allow_oversubscribe: boolean, optional
            If false, then resources will raise an error if task process
            counts exceed the CPUs available to the worker, as detected by
            auto_resources. Larger node counts will always raise an error.
            When auto_resources is off, this argument is ignored.

        launcher: String, optional
            The name of the job launcher, such as mpirun or aprun. This may be used to obtain
            intranode information by launching a probing job onto the compute nodes.
            If not present, the local node will be used to obtain this information.

        cores_on_node: tuple (int,int), optional
            If supplied gives (physical cores, logical cores) for the nodes. If not supplied,
            this will be auto-detected.

        node_file: String, optional
            If supplied, give the name of a file in the run directory to use as a node-list
            for use by libEnsemble. Defaults to a file named 'node_list'. If the file does
            not exist, then the node-list will be auto-detected.

        nodelist_env_slurm: String, optional
            The environment variable giving a node list in Slurm format (Default: uses SLURM_NODELIST).
            Note: This is queried only if a node_list file is not provided and auto_resources=True.

        nodelist_env_cobalt: String, optional
            The environment variable giving a node list in Cobalt format (Default: uses COBALT_PARTNAME).
            Note: This is queried only if a node_list file is not provided and auto_resources=True.

        nodelist_env_lsf: String, optional
            The environment variable giving a node list in LSF format (Default: uses LSB_HOSTS).
            Note: This is queried only if a node_list file is not provided and auto_resources=True.

        nodelist_env_lsf_shortform: String, optional
            The environment variable giving a node list in LSF short-form format (Default: uses LSB_MCPU_HOSTS)
            Note: This is only queried if a node_list file is not provided and auto_resources=True.

        """
        self.top_level_dir = top_level_dir or os.getcwd()
        self.central_mode = central_mode
        if self.central_mode:
            logger.debug('Running in central mode')
        self.allow_oversubscribe = allow_oversubscribe

        self.env_resources = EnvResources(nodelist_env_slurm=nodelist_env_slurm,
                                          nodelist_env_cobalt=nodelist_env_cobalt,
                                          nodelist_env_lsf=nodelist_env_lsf,
                                          nodelist_env_lsf_shortform=nodelist_env_lsf_shortform)

        # This is global nodelist avail to workers - may change to global_worker_nodelist
        if node_file is None:
            node_file = Resources.DEFAULT_NODEFILE
        self.global_nodelist = Resources.get_global_nodelist(node_file=node_file,
                                                             rundir=self.top_level_dir,
                                                             env_resources=self.env_resources)

        self.launcher = launcher

        # If this host is not one of the listed nodes, sub-node resources are
        # requested in remote mode rather than read from the local node.
        remote_detect = socket.gethostname() not in self.global_nodelist

        if not cores_on_node:
            cores_on_node = node_resources.get_sub_node_resources(launcher=self.launcher,
                                                                  remote_mode=remote_detect,
                                                                  env_resources=self.env_resources)
        self.physical_cores_avail_per_node = cores_on_node[0]
        self.logical_cores_avail_per_node = cores_on_node[1]
        self.libE_nodes = None
        self.worker_resources = None

    def add_comm_info(self, libE_nodes):
        """Adds comms-specific information to resources

        Removes libEnsemble nodes from nodelist if in central_mode.

        Parameters
        ----------

        libE_nodes: list
            Names of the nodes libEnsemble processes run on. These are stored in
            abbreviated form (via env_resources.abbrev_nodenames) in self.libE_nodes.
        """
        self.libE_nodes = self.env_resources.abbrev_nodenames(libE_nodes)

        # Nodes are only removed when at least one libE node is actually listed,
        # central mode is on, and more than one node is available.
        shares_nodes = any(node in self.libE_nodes for node in self.global_nodelist)
        if shares_nodes and self.central_mode and len(self.global_nodelist) > 1:
            self.global_nodelist = Resources.remove_nodes(self.global_nodelist, self.libE_nodes)
            if not self.global_nodelist:
                logger.warning("Warning. Node-list for tasks is empty. Remove central_mode or add nodes")

    def set_worker_resources(self, workerid, comm):
        """Creates the WorkerResources object for the given worker and stores it on this instance"""
        self.worker_resources = WorkerResources(workerid, comm, self)

    @staticmethod
    def _launcher_responds(command):
        """Returns True if ``command`` can be executed and exits with status zero

        Output is discarded through DEVNULL. The subprocess documentation warns
        against PIPE with check_call because the pipes are never read and a
        chatty child could block once the OS pipe buffer fills.
        """
        try:
            subprocess.check_call(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except (OSError, subprocess.CalledProcessError):
            # OSError: executable missing or not runnable.
            # CalledProcessError: launcher exists but rejected the probe; treat as
            # "not detected" so that the remaining launchers are still tried.
            return False
        return True

    @staticmethod
    def get_MPI_variant():
        """Returns MPI base implementation

        Launchers are probed in the order aprun, jsrun, mpirun, srun and the first
        one that responds determines the result.

        Returns
        -------
        mpi_variant: string or None:
            MPI variant 'aprun' or 'jsrun' or 'mpich' or 'openmpi' or 'srun'.
            None if no launcher could be detected.

        """
        if Resources._launcher_responds(['aprun', '--version']):
            return 'aprun'

        if Resources._launcher_responds(['jsrun', '--version']):
            return 'jsrun'

        try:
            # Explore mpi4py.MPI.get_vendor() and mpi4py.MPI.Get_library_version() for mpi4py
            try_mpich = subprocess.Popen(['mpirun', '-npernode'],
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.STDOUT)
            stdout, _ = try_mpich.communicate()
            # An mpirun that reports -npernode as unrecognized is classified as
            # mpich; any other mpirun is classified as openmpi.
            if 'unrecognized argument npernode' in stdout.decode():
                return 'mpich'
            return 'openmpi'
        except Exception:
            # Deliberately best-effort: any failure to run or read mpirun just
            # means it is not detected; fall through to the next launcher.
            pass

        if Resources._launcher_responds(['srun', '--version']):
            return 'srun'

        return None

    # ---------------------------------------------------------------------------

    # This is for central mode where libE nodes will not share with app nodes
    @staticmethod
    def remove_nodes(global_nodelist_in, remove_list):
        """Removes any nodes in remove_list from the global nodelist

        Returns a new list; global_nodelist_in is not modified.
        """
        return [node for node in global_nodelist_in if node not in remove_list]

    @staticmethod
    def best_split(a, n):
        """Creates the most even split of list a into n parts

        Returns a generator that yields the n sub-lists in order (wrap the result
        in list() if a list of lists is needed). When len(a) is not divisible by
        n, the leading parts each hold one extra element.
        """
        size, extra = divmod(len(a), n)
        return (a[i * size + min(i, extra):(i + 1) * size + min(i + 1, extra)] for i in range(n))

    @staticmethod
    def get_global_nodelist(node_file=DEFAULT_NODEFILE,
                            rundir=None,
                            env_resources=None):
        """
        Returns the list of nodes available to all libEnsemble workers.

        If a node_file exists this is used, otherwise the environment
        is interrogated for a node list. If a dedicated manager node is used,
        then a node_file is recommended.

        Blank lines in the node_file are ignored and surrounding whitespace is
        stripped from each node name. If no node_file exists and the environment
        gives no nodes, the local hostname is used (standalone machine).

        Note that removal of libEnsemble nodes for central mode is not done here;
        it is performed by add_comm_info.

        Parameters
        ----------

        node_file: String, optional
            Name of the node-list file, looked up inside rundir.

        rundir: String, optional
            Directory containing the node_file (default is current working directory)

        env_resources: EnvResources, optional
            Object whose get_nodelist() is queried when no node_file exists.

        Raises
        ------
        ResourcesException
            If the resulting node list is empty (e.g. the node_file holds no node names).
        """
        top_level_dir = rundir or os.getcwd()
        node_filepath = os.path.join(top_level_dir, node_file)
        global_nodelist = []
        if os.path.isfile(node_filepath):
            logger.debug("node_file found - getting nodelist from node_file")
            with open(node_filepath, 'r') as f:
                # Skip blank lines so that they do not become empty node names
                global_nodelist = [name for name in (line.strip() for line in f) if name]
        else:
            logger.debug("No node_file found - searching for nodelist in environment")
            if env_resources:
                # Guard against a get_nodelist() that returns None rather than a list
                global_nodelist = env_resources.get_nodelist() or []

            if not global_nodelist:
                # Assume a standalone machine
                logger.info("Can not find nodelist from environment. Assuming standalone")
                global_nodelist.append(socket.gethostname())

        if global_nodelist:
            return global_nodelist
        raise ResourcesException("Error. global_nodelist is empty")