def generate_nodes_list(hostfile=None):
    """Return the list of unique hostnames to run on.

    When no `hostfile` is given, the list contains only the name of the
    current machine as reported by `platform.node()`. Otherwise the file is
    read line by line; surrounding whitespace is stripped, blank lines are
    skipped and duplicate hostnames are dropped while keeping the order of
    first occurrence.

    Args:
        hostfile: Path to a text file with one hostname per line, or `None`.

    Returns:
        A list of hostname strings without duplicates.

    Raises:
        ValueError: If `hostfile` is not an existing regular file, cannot be
            read, or contains no hostnames (including the case where it
            holds only blank/whitespace lines).
    """
    if hostfile is None:
        hostname = platform.node()
        logger.debug(
            "No `hostfile` provided. Using only the current node `{}`".format(
                hostname))
        return [hostname]

    if not os.path.isfile(hostfile):
        raise ValueError(
            "Incorrect `hostfile` provided with path `{}`".format(hostfile))

    try:
        with open(hostfile, 'r') as f:
            stripped_lines = [line.strip() for line in f]
    except (OSError, UnicodeError) as err:
        # Only I/O and decoding failures are translated; interrupts and
        # programming errors propagate unchanged.
        raise ValueError(
            "Cannot read from `hostfile` with path `{}`".format(
                hostfile)) from err

    non_empty_nodes = [node for node in stripped_lines if node]
    # Check emptiness *after* dropping blank lines so that a file made up of
    # whitespace only is rejected instead of yielding an empty node list.
    if not non_empty_nodes:
        raise ValueError("Empty `hostfile` with path `{}`".format(hostfile))

    # dict.fromkeys keeps insertion order, so the first occurrence wins.
    unique_nodes = list(dict.fromkeys(non_empty_nodes))
    if len(non_empty_nodes) != len(unique_nodes):
        logger.debug(
            "The `hostfile` does not contain only unique hostnames; removing duplicates."
        )
    return unique_nodes
def generate_num_gpus_per_node(npernode=None):
    """Decide how many GPU devices to use on a node.

    The number of available GPUs is taken as the length of the collection
    returned by `tf_config.get_available_gpus()`.

    Args:
        npernode: Requested number of devices per node, or `None` to use
            every available GPU.

    Returns:
        The available GPU count when `npernode` is `None`; `npernode` when
        at least that many GPUs are available; `0` when fewer GPUs than
        requested are available.
    """
    available = len(tf_config.get_available_gpus())
    logger.debug("Num GPUs Available: {}".format(available))

    # No explicit request: take everything that was detected.
    if npernode is None:
        return available

    # Explicit request that cannot be satisfied: report it and use no GPUs.
    if available < npernode:
        logger.debug(
            "Not enough GPUs for the requested {} devices per node".format(
                npernode))
        return 0

    # Explicit request that fits within the available devices.
    return npernode
def clean_up_run(self):
    """Run the generated cleanup script on every host in `self.hostlist`.

    The script comes from `self.generate_cleanup_script()` and is launched
    through `self.execute_with_gaspi_run`, honouring `self.args.dry_run`.
    """
    script = self.generate_cleanup_script()
    # NOTE(review): the second argument (1) is presumably the number of
    # processes per host — confirm against `file_man.HostFile`.
    hosts_file = file_man.HostFile(self.hostlist, 1)
    launch_args = (
        len(self.hostlist),
        hosts_file,
        script,
        self.args.dry_run,
    )
    self.execute_with_gaspi_run(*launch_args)
    logger.debug(f'Cleanup executed on {len(self.hostlist)} nodes.')