Exemplo n.º 1
0
def generate_nodes_list(hostfile=None):
    """Return the unique hostnames listed in `hostfile`.

    When `hostfile` is None, the result contains only the name of the
    current node as reported by `platform.node()`. Otherwise the file is
    read line by line: surrounding whitespace is stripped, blank lines are
    skipped and duplicate hostnames are dropped, keeping the order of their
    first occurrence.

    Args:
        hostfile: Path to a text file with one hostname per line, or None
            to use only the current node.

    Returns:
        A list of unique hostname strings.

    Raises:
        ValueError: If `hostfile` is not an existing regular file, cannot
            be read, or contains no hostnames (it is empty or holds only
            blank lines).
    """
    if hostfile is None:
        hostname = platform.node()
        logger.debug(
            "No `hostfile` provided. Using only the current node `{}`".format(
                hostname))
        return [hostname]

    if not os.path.isfile(hostfile):
        raise ValueError(
            "Incorrect `hostfile` provided with path `{}`".format(hostfile))

    try:
        with open(hostfile, 'r') as f:
            stripped_lines = [line.strip() for line in f]
    except (OSError, UnicodeDecodeError) as err:
        # Only I/O and decoding failures are translated; anything else
        # (e.g. KeyboardInterrupt) must propagate untouched.
        raise ValueError(
            "Cannot read from `hostfile` with path `{}`".format(
                hostfile)) from err

    non_empty_nodes = [node for node in stripped_lines if node]
    # Check emptiness after stripping, so that a file made only of blank
    # lines is rejected instead of silently yielding an empty node list.
    if not non_empty_nodes:
        raise ValueError("Empty `hostfile` with path `{}`".format(hostfile))

    # dict.fromkeys removes duplicates while preserving insertion order.
    unique_nodes = list(dict.fromkeys(non_empty_nodes))
    if len(non_empty_nodes) != len(unique_nodes):
        logger.debug(
            "The `hostfile` does not contain only unique hostnames; removing duplicates."
        )
    return unique_nodes
Exemplo n.º 2
0
def generate_num_gpus_per_node(npernode=None):
    """Return the number of GPU devices to use on a node.

    Args:
        npernode: Requested number of devices per node, or None to use
            every GPU reported by `tf_config.get_available_gpus()`.

    Returns:
        The number of available GPUs when `npernode` is None; `npernode`
        when at least that many GPUs are available; 0 when fewer GPUs are
        available than requested.
    """
    available_gpus = len(tf_config.get_available_gpus())
    logger.debug("Num GPUs Available: {}".format(available_gpus))

    # No explicit request: use as many GPUs as possible.
    if npernode is None:
        return available_gpus

    # The request cannot be satisfied: report it and fall back to 0 devices.
    if available_gpus < npernode:
        logger.debug(
            "Not enough GPUs for the requested {} devices per node".format(
                npernode))
        return 0

    return npernode
Exemplo n.º 3
0
 def clean_up_run(self):
   """Run the generated cleanup script on the hosts in `self.hostlist`.

   Builds the script with `self.generate_cleanup_script()`, creates a
   `file_man.HostFile` for `self.hostlist` and hands both to
   `self.execute_with_gaspi_run`, forwarding `self.args.dry_run`.
   """
   script = self.generate_cleanup_script()
   # NOTE(review): the second argument presumably means one entry per
   # host -- confirm against `file_man.HostFile`.
   nodes_file = file_man.HostFile(self.hostlist, 1)
   self.execute_with_gaspi_run(
     len(self.hostlist),
     nodes_file,
     script,
     self.args.dry_run,
   )
   logger.debug(f'Cleanup executed on {len(self.hostlist)} nodes.')