Example #1
def setup_compute():
    """run compute node setup"""
    log.info("Setting up compute")
    util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600)
    install_custom_scripts()

    setup_nss_slurm()
    setup_network_storage()

    # template = lkp.node_template_info(zone=lkp.zone)

    # if (not cfg.instance_defs[pid].image_hyperthreads and
    #         shutil.which('google_mpi_tuning')):
    #     run("google_mpi_tuning --nosmt")
    # grep exits 0 only if an NVIDIA device is listed
    has_gpu = run("lspci | grep --ignore-case 'NVIDIA'",
                  shell=True,
                  check=False).returncode == 0
    if has_gpu:
        run("nvidia-smi")

    run_custom_scripts()

    setup_slurmd_cronjob()
    run("systemctl restart munge", timeout=30)
    run("systemctl enable slurmd", timeout=30)
    run("systemctl restart slurmd", timeout=30)

    run("systemctl enable slurmeventd", timeout=30)
    run("systemctl restart slurmeventd", timeout=30)

    log.info("Check status of cluster services")
    run("systemctl status munge", timeout=30)
    run("systemctl status slurmd", timeout=30)
    run("systemctl status slurmeventd", timeout=30)

    log.info("Done setting up compute")
Example #2
def gen_cloud_gres_conf(lkp=lkp):
    """generate cloud_gres.conf"""

    gpu_nodes = defaultdict(list)
    for part_name, partition in lkp.cfg.partitions.items():
        for node in partition.partition_nodes.values():
            template_info = lkp.template_info(node.instance_template)
            gpu_count = template_info.gpu_count
            if gpu_count == 0:
                continue
            gpu_nodes[gpu_count].extend(
                filter(None, nodeset_lists(node, part_name)))

    lines = [
        dict_to_conf({
            "NodeName": names,
            "Name": "gpu",
            "File": "/dev/nvidia{}".format(f"[0-{i-1}]" if i > 1 else "0"),
        })
        for i, names in gpu_nodes.items()
    ]
    lines.append("\n")
    content = FILE_PREAMBLE + "\n".join(lines)

    conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud_gres.conf"
    conf_file_bak = conf_file.with_suffix(".conf.bak")
    if conf_file.is_file():
        shutil.copy2(conf_file, conf_file_bak)
    conf_file.write_text(content)
    util.chown_slurm(conf_file, mode=0o600)
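dict_to_conf is another util helper not shown here. From the call site it evidently renders a mapping as a single Key=Value line; a plausible sketch (an assumption, including the comma-joining of list values such as node name lists):

def dict_to_conf(conf):
    # Sketch of the assumed helper: one-line Slurm conf rendering, e.g.
    # {"NodeName": "n[0-1]", "Name": "gpu", "File": "/dev/nvidia0"}
    #   -> "NodeName=n[0-1] Name=gpu File=/dev/nvidia0"
    def fmt(value):
        if isinstance(value, (list, tuple)):
            return ",".join(map(str, value))
        return value
    return " ".join(f"{k}={fmt(v)}" for k, v in conf.items() if v is not None)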
Example #3
def fetch_devel_scripts():
    """download scripts from project metadata if they are present"""

    meta_json = project_metadata(f"{cfg.slurm_cluster_name}-slurm-devel")
    if not meta_json:
        return
    metadata_devel = json.loads(meta_json)

    meta_entries = [
        ("slurmeventd.py", "slurmeventd"),
        ("resume.py", "slurm-resume"),
        ("slurmsync.py", "slurmsync"),
        ("util.py", "util-script"),
        ("setup.py", "setup-script"),
        ("startup.sh", "startup-script"),
        ("load_bq.py", "loadbq"),
    ]

    for script, name in meta_entries:
        if name not in metadata_devel:
            log.debug(f"{name} not found in project metadata, not updating")
            continue
        log.info(f"updating {script} from metadata")
        content = metadata_devel[name]
        path = (dirs.scripts / script).resolve()
        # make sure parent dir exists
        path.parent.mkdirp()
        path.write_text(content)
        util.chown_slurm(path, mode=0o755)
Example #4
def install_slurm_conf(lkp):
    """install slurm.conf"""
    mpi_default = "pmi2" if lkp.cfg.ompi_version else "none"

    conf_options = {
        "name": lkp.cfg.slurm_cluster_name,
        "control_host": lkp.control_host,
        "scripts": dirs.scripts,
        "slurmlog": dirs.log,
        "state_save": slurmdirs.state,
        "mpi_default": mpi_default,
    }
    conf_resp = project_metadata(
        f"{cfg.slurm_cluster_name}-slurm-tpl-slurm-conf")
    conf = conf_resp.format(**conf_options)

    conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "slurm.conf"
    conf_file_bak = conf_file.with_suffix(".conf.bak")
    if conf_file.is_file():
        shutil.copy2(conf_file, conf_file_bak)
    conf_file.write_text(conf)
    util.chown_slurm(conf_file, mode=0o644)
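The template fetched from project metadata is a plain str.format template whose placeholders match the conf_options keys above. An illustration of the mechanism (the template contents here are hypothetical):

# Hypothetical fragment of the metadata template; only the format
# mechanism is real, the contents are illustrative.
template = ("ClusterName={name}\n"
            "SlurmctldHost={control_host}\n"
            "StateSaveLocation={state_save}\n")
print(template.format(name="mycluster",
                      control_host="mycluster-controller",
                      state_save="/var/spool/slurm"))
# ClusterName=mycluster
# SlurmctldHost=mycluster-controller
# StateSaveLocation=/var/spool/slurm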
Example #5
def setup_jwt_key():
    jwt_key = Path(slurmdirs.state / "jwt_hs256.key")

    if jwt_key.exists():
        log.info("JWT key already exists. Skipping key generation.")
    else:
        run("dd if=/dev/urandom bs=32 count=1 > " + str(jwt_key), shell=True)

    util.chown_slurm(jwt_key, mode=0o400)
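The shell pipeline works, but the same 32-byte key can be produced without a subshell; a pure-Python equivalent (a sketch, not the project's code):

import secrets
from pathlib import Path

def write_jwt_key(path: Path) -> None:
    # 32 random bytes, equivalent to `dd if=/dev/urandom bs=32 count=1`
    path.write_bytes(secrets.token_bytes(32))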
Example #6
def gen_cloud_conf(lkp=lkp, cloud_parameters=None):
    """generate cloud.conf"""
    content = make_cloud_conf(lkp, cloud_parameters=cloud_parameters)

    conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud.conf"
    conf_file_bak = conf_file.with_suffix(".conf.bak")
    if conf_file.is_file():
        shutil.copy2(conf_file, conf_file_bak)
    conf_file.write_text(content)
    util.chown_slurm(conf_file, mode=0o644)
Example #7
def install_cgroup_conf():
    """install cgroup.conf"""
    conf = project_metadata(f"{cfg.slurm_cluster_name}-slurm-tpl-cgroup-conf")

    conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cgroup.conf"
    conf_file_bak = conf_file.with_suffix(".conf.bak")
    if conf_file.is_file():
        shutil.copy2(conf_file, conf_file_bak)
    conf_file.write_text(conf)
    util.chown_slurm(conf_file, mode=0o600)
Example #8
def configure_dirs():
    """create slurm dirs and symlink /etc/slurm to the shared etc dir"""
    for p in dirs.values():
        p.mkdirp()
    util.chown_slurm(dirs.slurm)
    util.chown_slurm(dirs.scripts)

    for p in slurmdirs.values():
        p.mkdirp()
        util.chown_slurm(p)

    etc_slurm = Path("/etc/slurm")
    if etc_slurm.exists() and etc_slurm.is_symlink():
        etc_slurm.unlink()
    etc_slurm.symlink_to(slurmdirs.etc)

    scripts_etc = dirs.scripts / "etc"
    if scripts_etc.exists() and scripts_etc.is_symlink():
        scripts_etc.unlink()
    scripts_etc.symlink_to(slurmdirs.etc)

    scripts_log = dirs.scripts / "log"
    if scripts_log.exists() and scripts_log.is_symlink():
        scripts_log.unlink()
    scripts_log.symlink_to(dirs.log)
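These snippets call mkdirp() directly on Path objects, so util evidently patches a mkdir -p style helper onto pathlib.Path. The assumed shape, roughly:

from pathlib import Path

def _mkdirp(self):
    # Equivalent of `mkdir -p`: create parents, ignore existing dirs.
    self.mkdir(parents=True, exist_ok=True)

# Assumption: util installs something like this on Path at import time.
Path.mkdirp = _mkdirp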
Example #9
def install_slurmdbd_conf(lkp):
    """install slurmdbd.conf"""
    conf_options = NSDict({
        "control_host": lkp.control_host,
        "slurmlog": dirs.log,
        "state_save": slurmdirs.state,
        "db_name": "slurm_acct_db",
        "db_user": "******",
        "db_pass": '******',
        "db_host": "localhost",
        "db_port": "3306",
    })
    if lkp.cfg.cloudsql:
        secret_name = f"{cfg.slurm_cluster_name}-slurm-secret-cloudsql"
        payload = json.loads(access_secret_version(util.project, secret_name))

        if payload["db_name"] and payload["db_name"] != "":
            conf_options.db_name = payload["db_name"]
        if payload["user"] and payload["user"] != "":
            conf_options.db_user = payload["user"]
        if payload["password"] and payload["password"] != "":
            conf_options.db_pass = payload["password"]

        db_host_str = payload["server_ip"].split(":")
        if db_host_str[0] and db_host_str[0] != "":
            conf_options.db_host = db_host_str[0]
            conf_options.db_port = db_host_str[1] if len(
                db_host_str) >= 2 else "3306"

    conf_resp = project_metadata(
        f"{cfg.slurm_cluster_name}-slurm-tpl-slurmdbd-conf")
    conf = conf_resp.format(**conf_options)

    conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "slurmdbd.conf"
    conf_file_bak = conf_file.with_suffix(".conf.bak")
    if conf_file.is_file():
        shutil.copy2(conf_file, conf_file_bak)
    conf_file.write_text(conf)
    util.chown_slurm(conf_file, 0o600)
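For reference, the keys read above imply a CloudSQL secret payload shaped roughly like this (structure inferred from the code; values are illustrative):

payload = {
    "db_name": "slurm_acct_db",    # optional override
    "user": "slurm",               # optional override
    "password": "...",             # optional override
    "server_ip": "10.0.0.5:3306",  # "host" or "host:port"
}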
Example #10
import logging
import re
import sys
from collections import namedtuple
from pathlib import Path
from time import sleep

from google.cloud import pubsub_v1

import setup
import util
from util import project, lkp, cfg
from util import config_root_logger, handle_exception, run, publish_message

filename = Path(__file__).name
logfile = (Path(cfg.slurm_log_dir if cfg else ".") /
           filename).with_suffix(".log")
util.chown_slurm(logfile, mode=0o600)
config_root_logger(filename,
                   level="DEBUG",
                   util_level="DEBUG",
                   logfile=logfile)
log = logging.getLogger(filename)

project_id = project
subscription_id = lkp.hostname

subscriber = pubsub_v1.SubscriberClient()
subscription_path = subscriber.subscription_path(project_id, subscription_id)

StateTuple = namedtuple("StateTuple", "base,flags")
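The module presumably continues by starting a streaming pull on this subscription. A sketch using the standard pubsub_v1 API; the callback body is hypothetical (the real handler dispatches on the event payload):

def callback(message: pubsub_v1.subscriber.message.Message) -> None:
    # Hypothetical handler; the real module acts on the message contents.
    log.info(f"received: {message.data}")
    message.ack()

streaming_pull_future = subscriber.subscribe(subscription_path, callback=callback)
with subscriber:
    try:
        streaming_pull_future.result()
    except Exception:
        streaming_pull_future.cancel()
        streaming_pull_future.result()  # block until shutdown completes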

Example #11
    help="Force attempted creation of the nodelist, whether nodes are exclusive or not.",
)
parser.add_argument(
    "--debug", "-d", dest="debug", action="store_true", help="Enable debugging output"
)


if __name__ == "__main__":
    if "SLURM_JOB_NODELIST" in os.environ:
        argv = [
            *sys.argv[1:],
            os.environ["SLURM_JOB_NODELIST"],
            os.environ["SLURM_JOB_ID"],
        ]
        args = parser.parse_args(argv)
    else:
        args = parser.parse_args()

    util.chown_slurm(LOGFILE, mode=0o600)
    if args.debug:
        util.config_root_logger(
            filename, level="DEBUG", util_level="DEBUG", logfile=LOGFILE
        )
    else:
        util.config_root_logger(
            filename, level="INFO", util_level="ERROR", logfile=LOGFILE
        )
    sys.excepthook = util.handle_exception

    main(args.nodelist, args.job_id, args.force)
Example #12
def setup_controller():
    """Run controller setup"""
    log.info("Setting up controller")
    util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600)
    install_custom_scripts()

    install_slurm_conf(lkp)
    install_slurmdbd_conf(lkp)

    gen_cloud_conf()
    gen_cloud_gres_conf()
    install_gres_conf()
    install_cgroup_conf()

    setup_jwt_key()
    setup_munge_key()

    if cfg.controller_secondary_disk:
        setup_secondary_disks()
    setup_network_storage()

    run_custom_scripts()

    if not cfg.cloudsql:
        configure_mysql()

    run("systemctl enable slurmdbd", timeout=30)
    run("systemctl restart slurmdbd", timeout=30)

    # Wait for slurmdbd to come up
    time.sleep(5)

    sacctmgr = f"{slurmdirs.prefix}/bin/sacctmgr -i"
    result = run(f"{sacctmgr} add cluster {cfg.slurm_cluster_name}",
                 timeout=30,
                 check=False)
    if "already exists" in result.stdout:
        log.info(result.stdout)
    elif result.returncode > 1:
        result.check_returncode()  # will raise error

    run("systemctl enable slurmctld", timeout=30)
    run("systemctl restart slurmctld", timeout=30)

    run("systemctl enable slurmrestd", timeout=30)
    run("systemctl restart slurmrestd", timeout=30)

    # Export at the end to signal that everything is up
    run("systemctl enable nfs-server", timeout=30)
    run("systemctl start nfs-server", timeout=30)

    run("systemctl enable slurmeventd", timeout=30)
    run("systemctl restart slurmeventd", timeout=30)

    setup_nfs_exports()
    setup_sync_cronjob()

    log.info("Check status of cluster services")
    run("systemctl status munge", timeout=30)
    run("systemctl status slurmdbd", timeout=30)
    run("systemctl status slurmctld", timeout=30)
    run("systemctl status slurmrestd", timeout=30)
    run("systemctl status slurmeventd", timeout=30)

    slurmsync.sync_slurm()
    run("systemctl enable slurm_load_bq.timer", timeout=30)
    run("systemctl start slurm_load_bq.timer", timeout=30)
    run("systemctl status slurm_load_bq.timer", timeout=30)

    log.info("Done setting up controller")
    pass
Example #13
def install_custom_scripts(clean=False):
    """download custom scripts from project metadata"""
    script_pattern = re.compile(
        rf"{cfg.slurm_cluster_name}-slurm-(?P<path>\S+)-script-(?P<name>\S+)")
    metadata_keys = project_metadata("/").splitlines()

    def match_name(meta_key):
        m = script_pattern.match(meta_key)
        if not m:
            # key does not match, skip
            return None
        # returned path is `partition.d/<part_name>/<name>`
        # or `<controller/compute>.d/<name>`
        parts = m["path"].split("-")
        parts[0] += ".d"
        name, _, ext = m["name"].rpartition("_")
        name = ".".join((name, ext))
        return meta_key, Path(*parts, name)

    def filter_role(meta_entry):
        if not meta_entry:
            return False
        key, path = meta_entry
        # path is <role>.d/script.sh or partition.d/<part>/script.sh
        # role is <role> or 'partition', part is None or <part>
        role, part, *_ = chain(path.parent.parts, (None, ))
        role = role[:-2]  # strip off added '.d'

        # login only needs their login scripts
        if lkp.instance_role == "login":
            suffix = instance_metadata("attributes/slurm_login_suffix")
            script_types = [f"login_{suffix}"]
            return role in script_types
        # compute needs compute, prolog, epilog, and the matching partition
        if lkp.instance_role == "compute":
            script_types = ["compute", "prolog", "epilog"]
            return role in script_types or (
                part and part == lkp.node_partition_name())
        # controller downloads them all for good measure
        return True

    custom_scripts = list(filter(filter_role, map(match_name, metadata_keys)))
    log.info("installing custom scripts: {}".format(",".join(
        str(path) for key, path in custom_scripts)))

    if clean:
        path = Path(dirs.custom_scripts)
        if path.exists() and path.is_dir():
            # rm -rf custom_scripts
            shutil.rmtree(path)

    dirs.custom_scripts.mkdirp()
    for key, path in custom_scripts:
        fullpath = (dirs.custom_scripts / path).resolve()
        fullpath.parent.mkdirp()
        for par in path.parents:
            util.chown_slurm(dirs.custom_scripts / par)
        log.debug(path)
        content = project_metadata(key)
        fullpath.write_text(content)
        util.chown_slurm(fullpath, mode=0o755)
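To make the key-to-path mapping concrete, here is the match logic traced on a sample metadata key (cluster name, partition, and script name are made up):

import re
from pathlib import Path

pattern = re.compile(r"mycluster-slurm-(?P<path>\S+)-script-(?P<name>\S+)")
m = pattern.match("mycluster-slurm-partition-debug-script-hello_sh")
parts = m["path"].split("-")              # ["partition", "debug"]
parts[0] += ".d"                          # "partition.d"
name, _, ext = m["name"].rpartition("_")  # ("hello", "_", "sh")
print(Path(*parts, f"{name}.{ext}"))      # partition.d/debug/hello.sh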
Example #14
def install_gres_conf():
    """install gres.conf as a symlink to the generated cloud_gres.conf"""
    conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud_gres.conf"
    gres_conf = Path(lkp.cfg.output_dir or slurmdirs.etc) / "gres.conf"
    if not gres_conf.exists():
        gres_conf.symlink_to(conf_file)
    util.chown_slurm(gres_conf, mode=0o600)