def main(argv):
    """Dispatch a queue action given on the command line.

    The action only runs on the controller node whose ``sid`` equals
    ``MASTER_CONTROLLER_SID``; every other node logs a message and returns
    without doing anything.

    Args:
        argv: Raw command-line arguments handed to ``ArgsParser.parse``.

    Exits:
        40 when the arguments cannot be parsed, the action is not in
        ``ACTION_MAP``, or the ``queue`` directive cannot be decoded;
        1 (after logging the traceback) on any unexpected exception;
        otherwise the value returned by the action handler.
    """
    # Local import: only needed on the failure path, and keeps this block
    # independent of the module's import list.
    import traceback

    try:
        # Only the single master controller node executes this command.
        role = get_role()
        cluster_info = get_cluster_info()
        if role != ROLE_CONTROLLER or \
                int(cluster_info["sid"]) != MASTER_CONTROLLER_SID:
            logger.info("queue action[%s] not handler on non-master [%s|%s].",
                        argv, role, cluster_info["sid"])
            return

        parser = ArgsParser()
        ret = parser.parse(argv)
        if not ret:
            sys.exit(40)

        if parser.action in ACTION_MAP:
            if parser.directive:
                queue = parser.directive["queue"]
                if isinstance(queue, str):
                    # The directive may arrive as a JSON string; json_loads
                    # is expected to hand back None when decoding fails.
                    queue = json_loads(queue)
                    if queue is None:
                        logger.error("Failed to load queue[%s] to json!",
                                     parser.directive)
                        sys.exit(40)
                ret = ACTION_MAP[parser.action](queue)
            else:
                ret = ACTION_MAP[parser.action]()
            sys.exit(ret)
        else:
            logger.error("can not handle the action[%s].", parser.action)
            sys.exit(40)
    except Exception:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the deliberate exits above pass straight through this handler.
        logger.error("Failed to handle queue action[%s]: [%s]",
                     argv, traceback.format_exc())
        sys.exit(1)
def main(argv):
    """Dispatch a software action given on the command line.

    Does nothing unless this node is the controller whose ``sid`` equals
    ``MASTER_CONTROLLER_SID``.

    Args:
        argv: Raw command-line arguments handed to ``ArgsParser.parse``.

    Exits:
        40 when the arguments cannot be parsed, the action is not in
        ``ACTION_MAP``, or the ``software`` directive cannot be decoded;
        1 (after logging the traceback) on any unexpected exception;
        otherwise the value returned by the action handler.
    """
    try:
        # Only the single master controller node executes this command.
        role = get_role()
        cluster_info = get_cluster_info()
        on_master = (role == ROLE_CONTROLLER
                     and int(cluster_info["sid"]) == MASTER_CONTROLLER_SID)
        if not on_master:
            return

        parser = ArgsParser()
        if not parser.parse(argv):
            sys.exit(40)

        if parser.action not in ACTION_MAP:
            logger.error("can not handle the action[%s].", parser.action)
            sys.exit(40)

        if not parser.directive:
            # No directive: the handler is invoked without arguments.
            sys.exit(ACTION_MAP[parser.action]())

        payload = parser.directive["software"]
        if isinstance(payload, str):
            # A string directive is decoded first; a None result is
            # treated as a decoding failure.
            payload = json_loads(payload)
            if payload is None:
                logger.error("Failed to load software[%s] to json!",
                             parser.directive)
                sys.exit(40)
        sys.exit(ACTION_MAP[parser.action](payload))
    except Exception:
        # SystemExit raised by sys.exit() is not caught here.
        logger.error("Failed to update software: [%s]",
                     traceback.format_exc())
        sys.exit(1)
def health_check():
    """Check the services registered for this node's role.

    Returns:
        The first non-zero status reported by ``check_service_status``,
        or 0 when every service reports 0 (or the role has no services).
    """
    # Lazy generator: checking stops at the first failing service.
    statuses = (
        check_service_status(name)
        for name in ROLE_SERVICES.get(get_role(), "")
    )
    return next((status for status in statuses if status != 0), 0)
def stop():
    """Stop every systemd service registered for this node's role.

    Returns:
        0 after issuing the stop commands, or 1 when the role has no
        entry in ``ROLE_SERVICES``.
    """
    role = get_role()
    if role not in ROLE_SERVICES:
        logger.error("Un-support role[%s].", role)
        return 1

    for unit in ROLE_SERVICES[role]:
        run_shell("systemctl stop {}".format(unit))
    return 0
def restart():
    """Restart every systemd service registered for this node's role.

    Returns:
        0 after issuing the restart commands, or 1 when the role has no
        entry in ``ROLE_SERVICES``.
    """
    role = get_role()
    if role not in ROLE_SERVICES:
        logger.error("Un-support role[%s].", role)
        return 1

    for unit in ROLE_SERVICES[role]:
        run_shell("systemctl restart {}".format(unit))
    logger.info("%s re-started.", role)
    return 0
def metadata_reload():
    """Regenerate hosts and, on a controller, refresh the slurm configuration.

    Returns:
        Always 0.
    """
    logger.info("generate hosts for reloading..")
    generate_hosts()

    # Nodes other than controllers only need the hosts refresh.
    if get_role() != ROLE_CONTROLLER:
        return 0

    logger.info("update slurm conf for reloading metadata..")
    update_slurm_conf()
    # TODO: with multiple controller nodes, running this on a single
    # master node is enough.
    logger.info("re-config slurm configuration for cluster..")
    run_shell("scontrol reconfigure")
    return 0
def start():
    """Prepare shared storage, start this node's services, run post-start setup.

    Steps, in order:
      1. Reject a role that has no entry in ``ROLE_SERVICES``.
      2. Create the state-save directory on the NAS and link ``/opt/slurm``
         to the NAS copy.
      3. On a controller, generate the slurm configuration; on any other
         role, remove the files listed in ``clear_files`` for that role.
      4. Start every service registered for the role.
      5. On the master controller only, create the admin directories and
         admin user, then install software.

    Returns:
        1 when the role is unsupported; the result of ``init_software()``
        on the master controller; 0 otherwise.
    """
    role = get_role()
    # Fail fast: an unsupported role used to reach ``clear_files[role]``
    # (after the NAS had already been touched) before the error path below
    # could report it.
    if role not in ROLE_SERVICES:
        logger.error("Un-support role[%s].", role)
        return 1

    nas_mount_point = get_nas_mount_point()
    cluster_name = get_cluster_name()

    # <nas_mount_point>/opt/slurm/state_save_loc/<cluster> backs
    # StateSaveLocation.
    run_shell("mkdir -p {}/opt/slurm/state_save_loc/{}/".format(
        nas_mount_point, cluster_name))
    # -n: when /opt/slurm is already a symlink to the NAS directory, replace
    # the link itself instead of following it and dropping a nested
    # "slurm" link inside the NAS directory on every start.
    run_shell("ln -sfn {}/opt/slurm/ /opt/slurm".format(nas_mount_point))

    # Pre-start step.
    if role == ROLE_CONTROLLER:
        logger.info("Generating slurm configurations...")
        generate_slurm_conf()
    else:
        for stale in clear_files[role]:
            if path.exists(stale):
                run_shell("rm {}".format(stale))

    # Start the services.
    for service in ROLE_SERVICES[role]:
        logger.info("Start service %s", service)
        run_shell("systemctl start {}".format(service))

    # Post-start step: only the master controller initialises shared data.
    cluster_info = get_cluster_info()
    if role == ROLE_CONTROLLER and \
            int(cluster_info["sid"]) == MASTER_CONTROLLER_SID:
        logger.info("create admin dirs..")
        run_shell("mkdir -p {}/opt".format(nas_mount_point))
        run_shell("mkdir -p {}/home/".format(nas_mount_point))
        run_shell("mkdir -p {}/data/".format(nas_mount_point))

        # Create the admin user.
        add_admin_user()

        # Install software; its result is this function's return value.
        return init_software()

    logger.info("%s started.", role)
    return 0
def service_in(hostname):
    """Restart the component for the host's role, then call its ``service_in``.

    Args:
        hostname: Host passed to ``get_role`` and to both component calls.
    """
    role = get_role(hostname)
    component = components[role]
    # Restart first, then bring the host into service.
    component.restart(hostname)
    component.service_in(hostname)
def service_out(hostname):
    """Call ``service_out`` on the component that handles the host's role.

    Args:
        hostname: Host passed to ``get_role`` and to the component call.
    """
    role = get_role(hostname)
    component = components[role]
    component.service_out(hostname)