def remote_config_update(config, args, check_module=False):
    '''
    client end (infra/NFS node) config file update
    ./ctl.py -s svc configupdate restfulapi
    ./ctl.py [-r storage_machine1 [-r storage_machine2]] -s svc configupdate storage_manager
    by default sudo
    '''
    if check_module:
        assert set(args.nargs[1:]) - set([
            "restfulapi", "storagemanager", "repairmanager", "dashboard"
        ]) == set(), "not supported"
    # need to get node list for this subcommand of svc, so load status.yaml
    if not os.path.exists(FILE_MAP_PATH):
        utils.render_template("template/cloud-config/file_map.yaml",
                              FILE_MAP_PATH, config)
    with open(FILE_MAP_PATH) as f:
        file_map = yaml.load(f)
    for module in args.nargs[1:]:
        if module == "jobmanager":
            module = "restfulapi"
        if module in ["restfulapi", "dashboard", "repairmanager"]:
            render_func = eval("render_{}".format(module))
            render_func(config)
            infra_nodes, _ = load_node_list_by_role_from_config(
                config, ["infra"], False)
            for file_pair in file_map[module]:
                src_dst_list = [file_pair["src"], file_pair["dst"]]
                execute_in_parallel(config, infra_nodes, src_dst_list, True,
                                    copy2_wrapper,
                                    noSupressWarning=args.verbose)
        elif module == "storagemanager":
            nfs_nodes, _ = load_node_list_by_role_from_config(
                config, ["nfs"], False)
            for node in nfs_nodes:
                config["storage_manager"] = config["machines"][node][
                    "storage_manager"]
                render_storagemanager(config, node)
                src_dst_list = [
                    "./deploy/StorageManager/{}_storage_manager.yaml".format(
                        node), "/etc/StorageManager/config.yaml"
                ]
                args_list = (config["machines"][node]["fqdns"],
                             config["ssh_cert"], config["admin_username"],
                             src_dst_list, True, args.verbose)
                copy2_wrapper(args_list)
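# A minimal sketch of the file map consumed by remote_config_update, inferred
# only from how the code reads it (per-module lists of src/dst pairs). The
# module keys match the supported modules above; the paths are illustrative
# placeholders, not the actual contents of template/cloud-config/file_map.yaml:
#
#   restfulapi:
#     - src: ./deploy/RestfulAPI/config.yaml
#       dst: /etc/RestfulAPI/config.yaml
#   dashboard:
#     - src: ./deploy/dashboard/config.yaml
#       dst: /etc/dashboard/config.yaml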
def dynamically_add_or_delete_around_a_num(config, args):
    # need some time for the newly added worker to register
    monitor_again_after = config.get("monitor_again_after", 10)
    while True:
        # TODO currently don't keep history of operation here. or name the bash by time?
        os.system("rm -f {}".format(args.output))
        config = load_config_based_on_command("dynamic_around")
        dynamic_worker_num = config.get("dynamic_worker_num", -1)
        if dynamic_worker_num < 0:
            print(
                "This round would be skipped. Please specify dynamic_worker_num in config."
            )
            os.system("sleep {}m".format(monitor_again_after))
            continue
        query_cmds = "get nodes -l worker=active --no-headers | awk '{print $1}'"
        k8s_worker_nodes = get_k8s_node_list_under_condition(
            config, args, query_cmds)
        worker_in_records, config = load_node_list_by_role_from_config(
            config, ["worker"], False)
        print("worker in records:\n", worker_in_records)
        print(
            "Dynamically scaling number of workers:\n {}/{} worker nodes registered in k8s, targeting {}"
            .format(len(k8s_worker_nodes), len(worker_in_records),
                    dynamic_worker_num))
        delta = dynamic_worker_num - len(worker_in_records)
        if delta > 0:
            add_n_machines(config, args, delta)
        elif delta < 0:
            delete_specified_or_cordoned_idling_nodes(config, args, -delta)
        os.system("sleep {}m".format(monitor_again_after))
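# A minimal sketch of the config keys the scaling loop above relies on, as
# they might appear in the cluster config reloaded each round via
# load_config_based_on_command("dynamic_around"). The key names come straight
# from the code; the values are illustrative assumptions:
#
#   monitor_again_after: 10      # minutes to sleep between scaling rounds
#   dynamic_worker_num: 8        # target number of worker machines in records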
def create_nfs_nsg(config, args):
    nfs_nsg_name = config["azure_cluster"]["nfs_nsg_name"]
    resource_group = config["azure_cluster"]["resource_group"]
    nfs_ports = config["cloud_config_nsg_rules"]["nfs_ports"]
    nfs_nodes, config = load_node_list_by_role_from_config(config, ["nfs"])
    infra_nodes, config = load_node_list_by_role_from_config(config, ["infra"])
    if len(set(nfs_nodes) - set(infra_nodes)):
        cmd = """az network nsg create --resource-group {} --name {}""".format(
            resource_group, nfs_nsg_name)
        execute_or_dump_locally(cmd, args.verbose, args.dryrun, args.output)
        priority = 1700
        # set nsg rules for devs, (and samba, since samba machines are all in corpnet)
        for tag in config["cloud_config_nsg_rules"]["service_tags"]:
            create_nsg_rule(resource_group, nfs_nsg_name, priority,
                            "NFS-Allow-Dev-{}".format(tag), nfs_ports, tag,
                            args)
            priority += 1
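# A minimal sketch of the NSG-related config consumed by create_nfs_nsg. The
# key names are taken from the code; the values are illustrative only and do
# not reflect any particular deployment:
#
#   azure_cluster:
#     resource_group: my-resource-group
#     nfs_nsg_name: my-cluster-nfs-nsg
#   cloud_config_nsg_rules:
#     nfs_ports: "111,2049"
#     service_tags:
#       - CorpNetSaw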
def connect_to_machine(config, args):
    if args.nargs[0] in config['allroles']:
        target_role = args.nargs[0]
        index = int(args.nargs[1]) if len(args.nargs) > 1 else 0
        nodes, _ = load_node_list_by_role_from_config(config, [target_role])
        node = nodes[index]
    else:
        node = args.nargs[0]
        assert node in config["machines"]
    utils.SSH_connect(config["ssh_cert"],
                      config["machines"][node]["admin_username"],
                      config["machines"][node]["fqdns"])
def get_multiple_machines(config, args):
    valid_roles = set(config['allroles']) & set(args.roles_or_machine)
    valid_machine_names = set(config['machines']) & set(args.roles_or_machine)
    invalid_rom = set(args.roles_or_machine) - \
        valid_roles - valid_machine_names
    if invalid_rom:
        print("Warning: invalid roles/machine names detected, the following "
              "names are neither valid role names nor machines in our "
              "cluster: " + ",".join(list(invalid_rom)))
    nodes, _ = load_node_list_by_role_from_config(config, list(valid_roles),
                                                  False)
    return nodes + list(valid_machine_names)
def run_kubectl(config, args, commands, need_output=False, dump_to_file=''):
    if not os.path.exists("./deploy/bin/kubectl"):
        print(
            "please make sure ./deploy/bin/kubectl exists. One way is to use ./ctl.py download"
        )
        exit(-1)
    one_command = " ".join(commands)
    nodes, _ = load_node_list_by_role_from_config(config, ["infra"], False)
    master_node = random.choice(nodes)
    kube_command = "./deploy/bin/kubectl --server=https://{}:{} --certificate-authority={} --client-key={} --client-certificate={} {}".format(
        config["machines"][master_node]["fqdns"], config["k8sAPIport"],
        "./deploy/ssl/ca/ca.pem", "./deploy/ssl/kubelet/apiserver-key.pem",
        "./deploy/ssl/kubelet/apiserver.pem", one_command)
    if need_output:
        # we may want to dump command to another file instead of args.output,
        # when we don't want to mix k8s commands with others
        output = utils.execute_or_dump_locally(kube_command, args.verbose,
                                               args.dryrun, dump_to_file)
        if not args.verbose:
            print(output)
        return output
    else:
        os.system(kube_command)
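# Illustrative usage of run_kubectl (hypothetical call site, not part of this
# module): list cluster nodes through the wrapper and dump the generated
# kubectl command to a separate file so it is not mixed with other commands
# written to args.output:
#
#   run_kubectl(config, args, ["get nodes --no-headers"],
#               need_output=True, dump_to_file="./deploy/kubectl_nodes.sh")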