def RunCommand(command):
    """Run a queued command against a job via ``kubectl exec`` and mark it done.

    Args:
        command: mapping providing "jobId" (the exec target), "command" (text
            appended after the target on the kubectl command line) and "id"
            (identifier handed to ``DataHandler.FinishCommand``).

    Returns:
        True after the command was executed and recorded as finished.
    """
    dataHandler = DataHandler()
    try:
        k8sUtils.kubectl_exec("exec %s %s" %
                              (command["jobId"], command["command"]))
        dataHandler.FinishCommand(command["id"])
    finally:
        # Always release the handler, even if kubectl or the update raises.
        dataHandler.Close()
    return True
def setup_tensorboard(user_name, pod_name):
    """Install TensorBoard in a pod and start it on a random port.

    The port is drawn from 40000-49999. TensorBoard is launched in the
    background as ``user_name`` with its log directory under
    ``~/tensorboard/${DLWS_JOB_ID}/logs``.

    Args:
        user_name: account passed to ``runuser -l`` inside the container.
        pod_name: pod to run ``kubectl exec`` against.

    Returns:
        The port number TensorBoard was asked to listen on.

    Raises:
        Exception: if ``kubectl exec`` produced an empty output.
    """
    tensorboard_port = random.randint(40000, 49999)

    # The backslash keeps ${DLWS_JOB_ID} literal in the emitted command text.
    log_dir = "~/tensorboard/\\${DLWS_JOB_ID}/logs"
    user_cmd = ("mkdir -p " + log_dir + "; nohup tensorboard --logdir=" +
                log_dir + " --port=" + str(tensorboard_port) +
                " &>/dev/null &")
    bash_script = ("bash -c 'export DEBIAN_FRONTEND=noninteractive; "
                   "pip install tensorboard; runuser -l " + user_name +
                   " -c \"" + user_cmd + "\"'")

    result = k8sUtils.kubectl_exec("exec %s  -- %s" % (pod_name, bash_script))
    if result == "":
        raise Exception(
            "Failed to start tensorboard in container. JobId: %s " % pod_name)
    return tensorboard_port
def start_ssh_server(pod_name, user_name, host_network=False, ssh_port=22):
    '''Install and restart sshd inside a pod; return the port value used.

    With ``host_network`` set, the script also rewrites ``Port 22`` in
    /etc/ssh/sshd_config to ``ssh_port``; if ``ssh_port`` is still the
    default 22, a random port in 40000-49999 is chosen first.

    Raises:
        Exception: if ``kubectl exec`` produced an empty output.
    '''
    port_rewrite = ""
    if host_network:
        # Pick a random port when the caller left the default in place.
        if ssh_port == 22:
            ssh_port = random.randint(40000, 49999)
        port_rewrite = ("sed -i \"s/^Port 22/Port " + str(ssh_port) +
                        "/\" /etc/ssh/sshd_config && ")

    # TODO refine the script later
    bash_script = (
        "sudo bash -c 'apt-get update && apt-get install -y openssh-server && "
        + port_rewrite + "cd /home/" + user_name + " && (chown " + user_name +
        " -R .ssh; chmod 600 -R .ssh/*; chmod 700 .ssh; true)"
        " && service ssh restart'")

    # TODO setup reasonable timeout
    result = k8sUtils.kubectl_exec("exec %s  -- %s" % (pod_name, bash_script))
    if result == "":
        raise Exception(
            "Failed to setup ssh server in container. JobId: %s " % pod_name)
    return ssh_port
def is_ssh_server_ready(pod_name):
    """Return False when ``sudo service ssh status`` in the pod prints nothing.

    Any other output from ``kubectl exec`` is treated as "ready".
    """
    status_cmd = "sudo service ssh status"
    result = k8sUtils.kubectl_exec("exec %s  -- %s" % (pod_name, status_cmd))
    return not (result == "")
def is_user_ready(pod_name):
    """Return False when listing /dlws/USER_READY in the pod prints nothing.

    Any other output from ``kubectl exec`` is treated as "ready".
    """
    probe = "bash -c 'ls /dlws/USER_READY'"
    result = k8sUtils.kubectl_exec("exec %s  -- %s" % (pod_name, probe))
    return not (result == "")
def setup_jupyter_server(user_name, pod_name):
    """Install Jupyter in a pod and start a notebook server on a random port.

    The port is drawn from 40000-49999. The server is started in the
    background as ``user_name`` with an empty token.

    Args:
        user_name: account used for ``runuser -l`` and the /home directory.
        pod_name: pod to run ``kubectl exec`` against.

    Returns:
        The port number the notebook server was asked to listen on.

    Raises:
        Exception: if ``kubectl exec`` produced an empty output.
    """
    jupyter_port = random.randint(40000, 49999)

    notebook_cmd = ("jupyter notebook --no-browser --ip=0.0.0.0 "
                    "--NotebookApp.token= --port=" + str(jupyter_port) +
                    " &>/dev/null &")
    steps = [
        "export DEBIAN_FRONTEND=noninteractive; apt-get update",
        "apt-get install -y python3-pip",
        "python3 -m pip install --upgrade pip",
        "python3 -m pip install jupyter",
        "cd /home/" + user_name,
        "runuser -l " + user_name + " -c \"" + notebook_cmd + "\"",
    ]
    bash_script = "bash -c '" + " && ".join(steps) + "'"

    result = k8sUtils.kubectl_exec("exec %s  -- %s" % (pod_name, bash_script))
    if result == "":
        raise Exception(
            "Failed to start jupyter server in container. JobId: %s " %
            pod_name)
    return jupyter_port
def start_ssh_server(pod_name):
    '''Start the (already installed) ssh service inside a pod.

    Nothing is returned; the function only raises when ``kubectl exec``
    produced an empty output.
    '''
    # assume ssh server already setup
    start_cmd = "service ssh start"

    # TODO setup reasonable timeout
    result = k8sUtils.kubectl_exec("exec %s  -- %s" % (pod_name, start_cmd))
    if result == "":
        raise Exception(
            "Failed to setup ssh server in container. JobId: %s " % pod_name)
def RunCommand(command):
    """Run a queued command via ``kubectl exec``, log it and store its output.

    Args:
        command: mapping providing "jobId" (the exec target), "command" (text
            appended after the target on the kubectl command line) and "id"
            (identifier handed to ``DataHandler.FinishCommand``).

    Returns:
        True after the command was executed and its output recorded.
    """
    dataHandler = DataHandler()
    try:
        logging.info("Job %s exec command: [%s]" %
                     (command["jobId"], command["command"]))
        output = k8sUtils.kubectl_exec("exec %s %s" %
                                       (command["jobId"], command["command"]))
        logging.info("exec output:\n %s" % (output))
        dataHandler.FinishCommand(command["id"], output)
    finally:
        # Always release the handler, even if kubectl or the update raises.
        dataHandler.Close()
    return True
def query_ssh_port(pod_name):
    """Read the ``Port`` entry of /usr/etc/sshd_config inside a pod.

    Returns:
        The port as an int, or 22 when the exec result is falsy but not the
        empty string.

    Raises:
        RuntimeError: if ``kubectl exec`` returned an empty string.
    """
    grep_cmd = "\"grep ^Port /usr/etc/sshd_config | cut -d' ' -f2\""
    result = k8sUtils.kubectl_exec("exec %s  -- /bin/bash -c %s" %
                                   (pod_name, grep_cmd))
    if result == "":
        raise RuntimeError("Query ssh port failed: {}".format(pod_name))
    # Falsy non-string results (e.g. None) fall back to the default port.
    return int(result) if result else 22
def is_server_ready(endpoint):
    """Check whether the service behind an endpoint shows up in ``ps``.

    Args:
        endpoint: mapping with "podName" and "name"; only the names
            "ipython", "tensorboard" and "vscode" are probed.

    Returns:
        False when the probe command printed nothing; True otherwise,
        including for endpoint names that have no probe.
    """
    pod_name = endpoint["podName"]
    port_name = endpoint["name"]

    probes = {
        "ipython": "ps -ef|grep jupyter-lab",
        "tensorboard": "ps -ef|grep tensorboard",
        "vscode": "ps -ef|grep code-server",
    }
    probe = probes.get(port_name)
    if not probe:
        return True

    result = k8sUtils.kubectl_exec("exec %s  -- %s" % (pod_name, probe))
    return not (result == "")
def query_ssh_port(pod_name):
    """Return the ``Port`` value from /etc/ssh/sshd_config in a pod as an int.

    The exec output is passed to ``int()`` unchanged, so non-numeric output
    raises from that conversion.
    """
    grep_cmd = "grep ^Port /etc/ssh/sshd_config | cut -d' ' -f2"
    raw_port = k8sUtils.kubectl_exec("exec %s  -- %s" % (pod_name, grep_cmd))
    return int(raw_port)
def get_cluster_status():
    """Collect per-node and per-user GPU usage and persist it when it changed.

    Node and pod descriptions are fetched with ``kubectl get ... -o yaml``.
    Failures while collecting are logged and leave ``cluster_status``
    partially filled (best effort); the job counters are always added.

    Returns:
        dict with "user_status", "gpu_avaliable", "gpu_capacity",
        "gpu_unschedulable", "gpu_used", "gpu_reserved" and "node_status"
        (when collection succeeded) plus "AvaliableJobNum" and "TotalJobNum".
    """
    cluster_status = {}
    gpuStr = "alpha.kubernetes.io/nvidia-gpu"
    try:
        # NOTE(review): yaml.load is called without an explicit Loader; the
        # input is kubectl output, but consider yaml.safe_load.
        output = k8sUtils.kubectl_exec(" get nodes -o yaml")
        nodeInfo = yaml.load(output)
        nodes_status = {}
        user_status = {}

        if "items" in nodeInfo:
            for node in nodeInfo["items"]:
                node_status = {}
                node_status["name"] = node["metadata"]["name"]
                node_status["labels"] = node["metadata"]["labels"]
                if gpuStr in node["status"]["allocatable"]:
                    node_status["gpu_allocatable"] = int(
                        node["status"]["allocatable"][gpuStr])
                else:
                    node_status["gpu_allocatable"] = 0
                if gpuStr in node["status"]["capacity"]:
                    node_status["gpu_capacity"] = int(
                        node["status"]["capacity"][gpuStr])
                else:
                    node_status["gpu_capacity"] = 0
                node_status["gpu_used"] = 0
                node_status["InternalIP"] = "unknown"
                node_status["pods"] = []

                if "addresses" in node["status"]:
                    for addr in node["status"]["addresses"]:
                        if addr["type"] == "InternalIP":
                            node_status["InternalIP"] = addr["address"]

                # Labels whose value is "active" name the scheduled services.
                node_status["scheduled_service"] = []
                for l, s in node_status["labels"].iteritems():
                    if s == "active" and l != "all" and l != "default":
                        node_status["scheduled_service"].append(l)

                node_status["unschedulable"] = bool(
                    "unschedulable" in node["spec"] and
                    node["spec"]["unschedulable"])

                # A node whose Ready condition is "Unknown" cannot take work.
                # Iterate the conditions list itself, not the status keys.
                if "status" in node and "conditions" in node["status"]:
                    for condi in node["status"]["conditions"]:
                        if ("type" in condi and condi["type"] == "Ready" and
                                "status" in condi and
                                condi["status"] == "Unknown"):
                            node_status["unschedulable"] = True

                nodes_status[node_status["name"]] = node_status

        output = k8sUtils.kubectl_exec(" get pods -o yaml")
        podsInfo = yaml.load(output)
        if "items" in podsInfo:
            for pod in podsInfo["items"]:
                gpus = 0
                username = None
                if ("metadata" in pod and "labels" in pod["metadata"] and
                        "userName" in pod["metadata"]["labels"]):
                    username = pod["metadata"]["labels"]["userName"]

                if "spec" in pod and "nodeName" in pod["spec"]:
                    node_name = pod["spec"]["nodeName"]
                    pod_name = pod["metadata"]["name"]
                    if username is not None:
                        pod_name += " : " + username

                    gpuUsage = get_job_gpu_usage(pod["metadata"]["name"])
                    if gpuUsage is not None:
                        pod_name += " (gpu usage:" + str(gpuUsage) + "%)"
                        # Flag pods that use a quarter of their GPUs or less.
                        if gpuUsage <= 25:
                            pod_name += "!!!!!!"

                    if "containers" in pod["spec"]:
                        for container in pod["spec"]["containers"]:
                            if ("resources" in container and
                                    "requests" in container["resources"] and
                                    gpuStr in
                                    container["resources"]["requests"]):
                                raw_request = container["resources"][
                                    "requests"][gpuStr]
                                gpus += int(raw_request)
                                # str(): YAML may parse the request as an int.
                                pod_name += " (gpu #:" + str(raw_request) + ")"

                    if node_name in nodes_status:
                        nodes_status[node_name]["gpu_used"] += gpus
                        nodes_status[node_name]["pods"].append(pod_name)

                if username is not None:
                    if username not in user_status:
                        user_status[username] = gpus
                    else:
                        user_status[username] += gpus

        gpu_avaliable = 0
        gpu_reserved = 0
        gpu_capacity = 0
        gpu_unschedulable = 0
        gpu_used = 0

        for node_name, node_status in nodes_status.iteritems():
            if node_status["unschedulable"]:
                gpu_unschedulable += node_status["gpu_capacity"]
            else:
                gpu_avaliable += (node_status["gpu_allocatable"] -
                                  node_status["gpu_used"])
                gpu_unschedulable += (node_status["gpu_capacity"] -
                                      node_status["gpu_allocatable"])
                gpu_reserved += (node_status["gpu_capacity"] -
                                 node_status["gpu_allocatable"])
            gpu_used += node_status["gpu_used"]
            gpu_capacity += node_status["gpu_capacity"]

        cluster_status["user_status"] = [{
            "userName": user_name,
            "userGPU": user_gpu
        } for user_name, user_gpu in user_status.iteritems()]

        cluster_status["gpu_avaliable"] = gpu_avaliable
        cluster_status["gpu_capacity"] = gpu_capacity
        cluster_status["gpu_unschedulable"] = gpu_unschedulable
        cluster_status["gpu_used"] = gpu_used
        cluster_status["gpu_reserved"] = gpu_reserved
        cluster_status["node_status"] = [
            node_status for node_name, node_status in nodes_status.iteritems()
        ]
    except Exception:
        # Best effort: keep going with whatever was collected so far.
        logging.exception("get cluster status")

    dataHandler = DataHandler()
    try:
        cluster_status["AvaliableJobNum"] = dataHandler.GetActiveJobsCount()
        cluster_status["TotalJobNum"] = dataHandler.GetALLJobsCount()
        if "cluster_status" in config and check_cluster_status_change(
                config["cluster_status"], cluster_status):
            logging.info("updating the cluster status...")
            dataHandler.UpdateClusterStatus(cluster_status)
        else:
            logging.info(
                "nothing changed in cluster, skipping the cluster status update..."
            )
        # Cache a private copy so later comparisons see this snapshot.
        config["cluster_status"] = copy.deepcopy(cluster_status)
    finally:
        dataHandler.Close()
    return cluster_status
def get_cluster_status():
    """Collect per-node and per-user GPU usage and persist it when it changed.

    Node and pod descriptions come from ``kubectl get ... -o yaml``. GPU
    quantities are held as ``ResourceInfo`` values keyed by the node's
    "gpuType" label and stored in their ``ToSerializable()`` form.
    Exceptions raised while collecting are logged and collection stops; the
    job counter is still added afterwards.

    Returns:
        dict with "user_status", "gpu_avaliable", "gpu_capacity",
        "gpu_unschedulable", "gpu_used", "gpu_reserved" and "node_status"
        (when collection succeeded) plus "AvaliableJobNum".
    """
    cluster_status = {}
    gpuStr = "nvidia.com/gpu"
    try:
        # NOTE(review): yaml.load is called without an explicit Loader; the
        # input is kubectl output, but consider yaml.safe_load.
        output = k8sUtils.kubectl_exec(" get nodes -o yaml")
        nodeInfo = yaml.load(output)
        nodes_status = {}
        user_status = {}
        if "items" in nodeInfo:
            for node in nodeInfo["items"]:
                node_status = {}
                node_status["name"] = node["metadata"]["name"]
                node_status["labels"] = node["metadata"]["labels"]
                node_status["gpuType"] = ""
                node_status["scheduled_service"] = []
                # Labels with value "active" name scheduled services; the
                # "gpuType" label's value is recorded as well.
                for l, s in node_status["labels"].iteritems():
                    if s == "active" and l != "all" and l != "default":
                        node_status["scheduled_service"].append(l)
                    if l == "gpuType":
                        node_status["scheduled_service"].append(s)
                        node_status["gpuType"] = s
                if (gpuStr in node["status"]["allocatable"]):
                    node_status["gpu_allocatable"] = ResourceInfo({
                        node_status["gpuType"]:
                        int(node["status"]["allocatable"][gpuStr])
                    }).ToSerializable()
                else:
                    node_status["gpu_allocatable"] = ResourceInfo(
                    ).ToSerializable()
                if (gpuStr in node["status"]["capacity"]):
                    node_status["gpu_capacity"] = ResourceInfo({
                        node_status["gpuType"]:
                        int(node["status"]["capacity"][gpuStr])
                    }).ToSerializable()
                else:
                    node_status["gpu_capacity"] = ResourceInfo(
                    ).ToSerializable()
                node_status["gpu_used"] = ResourceInfo().ToSerializable()
                node_status["InternalIP"] = "unknown"
                node_status["pods"] = []
                # The device-information annotation overrides the counts
                # above whenever it reports a larger number of GPUs.
                if "annotations" in node["metadata"]:
                    if "node.alpha/DeviceInformation" in node["metadata"][
                            "annotations"]:
                        node_info = json.loads(
                            node["metadata"]["annotations"]
                            ["node.alpha/DeviceInformation"])
                        if (int(node_info["capacity"]["alpha.gpu/numgpu"]) >
                                ResourceInfo(
                                    node_status["gpu_capacity"]).TotalCount()):
                            node_status["gpu_capacity"] = ResourceInfo({
                                node_status["gpuType"]:
                                int(node_info["capacity"]["alpha.gpu/numgpu"])
                            }).ToSerializable()
                        if (int(node_info["allocatable"]["alpha.gpu/numgpu"]) >
                                ResourceInfo(node_status["gpu_allocatable"]
                                             ).TotalCount()):
                            node_status["gpu_allocatable"] = ResourceInfo({
                                node_status["gpuType"]:
                                int(node_info["allocatable"]
                                    ["alpha.gpu/numgpu"])
                            }).ToSerializable()
                if "addresses" in node["status"]:
                    for addr in node["status"]["addresses"]:
                        if addr["type"] == "InternalIP":
                            node_status["InternalIP"] = addr["address"]
                if "unschedulable" in node["spec"] and node["spec"][
                        "unschedulable"]:
                    node_status["unschedulable"] = True
                else:
                    node_status["unschedulable"] = False
                # A Ready condition with status "Unknown" also marks the node
                # as unschedulable.
                if "status" in node and "conditions" in node["status"]:
                    for condi in node["status"]["conditions"]:
                        if "type" in condi and condi[
                                "type"] == "Ready" and "status" in condi and condi[
                                    "status"] == "Unknown":
                            node_status["unschedulable"] = True
                nodes_status[node_status["name"]] = node_status
        output = k8sUtils.kubectl_exec(" get pods -o yaml")
        podsInfo = yaml.load(output)
        if "items" in podsInfo:
            for pod in podsInfo["items"]:
                gpus = 0
                username = None
                if "metadata" in pod and "labels" in pod[
                        "metadata"] and "userName" in pod["metadata"]["labels"]:
                    username = pod["metadata"]["labels"]["userName"]
                if "spec" in pod and "nodeName" in pod["spec"]:
                    node_name = pod["spec"]["nodeName"]
                    # pod_name doubles as the display string stored in the
                    # node's "pods" list; details are appended below.
                    pod_name = pod["metadata"]["name"]
                    if username is not None:
                        pod_name += " : " + username
                    gpuUsage = get_job_gpu_usage(pod["metadata"]["name"])
                    if gpuUsage is not None:
                        pod_name += " (gpu usage:" + str(gpuUsage) + "%)"
                        if gpuUsage <= 25:
                            pod_name += "!!!!!!"
                    pod_info_cont = {}
                    pod_info_initcont = {}
                    if "annotations" in pod["metadata"]:
                        if "pod.alpha/DeviceInformation" in pod["metadata"][
                                "annotations"]:
                            pod_info = json.loads(
                                pod["metadata"]["annotations"]
                                ["pod.alpha/DeviceInformation"])
                            if "runningcontainer" in pod_info:
                                pod_info_cont = pod_info["runningcontainer"]
                            if "initcontainer" in pod_info:
                                pod_info_initcont = pod_info["initcontainer"]
                    if "containers" in pod["spec"]:
                        for container in pod["spec"]["containers"]:
                            containerGPUs = 0
                            if "resources" in container and "requests" in container[
                                    "resources"] and gpuStr in container[
                                        "resources"]["requests"]:
                                containerGPUs = int(
                                    container["resources"]["requests"][gpuStr])
                            # Use the larger of the resource request and the
                            # count given in the pod annotation.
                            if container["name"] in pod_info_cont:
                                if "requests" in pod_info_cont[container[
                                        "name"]] and "alpha.gpu/numgpu" in pod_info_cont[
                                            container["name"]]["requests"]:
                                    containerGPUs = max(
                                        int(pod_info_cont[container["name"]]
                                            ["requests"]["alpha.gpu/numgpu"]),
                                        containerGPUs)
                            gpus += containerGPUs
                            pod_name += " (gpu #:" + str(containerGPUs) + ")"
                    # NOTE(review): nesting reconstructed from a collapsed
                    # source; the per-user accounting below needs node_name
                    # to be a known node, so it is placed inside this check.
                    if node_name in nodes_status:
                        nodes_status[node_name]["gpu_used"] = ResourceInfo(
                            nodes_status[node_name]["gpu_used"]).Add(
                                ResourceInfo(
                                    {nodes_status[node_name]["gpuType"]:
                                     gpus})).ToSerializable()
                        nodes_status[node_name]["pods"].append(pod_name)
                        if username is not None:
                            if username not in user_status:
                                user_status[username] = ResourceInfo(
                                    {nodes_status[node_name]["gpuType"]: gpus})
                            else:
                                user_status[username].Add(
                                    ResourceInfo({
                                        nodes_status[node_name]["gpuType"]:
                                        gpus
                                    }))
        # Cluster-wide totals; presumably ResourceInfo.Add accumulates in
        # place, since its return value is discarded here -- TODO confirm.
        gpu_avaliable = ResourceInfo()
        gpu_reserved = ResourceInfo()
        gpu_capacity = ResourceInfo()
        gpu_unschedulable = ResourceInfo()
        gpu_schedulable = ResourceInfo()
        gpu_used = ResourceInfo()
        for node_name, node_status in nodes_status.iteritems():
            if node_status["unschedulable"]:
                gpu_unschedulable.Add(ResourceInfo(
                    node_status["gpu_capacity"]))
                gpu_reserved.Add(
                    ResourceInfo.Difference(
                        ResourceInfo(node_status["gpu_capacity"]),
                        ResourceInfo(node_status["gpu_used"])))
            else:
                gpu_avaliable.Add(
                    ResourceInfo.Difference(
                        ResourceInfo(node_status["gpu_allocatable"]),
                        ResourceInfo(node_status["gpu_used"])))
                gpu_schedulable.Add(ResourceInfo(node_status["gpu_capacity"]))
                gpu_unschedulable.Add(
                    ResourceInfo.Difference(
                        ResourceInfo(node_status["gpu_capacity"]),
                        ResourceInfo(node_status["gpu_allocatable"])))
                gpu_reserved.Add(
                    ResourceInfo.Difference(
                        ResourceInfo(node_status["gpu_capacity"]),
                        ResourceInfo(node_status["gpu_allocatable"])))
            gpu_used.Add(ResourceInfo(node_status["gpu_used"]))
            gpu_capacity.Add(ResourceInfo(node_status["gpu_capacity"]))
        cluster_status["user_status"] = []
        for user_name, user_gpu in user_status.iteritems():
            cluster_status["user_status"].append({
                "userName": user_name,
                "userGPU": user_gpu.ToSerializable()
            })
        # gpu_schedulable is accumulated above but not reported.
        cluster_status["gpu_avaliable"] = gpu_avaliable.ToSerializable()
        cluster_status["gpu_capacity"] = gpu_capacity.ToSerializable()
        cluster_status["gpu_unschedulable"] = gpu_unschedulable.ToSerializable(
        )
        cluster_status["gpu_used"] = gpu_used.ToSerializable()
        cluster_status["gpu_reserved"] = gpu_reserved.ToSerializable()
        cluster_status["node_status"] = [
            node_status for node_name, node_status in nodes_status.iteritems()
        ]
    except Exception as e:
        logging.exception("get cluster status")
    dataHandler = DataHandler()
    cluster_status["AvaliableJobNum"] = dataHandler.GetActiveJobsCount()
    if "cluster_status" in config and check_cluster_status_change(
            config["cluster_status"], cluster_status):
        logging.info("updating the cluster status...")
        dataHandler.UpdateClusterStatus(cluster_status)
    else:
        logging.info(
            "nothing changed in cluster, skipping the cluster status update..."
        )
    # Cache a private copy so later comparisons see this snapshot.
    config["cluster_status"] = copy.deepcopy(cluster_status)
    dataHandler.Close()
    return cluster_status
def launch_ps_dist_job(jobParams):
    """Wire up ssh between the pods of a distributed job and mark it running.

    Returns early (None) unless the pods labelled ``run=<jobId>`` number
    exactly numpsworker + numps and all report "Running". Otherwise it
    starts sshd in every pod, writes an ssh client config and a hostfile
    into each pod, touches /opt/run_dist_job in each pod and sets the job's
    "jobStatus" to "running".

    Args:
        jobParams: mapping read for "jobId", "numpsworker", "numps",
            "userName" and optionally "hostNetwork".
    """
    job_id = jobParams["jobId"]
    pods = k8sUtils.GetPod("run=" + job_id)
    # if any pod is not up, return
    if "items" not in pods or len(pods["items"]) != (
            int(jobParams["numpsworker"]) + int(jobParams["numps"])):
        return
    # if any pod is not ready, return
    pod_status = [k8sUtils.check_pod_status(pod) for pod in pods["items"]]
    if any([status != "Running" for status in pod_status]):
        return
    user_name = getAlias(jobParams["userName"])
    if "hostNetwork" in jobParams and jobParams["hostNetwork"]:
        host_network = True
    else:
        host_network = False
    # setup ssh server
    for [idx, pod] in enumerate(pods["items"]):
        pod_name = pod["metadata"]["name"]
        dist_port = pod["metadata"]["labels"]["distPort"]
        # quit if can't setup ssh server
        # (start_ssh_server raises on failure; the returned port is unused)
        ssh_port = start_ssh_server(pod_name, user_name, host_network,
                                    dist_port)
    # generate ssh config
    # NOTE(review): this template appears on a single line in the source under
    # review; ssh client config expects one keyword per line -- confirm the
    # literal's line breaks against the un-collapsed file.
    ssh_config = """ Host %s HostName %s Port %s User %s StrictHostKeyChecking no UserKnownHostsFile /dev/null """
    sshconfigstr = ""
    for [idx, pod] in enumerate(pods["items"]):
        pod_ip = pod["status"]["podIP"]
        dist_port = pod["metadata"]["labels"]["distPort"]
        role = pod["metadata"]["labels"]["distRole"]
        role_idx = pod["metadata"]["labels"]["distRoleIdx"]
        # TODO hostNetwork
        # With host networking sshd listens on the pod's distPort, otherwise
        # on 22.
        if host_network:
            sshconfigstr += (
                ssh_config %
                (role + "-" + str(role_idx), pod_ip, str(dist_port), user_name)
                + "\n")
        else:
            sshconfigstr += (
                ssh_config %
                (role + "-" + str(role_idx), pod_ip, 22, user_name) + "\n")
    # config ssh client
    for [idx, pod] in enumerate(pods["items"]):
        pod_name = pod["metadata"]["name"]
        bash_script = "cat > /home/" + user_name + "/.ssh/config <<EOF " + sshconfigstr + "\nEOF"
        print("override ssh client config: %s" % bash_script)
        # NOTE(review): whether the commands after ';' run inside the pod or
        # where kubectl itself runs depends on how kubectl_exec invokes the
        # command line -- confirm.
        k8sUtils.kubectl_exec(
            "exec %s -- bash -c \'%s\' ; chown -R %s /home/%s/.ssh/config" %
            (pod_name, bash_script, user_name, user_name))
        # fix ~/.ssh/ folder permission
        k8sUtils.kubectl_exec(
            "exec %s -- chmod 600 -R /home/%s/.ssh; chmod 700 /home/%s/.ssh; chown -R %s /home/%s/.ssh/config"
            % (pod_name, user_name, user_name, user_name, user_name))
    # generate hostfile
    # One "worker-<idx> slots=<gpus>" line per non-ps pod.
    hostfilecontent = ""
    for [_, pod] in enumerate(pods["items"]):
        role = pod["metadata"]["labels"]["distRole"]
        if role == "ps":
            continue
        role_idx = pod["metadata"]["labels"]["distRoleIdx"]
        worker_gpu_num = pod["spec"]["containers"][0]["resources"]["requests"][
            "nvidia.com/gpu"]
        hostfilecontent += "%s slots=%s\n" % ("worker-" + str(role_idx),
                                              worker_gpu_num)
    # The hostfile is staged locally, then copied into each pod.
    tmp_hostfile = "/tmp/" + job_id + ".hostfile"
    with open(tmp_hostfile, 'w') as f:
        f.write(hostfilecontent + "\n")
    # write the hostfile
    for [idx, pod] in enumerate(pods["items"]):
        pod_name = pod["metadata"]["name"]
        remotecmd = "cp %s %s:/job/hostfile" % (tmp_hostfile, pod_name)
        k8sUtils.kubectl_exec(remotecmd)
    for [idx, pod] in enumerate(pods["items"]):
        pod_name = pod["metadata"]["name"]
        k8sUtils.kubectl_exec("exec %s touch /opt/run_dist_job" % pod_name)
    # execute user command
    #k8sUtils.kubectl_exec("exec %s -- bash -c 'runuser -l ${DLWS_USER_NAME} <<EOF_USER_SCRIPT %s \nEOF_USER_SCRIPT'" % (pod_name, jobParams["cmd"]))
    # update job status
    dataHandler = DataHandler()
    dataHandler.UpdateJobTextField(job_id, "jobStatus", "running")
    dataHandler.Close()
def get_k8s_endpoint(endpoint_description_path):
    """Return ``kubectl get -o json -f`` output for an endpoint description.

    Args:
        endpoint_description_path: path of the description file, joined onto
            ``config["storage-mount-path"]``.
    """
    full_path = os.path.join(config["storage-mount-path"],
                             endpoint_description_path)
    kubectl_args = "get -o json -f %s" % full_path
    return k8sUtils.kubectl_exec(kubectl_args)
def _deploy_dist_script(jobParams, pod_name, role, task_index, ps_hosts,
                        worker_hosts):
    """Write one pod's run_dist_job.sh locally, copy it in and trigger it."""
    work_dir = "/tmp/" + str(uuid.uuid4())
    # The same /tmp/<uuid> path is the tee target of the generated command.
    command = (
        "%s --ps_hosts=%s --worker_hosts=%s --job_name=%s --task_index=%d 2>&1 | tee %s"
        % (jobParams["cmd"], ps_hosts, worker_hosts, role, task_index,
           work_dir))

    os.system("mkdir -p %s" % work_dir)
    script_path = os.path.join(work_dir, "run_dist_job.sh")
    with open(script_path, 'w') as f:
        f.write(command + "\n")
    if "userId" in jobParams:
        os.system("chown -R %s %s" % (jobParams["userId"], script_path))

    k8sUtils.kubectl_exec("cp %s %s:/opt/run_dist_job.sh" %
                          (script_path, pod_name))
    # Touching this file is done last, after the script is in place.
    k8sUtils.kubectl_exec("exec %s touch /opt/run_dist_job" % pod_name)


def launch_ps_dist_job(jobParams):
    """Start a ps/worker distributed job once all of its pods are running.

    Does nothing unless the number of worker and ps pods matches
    "numpsworker" and "numps" and every pod reports "Running". Otherwise a
    run_dist_job.sh carrying the ps/worker host lists is pushed into each
    pod and the job's "jobStatus" is set to "running".

    Args:
        jobParams: mapping read for "jobId", "numpsworker", "numps", "cmd"
            and optionally "userId".
    """
    jobId = jobParams["jobId"]
    workerPodInfo = k8sUtils.GetPod("distRole=worker,run=" + jobId)
    psPodInfo = k8sUtils.GetPod("distRole=ps,run=" + jobId)

    # Wait until every expected pod exists...
    if not ("items" in workerPodInfo and
            len(workerPodInfo["items"]) == int(jobParams["numpsworker"]) and
            "items" in psPodInfo and
            len(psPodInfo["items"]) == int(jobParams["numps"])):
        return

    # ...and is running.
    worker_pods = workerPodInfo["items"]
    ps_pods = psPodInfo["items"]
    podStatus = [
        k8sUtils.check_pod_status(pod) for pod in worker_pods + ps_pods
    ]
    if not all(status == "Running" for status in podStatus):
        return

    ps_hosts = ",".join(
        "%s:%s" % (pod["status"]["podIP"],
                   int(pod["metadata"]["labels"]["distPort"]))
        for pod in ps_pods)
    worker_hosts = ",".join(
        "%s:%s" % (pod["status"]["podIP"],
                   int(pod["metadata"]["labels"]["distPort"]))
        for pod in worker_pods)

    for i, pod in enumerate(ps_pods):
        _deploy_dist_script(jobParams, pod["metadata"]["name"], "ps", i,
                            ps_hosts, worker_hosts)
    for i, pod in enumerate(worker_pods):
        _deploy_dist_script(jobParams, pod["metadata"]["name"], "worker", i,
                            ps_hosts, worker_hosts)

    dataHandler = DataHandler()
    try:
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "running")
    finally:
        # The handler was previously never closed on this path.
        dataHandler.Close()
def SubmitRegularJob(job):
    """Render a regular job's Kubernetes description and submit it.

    The base64/JSON "jobParams" of ``job`` are decoded, storage paths and
    mount points are resolved, one pod description per pod (several for a
    hyper-parameter sweep) plus optional interactive services are rendered
    from templates, written to the job description file and created with
    kubectl. When a non-empty string "cmd" is given, the launch script is
    copied into the pod and executed after a fixed wait.

    Args:
        job: mapping whose "jobParams" entry holds the base64-encoded JSON
            job parameters.

    Returns:
        False when "jobPath" or "workPath" is missing or blank; otherwise a
        dict with "output" and "jobId" on success, or "error" when an
        exception occurred.
    """
    ret = {}
    jobParams = None
    dataHandler = DataHandler()
    logging.info("start to submit regular job...")

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]

        if "jobPath" not in jobParams or len(
                jobParams["jobPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],
                                    "ERROR: job-path does not exist")
            msg = "ERROR: job-path does not exist. jobid: %s" % (
                jobParams["jobId"])
            logging.error(msg)
            return False

        if "workPath" not in jobParams or len(
                jobParams["workPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],
                                    "ERROR: work-path does not exist")
            msg = "ERROR: work-path does not exist. jobid: %s" % (
                jobParams["jobId"])
            logging.error(msg)
            return False

        # NOTE: "dataPath" is not validated; a missing key raises KeyError
        # here and is reported through the exception handler below.
        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])

        localJobPath = os.path.join(config["storage-mount-path"], jobPath)
        if not os.path.exists(localJobPath):
            # Create the job directory as the job's user, or as "0" when no
            # user id is given.
            owner = jobParams["userId"] if "userId" in jobParams else "0"
            mkdirsAsUser(localJobPath, owner)
            mkdirsAsUser(os.path.join(localJobPath, "models"), owner)

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""

        has_launch_cmd = (isinstance(jobParams["cmd"], basestring) and
                          not jobParams["cmd"] == "")
        if has_launch_cmd:
            launchScriptPath = os.path.join(
                localJobPath, "launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            msg = "write cmd(%s) to file: %s" % (jobParams["cmd"],
                                                 launchScriptPath)
            logging.info(msg)
            if "userId" in jobParams:
                cmd = "chown -R %s %s" % (jobParams["userId"],
                                          launchScriptPath)
                os.system(cmd)
                logging.info(cmd)
            # TODO: the pod runs the shell script after it starts, so the
            # script has to be copied beforehand into a directory on the node
            # hosting the pod, e.g.
            # /dlwsdata/work/user-nanme/jobs/191225/6f81459e-42ea-447e-9380-f545da2517e9/
            # After the pod starts, that directory is mounted at /job/.
            # Until then the container just sleeps and the script is pushed
            # in and executed below.
            jobParams[
                "LaunchCMD"] = "[\"/bin/sh\", \"-ec\", \"sleep 6000315360000\"]"

        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"

        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"]
                                            if e.isalnum())

        ENV = Environment(loader=FileSystemLoader("/"))

        jobTempDir = os.path.join(config["root-path"], "Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"],
                                                jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"],
                                                 workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"],
                                                 dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath

        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace(
                "/", "").replace(".", "").replace("_", "-")

        # Standard job/work/data mounts, each added only when
        # CheckMountPoints accepts it.
        for mount_name, host_key in (("job", "hostjobPath"),
                                     ("work", "hostworkPath"),
                                     ("data", "hostdataPath")):
            mp = {
                "name": mount_name,
                "containerPath": "/" + mount_name,
                "hostPath": jobParams[host_key],
                "enabled": True
            }
            if CheckMountPoints(jobParams["mountpoints"], mp):
                jobParams["mountpoints"].append(mp)

        # The user's ssh keys are always mounted read-only (no
        # CheckMountPoints test for this one).
        userAlias = getAlias(jobParams["userName"])
        mp = {
            "name": "sshkey",
            "containerPath": "/home/%s/.ssh" % userAlias,
            "hostPath": os.path.join(config["storage-mount-path"],
                                     GetWorkPath(userAlias) + "/.ssh"),
            "readOnly": True,
            "enabled": True
        }
        jobParams["mountpoints"].append(mp)

        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        msg = ("Render Job: %s" % jobParams)
        print(msg)
        logging.info(msg)

        jobDescriptionList = []

        # One pod per hyper-parameter value when a sweep is fully specified,
        # otherwise a single pod named after the job.
        pods = []
        if "hyperparametername" in jobParams and "hyperparameterstartvalue" in jobParams and "hyperparameterendvalue" in jobParams and "hyperparameterstep" in jobParams:
            i = int(jobParams["hyperparameterstartvalue"])
            end = int(jobParams["hyperparameterendvalue"])
            step = int(jobParams["hyperparameterstep"])
            c = 0
            while (i <= end):
                pod = {}
                pod["podName"] = jobParams["jobId"] + "-pod-" + str(c)
                pod["envs"] = [{
                    "name": jobParams["hyperparametername"],
                    "value": i
                }]
                i += step
                c += 1
                pods.append(pod)
        else:
            pod = {}
            pod["podName"] = jobParams["jobId"]
            pod["envs"] = []
            pods.append(pod)

        if "env" not in jobParams:
            jobParams["env"] = []
        jobParams["commonenv"] = copy.copy(jobParams["env"])

        for pod in pods:
            jobParams["podName"] = pod["podName"]
            jobParams["env"] = jobParams["commonenv"] + pod["envs"]

            if "kube_custom_scheduler" in config and config[
                    "kube_custom_scheduler"]:
                container = {}
                container["requests"] = {
                    "alpha.gpu/numgpu": int(jobParams["resourcegpu"])
                }
                podInfo = {}
                podInfo["podname"] = jobParams["podName"]
                if "useGPUTopology" in jobParams and jobParams[
                        "useGPUTopology"]:
                    podInfo["requests"] = {
                        "alpha.gpu/gpu-generate-topology": 1
                    }
                else:
                    # for cases when desired topology is explicitly given or
                    # not desired
                    podInfo["requests"] = {
                        "alpha.gpu/gpu-generate-topology": 0
                    }
                podInfo["runningcontainer"] = {jobParams["podName"]: container}

                if "annotations" not in jobParams:
                    jobParams["annotations"] = {}
                jobParams["annotations"][
                    "pod.alpha/DeviceInformation"] = "'" + json.dumps(
                        podInfo) + "'"
                # gpu requests specified through annotation
                jobParams["resourcegpu"] = 0

            template = ENV.get_template(os.path.abspath(jobTemp))
            job_description = template.render(job=jobParams)
            jobDescriptionList.append(job_description)

            if ("interactivePort" in jobParams and
                    len(jobParams["interactivePort"].strip()) > 0):
                ports = [
                    p.strip()
                    for p in re.split(",|;", jobParams["interactivePort"])
                    if len(p.strip()) > 0 and p.strip().isdigit()
                ]
                for portNum in ports:
                    jobParams["serviceId"] = "interactive-" + jobParams[
                        "podName"] + "-" + portNum
                    jobParams["port"] = portNum
                    jobParams["port-name"] = "interactive"
                    jobParams["port-type"] = "TCP"

                    # One lookup is enough; the template was previously
                    # passed through get_template a second time.
                    stemplate = ENV.get_template(
                        os.path.join(jobTempDir, "KubeSvc.yaml.template"))
                    interactiveMeta = stemplate.render(svc=jobParams)
                    jobDescriptionList.append(interactiveMeta)

        jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        description_dir = os.path.dirname(os.path.realpath(jobDescriptionPath))
        if not os.path.exists(description_dir):
            os.makedirs(description_dir)
        # A description left over from an earlier attempt is deleted from
        # the cluster before being overwritten.
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
            logging.info("kubectl delete " + jobDescriptionPath + " output: " +
                         str(output))

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)
        logging.info("kubectl create " + jobDescriptionPath + " output: " +
                     str(output))

        msg = "Submitted job %s to k8s, returned with status %s" % (
            jobParams["jobId"], output)
        logging.info(msg)

        msg = "JobParams: \n" + json.dumps(jobParams)
        logging.info(msg)

        # The launch command is non-empty.
        if has_launch_cmd:
            # Wait for the container to finish starting before copying files.
            time.sleep(15)
            launch_file_name = "launch-%s.sh" % jobParams["jobId"]

            # Copy the script into podName:/tmp/ -- /job/ needs root
            # permission, so it cannot be copied there directly.
            remotecmd = "cp %s %s:%s" % (launchScriptPath,
                                         jobParams["podName"], "/tmp/")
            output = k8sUtils.kubectl_exec(remotecmd)
            logging.info("remotecmd[" + remotecmd + "]" + " output[" +
                         str(output) + "]")

            # NOTE(review): the copy targets jobParams["podName"] while the
            # two exec calls target jobParams["jobId"]; these differ for
            # hyper-parameter sweeps -- confirm which is intended.
            # Add execute permission to /tmp/launch-<jobId>.sh.
            remotecmd = "exec %s -- bash -c \"chmod 777 /tmp/%s\"" % (
                jobParams["jobId"], launch_file_name)
            output = k8sUtils.kubectl_exec(remotecmd)
            logging.info("remotecmd[" + remotecmd + "]" + " output[" +
                         str(output) + "]")

            # Execute /tmp/launch-<jobId>.sh.
            remotecmd = "exec %s -- bash -c \"/tmp/%s\"" % (jobParams["jobId"],
                                                            launch_file_name)
            output = k8sUtils.kubectl_exec(remotecmd)
            logging.info("remotecmd[" + remotecmd + "]" + " output[" +
                         str(output) + "]")

        ret["output"] = output
        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)

        msg = "update job text field %s, returned with status" % (
            jobParams["jobId"])
        logging.info(msg)
    except Exception as e:
        logging.exception("failed to submit regular job")
        ret["error"] = str(e)
        # jobParams is still None (or lacks "jobId") when decoding failed;
        # retry bookkeeping is only possible once the job id is known.
        job_id = None
        if isinstance(jobParams, dict):
            job_id = jobParams.get("jobId")
        if job_id is not None:
            retries = dataHandler.AddandGetJobRetries(job_id)
            if retries >= 5:
                dataHandler.UpdateJobTextField(job_id, "jobStatus", "error")
                dataHandler.UpdateJobTextField(job_id, "errorMsg",
                                               "Cannot submit job!" + str(e))
    finally:
        # Covers the early "return False" paths as well.
        dataHandler.Close()

    return ret