def get_cluster_status():
    """Collect per-node and per-user GPU usage and persist it when it changed.

    Runs ``kubectl get nodes`` and ``kubectl get pods``, builds a status
    record for every node, attributes each pod's GPU requests to its node and
    to the user named in the pod's ``userName`` label, and sums the cluster
    totals. Job counts from ``DataHandler`` are then added, the result is
    written through ``DataHandler.UpdateClusterStatus`` if
    ``check_cluster_status_change`` reports a difference from the copy cached
    in ``config["cluster_status"]``, and the cache is refreshed.

    Collection is best-effort: any exception raised while querying or parsing
    is logged and the function continues with whatever was gathered so far.

    NOTE(review): this file contains a second ``get_cluster_status``
    definition further down; the later definition is the one bound to the
    name at import time.

    Returns:
        dict: the cluster status. On success it contains ``user_status``,
        ``gpu_avaliable``, ``gpu_capacity``, ``gpu_unschedulable``,
        ``gpu_used``, ``gpu_reserved`` and ``node_status``; it always
        contains ``AvaliableJobNum`` and ``TotalJobNum``.
    """
    cluster_status = {}
    gpuStr = "alpha.kubernetes.io/nvidia-gpu"
    try:
        output = k8sUtils.kubectl_exec(" get nodes -o yaml")
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; consider yaml.safe_load if this output can ever
        # be influenced by an untrusted party.
        nodeInfo = yaml.load(output)
        nodes_status = {}
        user_status = {}

        for node in nodeInfo.get("items", []):
            node_state = node["status"]
            node_status = {
                "name": node["metadata"]["name"],
                "labels": node["metadata"]["labels"],
                "gpu_allocatable": int(node_state["allocatable"].get(gpuStr, 0)),
                "gpu_capacity": int(node_state["capacity"].get(gpuStr, 0)),
                "gpu_used": 0,
                "InternalIP": "unknown",
                "pods": [],
            }

            for addr in node_state.get("addresses") or []:
                if addr["type"] == "InternalIP":
                    node_status["InternalIP"] = addr["address"]

            # Every label whose value is "active" names a service scheduled
            # on this node, except the catch-all "all"/"default" labels.
            node_status["scheduled_service"] = [
                label
                for label, value in node_status["labels"].items()
                if value == "active" and label not in ("all", "default")
            ]

            node_status["unschedulable"] = bool(
                node["spec"].get("unschedulable"))
            # Iterate the conditions list itself (iterating node["status"]
            # would only walk the dict's keys, so the check could never
            # match). A node whose Ready condition is "Unknown" is treated
            # as unschedulable.
            for condi in node_state.get("conditions") or []:
                if (condi.get("type") == "Ready"
                        and condi.get("status") == "Unknown"):
                    node_status["unschedulable"] = True

            nodes_status[node_status["name"]] = node_status

        output = k8sUtils.kubectl_exec(" get pods -o yaml")
        podsInfo = yaml.load(output)
        for pod in podsInfo.get("items", []):
            gpus = 0
            pod_labels = pod.get("metadata", {}).get("labels") or {}
            username = pod_labels.get("userName")

            # Only pods that were assigned to a node contribute GPU usage.
            if "spec" in pod and "nodeName" in pod["spec"]:
                node_name = pod["spec"]["nodeName"]
                pod_name = pod["metadata"]["name"]
                if username is not None:
                    pod_name += " : " + username
                gpuUsage = get_job_gpu_usage(pod["metadata"]["name"])
                if gpuUsage is not None:
                    pod_name += " (gpu usage:" + str(gpuUsage) + "%)"
                    # Flag pods whose reported GPU usage is 25% or less.
                    if gpuUsage <= 25:
                        pod_name += "!!!!!!"

                for container in pod["spec"].get("containers") or []:
                    requests = ((container.get("resources") or {})
                                .get("requests") or {})
                    if gpuStr in requests:
                        gpus += int(requests[gpuStr])
                        # str() because YAML may parse the quantity as an int.
                        pod_name += " (gpu #:" + str(requests[gpuStr]) + ")"

                if node_name in nodes_status:
                    nodes_status[node_name]["gpu_used"] += gpus
                    nodes_status[node_name]["pods"].append(pod_name)

            if username is not None:
                user_status[username] = user_status.get(username, 0) + gpus

        gpu_avaliable = 0
        gpu_reserved = 0
        gpu_capacity = 0
        gpu_unschedulable = 0
        gpu_used = 0
        for node_status in nodes_status.values():
            capacity = node_status["gpu_capacity"]
            allocatable = node_status["gpu_allocatable"]
            if node_status["unschedulable"]:
                gpu_unschedulable += capacity
            else:
                gpu_avaliable += allocatable - node_status["gpu_used"]
                gpu_unschedulable += capacity - allocatable
            # NOTE(review): the original source had lost its indentation;
            # gpu_reserved is accumulated for every node here. Confirm it was
            # not meant to apply to schedulable nodes only.
            gpu_reserved += capacity - allocatable
            gpu_used += node_status["gpu_used"]
            gpu_capacity += capacity

        cluster_status["user_status"] = [
            {"userName": user_name, "userGPU": user_gpu}
            for user_name, user_gpu in user_status.items()
        ]
        cluster_status["gpu_avaliable"] = gpu_avaliable
        cluster_status["gpu_capacity"] = gpu_capacity
        cluster_status["gpu_unschedulable"] = gpu_unschedulable
        cluster_status["gpu_used"] = gpu_used
        cluster_status["gpu_reserved"] = gpu_reserved
        cluster_status["node_status"] = list(nodes_status.values())
    except Exception:
        # Best-effort: keep going with a partial status, but record the
        # traceback instead of printing only the message.
        logging.exception("get cluster status")

    dataHandler = DataHandler()
    try:
        cluster_status["AvaliableJobNum"] = dataHandler.GetActiveJobsCount()
        cluster_status["TotalJobNum"] = dataHandler.GetALLJobsCount()
        # The status is only written when a previous status is cached AND it
        # differs; the very first call therefore only primes the cache.
        if "cluster_status" in config and check_cluster_status_change(
                config["cluster_status"], cluster_status):
            logging.info("updating the cluster status...")
            dataHandler.UpdateClusterStatus(cluster_status)
        else:
            logging.info(
                "nothing changed in cluster, skipping the cluster status update..."
            )
        config["cluster_status"] = copy.deepcopy(cluster_status)
    finally:
        # Release the handler even if one of the calls above raises.
        dataHandler.Close()
    return cluster_status
def get_cluster_status():
    """Collect per-node and per-user GPU usage and persist it when it changed.

    Runs ``kubectl get nodes`` and ``kubectl get pods``, builds a status
    record for every node (GPU counts are kept as serialized ``ResourceInfo``
    values keyed by the node's ``gpuType`` label), attributes each pod's GPU
    requests to its node and to the user named in the pod's ``userName``
    label, and sums the cluster totals. The active job count from
    ``DataHandler`` is then added, the result is written through
    ``DataHandler.UpdateClusterStatus`` if ``check_cluster_status_change``
    reports a difference from the copy cached in ``config["cluster_status"]``,
    and the cache is refreshed.

    Collection is best-effort: any exception raised while querying or parsing
    is logged and the function continues with whatever was gathered so far.

    Returns:
        dict: the cluster status. On success it contains ``user_status``,
        ``gpu_avaliable``, ``gpu_capacity``, ``gpu_unschedulable``,
        ``gpu_used``, ``gpu_reserved`` and ``node_status``; it always
        contains ``AvaliableJobNum``.
    """
    cluster_status = {}
    gpuStr = "nvidia.com/gpu"
    try:
        output = k8sUtils.kubectl_exec(" get nodes -o yaml")
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; consider yaml.safe_load if this output can ever
        # be influenced by an untrusted party.
        nodeInfo = yaml.load(output)
        nodes_status = {}
        user_status = {}

        for node in nodeInfo.get("items", []):
            node_state = node["status"]
            node_status = {
                "name": node["metadata"]["name"],
                "labels": node["metadata"]["labels"],
                "gpuType": "",
                "scheduled_service": [],
            }
            for label, value in node_status["labels"].items():
                if value == "active" and label not in ("all", "default"):
                    node_status["scheduled_service"].append(label)
                if label == "gpuType":
                    # The GPU type is also listed as a scheduled service.
                    node_status["scheduled_service"].append(value)
                    node_status["gpuType"] = value
            gpu_type = node_status["gpuType"]

            if gpuStr in node_state["allocatable"]:
                node_status["gpu_allocatable"] = ResourceInfo({
                    gpu_type: int(node_state["allocatable"][gpuStr])
                }).ToSerializable()
            else:
                node_status["gpu_allocatable"] = ResourceInfo().ToSerializable()

            if gpuStr in node_state["capacity"]:
                node_status["gpu_capacity"] = ResourceInfo({
                    gpu_type: int(node_state["capacity"][gpuStr])
                }).ToSerializable()
            else:
                node_status["gpu_capacity"] = ResourceInfo().ToSerializable()

            node_status["gpu_used"] = ResourceInfo().ToSerializable()
            node_status["InternalIP"] = "unknown"
            node_status["pods"] = []

            # The device-information annotation may report more GPUs than
            # the node's status does; the larger figure wins.
            node_annotations = node["metadata"].get("annotations") or {}
            if "node.alpha/DeviceInformation" in node_annotations:
                node_info = json.loads(
                    node_annotations["node.alpha/DeviceInformation"])
                reported_capacity = int(
                    node_info["capacity"]["alpha.gpu/numgpu"])
                if reported_capacity > ResourceInfo(
                        node_status["gpu_capacity"]).TotalCount():
                    node_status["gpu_capacity"] = ResourceInfo({
                        gpu_type: reported_capacity
                    }).ToSerializable()
                reported_allocatable = int(
                    node_info["allocatable"]["alpha.gpu/numgpu"])
                if reported_allocatable > ResourceInfo(
                        node_status["gpu_allocatable"]).TotalCount():
                    node_status["gpu_allocatable"] = ResourceInfo({
                        gpu_type: reported_allocatable
                    }).ToSerializable()

            for addr in node_state.get("addresses") or []:
                if addr["type"] == "InternalIP":
                    node_status["InternalIP"] = addr["address"]

            node_status["unschedulable"] = bool(
                node["spec"].get("unschedulable"))
            # A node whose Ready condition is "Unknown" is treated as
            # unschedulable as well.
            for condi in node_state.get("conditions") or []:
                if (condi.get("type") == "Ready"
                        and condi.get("status") == "Unknown"):
                    node_status["unschedulable"] = True

            nodes_status[node_status["name"]] = node_status

        output = k8sUtils.kubectl_exec(" get pods -o yaml")
        podsInfo = yaml.load(output)
        for pod in podsInfo.get("items", []):
            gpus = 0
            # Reset per pod so an unscheduled pod can never pick up the node
            # of the pod processed before it.
            node_name = None
            pod_labels = pod.get("metadata", {}).get("labels") or {}
            username = pod_labels.get("userName")

            # Only pods that were assigned to a node contribute GPU usage.
            if "spec" in pod and "nodeName" in pod["spec"]:
                node_name = pod["spec"]["nodeName"]
                pod_name = pod["metadata"]["name"]
                if username is not None:
                    pod_name += " : " + username
                gpuUsage = get_job_gpu_usage(pod["metadata"]["name"])
                if gpuUsage is not None:
                    pod_name += " (gpu usage:" + str(gpuUsage) + "%)"
                    # Flag pods whose reported GPU usage is 25% or less.
                    if gpuUsage <= 25:
                        pod_name += "!!!!!!"

                # Per-container device information from the pod annotation,
                # keyed by container name.
                pod_info_cont = {}
                pod_annotations = pod["metadata"].get("annotations") or {}
                if "pod.alpha/DeviceInformation" in pod_annotations:
                    pod_info = json.loads(
                        pod_annotations["pod.alpha/DeviceInformation"])
                    pod_info_cont = pod_info.get("runningcontainer", {})

                for container in pod["spec"].get("containers") or []:
                    containerGPUs = 0
                    requests = ((container.get("resources") or {})
                                .get("requests") or {})
                    if gpuStr in requests:
                        containerGPUs = int(requests[gpuStr])
                    device_requests = pod_info_cont.get(
                        container["name"], {}).get("requests", {})
                    if "alpha.gpu/numgpu" in device_requests:
                        # Take the larger of the spec request and the
                        # annotation's figure.
                        containerGPUs = max(
                            int(device_requests["alpha.gpu/numgpu"]),
                            containerGPUs)
                    gpus += containerGPUs
                    pod_name += " (gpu #:" + str(containerGPUs) + ")"

                if node_name in nodes_status:
                    nodes_status[node_name]["gpu_used"] = ResourceInfo(
                        nodes_status[node_name]["gpu_used"]).Add(
                            ResourceInfo({
                                nodes_status[node_name]["gpuType"]: gpus
                            })).ToSerializable()
                    nodes_status[node_name]["pods"].append(pod_name)

            if username is not None:
                # Pods that are unscheduled or sit on a node we did not list
                # have no known GPU type; fall back to the same "" key used
                # for nodes without a gpuType label instead of failing.
                pod_gpu_type = ""
                if node_name in nodes_status:
                    pod_gpu_type = nodes_status[node_name]["gpuType"]
                pod_usage = ResourceInfo({pod_gpu_type: gpus})
                if username not in user_status:
                    user_status[username] = pod_usage
                else:
                    user_status[username].Add(pod_usage)

        gpu_avaliable = ResourceInfo()
        gpu_reserved = ResourceInfo()
        gpu_capacity = ResourceInfo()
        gpu_unschedulable = ResourceInfo()
        gpu_used = ResourceInfo()
        for node_status in nodes_status.values():
            if node_status["unschedulable"]:
                gpu_unschedulable.Add(
                    ResourceInfo(node_status["gpu_capacity"]))
                gpu_reserved.Add(
                    ResourceInfo.Difference(
                        ResourceInfo(node_status["gpu_capacity"]),
                        ResourceInfo(node_status["gpu_used"])))
            else:
                gpu_avaliable.Add(
                    ResourceInfo.Difference(
                        ResourceInfo(node_status["gpu_allocatable"]),
                        ResourceInfo(node_status["gpu_used"])))
                gpu_unschedulable.Add(
                    ResourceInfo.Difference(
                        ResourceInfo(node_status["gpu_capacity"]),
                        ResourceInfo(node_status["gpu_allocatable"])))
                gpu_reserved.Add(
                    ResourceInfo.Difference(
                        ResourceInfo(node_status["gpu_capacity"]),
                        ResourceInfo(node_status["gpu_allocatable"])))
            gpu_used.Add(ResourceInfo(node_status["gpu_used"]))
            gpu_capacity.Add(ResourceInfo(node_status["gpu_capacity"]))

        cluster_status["user_status"] = [
            {"userName": user_name, "userGPU": user_gpu.ToSerializable()}
            for user_name, user_gpu in user_status.items()
        ]
        cluster_status["gpu_avaliable"] = gpu_avaliable.ToSerializable()
        cluster_status["gpu_capacity"] = gpu_capacity.ToSerializable()
        cluster_status["gpu_unschedulable"] = gpu_unschedulable.ToSerializable()
        cluster_status["gpu_used"] = gpu_used.ToSerializable()
        cluster_status["gpu_reserved"] = gpu_reserved.ToSerializable()
        cluster_status["node_status"] = list(nodes_status.values())
    except Exception:
        # Best-effort: keep going with a partial status; the traceback is
        # recorded by logging.exception.
        logging.exception("get cluster status")

    dataHandler = DataHandler()
    try:
        cluster_status["AvaliableJobNum"] = dataHandler.GetActiveJobsCount()
        # The status is only written when a previous status is cached AND it
        # differs; the very first call therefore only primes the cache.
        if "cluster_status" in config and check_cluster_status_change(
                config["cluster_status"], cluster_status):
            logging.info("updating the cluster status...")
            dataHandler.UpdateClusterStatus(cluster_status)
        else:
            logging.info(
                "nothing changed in cluster, skipping the cluster status update..."
            )
        config["cluster_status"] = copy.deepcopy(cluster_status)
    finally:
        # Release the handler even if one of the calls above raises.
        dataHandler.Close()
    return cluster_status