def showNodeMetrics():
    """Render per-node CPU/memory usage percentages on the LCD.

    Fetches live usage from the metrics.k8s.io API and allocatable capacity
    from the core API, then writes "xx%" figures at fixed cursor positions,
    one LCD row per node (at most 3 rows).
    """
    # The metrics API is not wrapped by the typed client, so issue a raw
    # call and decode the JSON payload ourselves.
    api_client = client.ApiClient()
    raw_resp = api_client.call_api('/apis/metrics.k8s.io/v1beta1/nodes/',
                                   'GET', _preload_content=False)
    response_metrics = json.loads(raw_resp[0].data.decode('utf-8'))
    # Call list Nodes
    nodes = v1.list_node()
    row = 0
    for node in nodes.items:
        if row > 2:  # the LCD only has 3 usable rows
            break
        # Get name and cpu/mem capacity
        name = node.metadata.name
        mem_capacity = utils.parse_quantity(node.status.allocatable["memory"])
        cpu_capacity = utils.parse_quantity(node.status.allocatable["cpu"])
        # Search the node metrics we grabbed before, keyed on node name.
        # Use a default instead of a bare next(): a node without a metrics
        # sample yet would otherwise raise StopIteration and abort the whole
        # refresh; skipping just that node is the robust behavior.
        node_metrics = next(
            (n for n in response_metrics["items"]
             if n["metadata"]["name"] == name), None)
        if node_metrics is None:
            continue
        mem_usage = utils.parse_quantity(node_metrics["usage"]["memory"])
        cpu_usage = utils.parse_quantity(node_metrics["usage"]["cpu"])
        cpu_perc = round((cpu_usage / cpu_capacity) * 100)
        mem_perc = round((mem_usage / mem_capacity) * 100)
        lcd.set_cursor_position(7, row)
        lcd.write(f"{cpu_perc: 3}%")
        lcd.set_cursor_position(12, row)
        lcd.write(f"{mem_perc: 3}%")
        row = row + 1
def get_kubernetes_node_info_from_API():
    """Return a per-node allocatable-resource map from the Kubernetes API.

    Returns:
        dict mapping node name to
        {"cpu-resource": <vcores:int>, "mem-resource": <MiB:int>,
         "gpu-resource": <nvidia.com/gpu count:int>}.
        Empty dict when the node listing fails.
    """
    config.load_kube_config()
    api_instance = client.CoreV1Api()
    # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CoreV1Api.md#list_node
    pretty = 'true'
    timeout_seconds = 56
    ret = dict()
    try:
        api_response = api_instance.list_node(pretty=pretty,
                                              timeout_seconds=timeout_seconds)
        for node in api_response.items:
            allocatable = node.status.allocatable
            ret[node.metadata.name] = {
                "cpu-resource": int(parse_quantity(allocatable['cpu'])),
                # bytes -> MiB
                "mem-resource": int(
                    parse_quantity(allocatable['memory']) / 1024 / 1024),
                # CPU-only nodes do not expose the nvidia.com/gpu key at all;
                # default to 0 instead of raising KeyError, which the
                # ApiException handler below would NOT catch.
                "gpu-resource": int(
                    parse_quantity(allocatable.get('nvidia.com/gpu', 0))),
            }
    except ApiException as e:
        logger.error("Exception when calling CoreV1Api->list_node: %s\n" % e)
    return ret
def test_parse(self): self.assertIsInstance(parse_quantity(2.2), Decimal) # input, expected output tests = [ (0, 0), (2, 2), (2, Decimal("2")), (2., 2), (Decimal("2.2"), Decimal("2.2")), (2., Decimal(2)), (Decimal("2."), 2), ("123", 123), ("2", 2), ("2n", Decimal("2") * Decimal(1000)**-3), ("2u", Decimal("0.000002")), ("2m", Decimal("0.002")), ("0m", Decimal("0")), ("0M", Decimal("0")), ("223k", 223000), ("002M", 2 * 1000**2), ("2M", 2 * 1000**2), ("4123G", 4123 * 1000**3), ("2T", 2 * 1000**4), ("2P", 2 * 1000**5), ("2E", 2 * 1000**6), ("223Ki", 223 * 1024), ("002Mi", 2 * 1024**2), ("2Mi", 2 * 1024**2), ("2Gi", 2 * 1024**3), ("4123Gi", 4123 * 1024**3), ("2Ti", 2 * 1024**4), ("2Pi", 2 * 1024**5), ("2Ei", 2 * 1024**6), ("2.34n", Decimal("2.34") * Decimal(1000)**-3), ("2.34u", Decimal("2.34") * Decimal(1000)**-2), ("2.34m", Decimal("2.34") * Decimal(1000)**-1), ("2.34Ki", Decimal("2.34") * 1024), ("2.34", Decimal("2.34")), (".34", Decimal("0.34")), ("34.", 34), (".34M", Decimal("0.34") * 1000**2), ("2e2K", Decimal("2e2") * 1000), ("2e2Ki", Decimal("2e2") * 1024), ("2e-2Ki", Decimal("2e-2") * 1024), ("2.34E1", Decimal("2.34E1")), (".34e-2", Decimal("0.34e-2")), ] for inp, out in tests: self.assertEqual(parse_quantity(inp), out) if isinstance(inp, (int, float, Decimal)): self.assertEqual(parse_quantity(-1 * inp), -out) else: self.assertEqual(parse_quantity("-" + inp), -out) self.assertEqual(parse_quantity("+" + inp), out)
def get_pod_requests(pod):
    """Total the resource requests declared by every container in *pod*.

    Returns:
        {"cpu-resource": <cores>, "mem-resource": <MiB>}; containers with no
        requests section contribute nothing to either total.
    """
    totals = {
        "cpu-resource": 0,
        "mem-resource": 0,
    }
    for container in pod.spec.containers:
        requests = container.resources.requests
        if requests is None:
            continue
        totals["cpu-resource"] += parse_quantity(requests.get("cpu", 0))
        # memory quantities come back in bytes; convert to MiB
        totals["mem-resource"] += (
            parse_quantity(requests.get("memory", 0)) / 1024 / 1024)
    return totals
def main():
    """Generate the quick-start pre-check playbook from layout + config.

    Parses -l/-c/-o arguments, enriches each machine in the layout with
    memory/cpu/computing-device information from its SKU, groups hostnames
    by computing-device type, and renders pre-check.yml into the output dir.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--layout', dest="layout",
                        required=True, help="layout.yaml")
    parser.add_argument('-c', '--config', dest="config",
                        required=True, help="cluster configuration")
    # help text was a copy-paste of --config's ("cluster configuration")
    parser.add_argument('-o', '--output', dest="output",
                        required=True, help="output directory")
    args = parser.parse_args()

    output_path = os.path.expanduser(args.output)
    layout = load_yaml_config(args.layout)
    config = load_yaml_config(args.config)

    masters, workers = get_masters_workers_from_layout(layout)
    head_node = masters[0]

    # fill in cpu, memory, computing_device information in both masters and workers
    # we assume the layout file the user gives is correct
    all_machines = masters + workers
    for machine in all_machines:
        sku_info = layout['machine-sku'][machine['machine-type']]
        # use math.ceil to guarantee the memory volume
        # e.g. if use set 999.1MB, we ensure there is 1000MB to avoid scheduling issues
        machine['memory_mb'] = math.ceil(
            parse_quantity(sku_info['mem']) / 1024 / 1024)
        machine['cpu_vcores'] = sku_info['cpu']['vcore']
        if 'computing-device' in sku_info:
            machine['computing_device'] = sku_info['computing-device']

    # add machine to different computing device group
    computing_device_groups = defaultdict(list)
    for machine in all_machines:
        sku_info = layout['machine-sku'][machine['machine-type']]
        if 'computing-device' in sku_info:
            computing_device_groups[sku_info['computing-device']
                                    ['type']].append(machine['hostname'])

    environment = {
        'masters': masters,
        'workers': workers,
        'cfg': config,
        'head_node': head_node,
        'computing_device_groups': computing_device_groups,
    }
    map_table = {"env": environment}
    generate_template_file("quick-start/pre-check.yml.template",
                           "{0}/pre-check.yml".format(output_path), map_table)
def get_prophet_daemon_resource_request(cfg):
    """Sum the cpu/memory the prophet daemons will request on each node.

    Scans each daemon's deploy template for the YAML section guarded by the
    qos-switch Jinja conditional and totals the declared "requests", falling
    back to "limits" when no requests are given.

    Returns:
        {"cpu-resource": <cores>, "mem-resource": <MiB>}; zeros when
        qos-switch is absent or "false".
    """
    ret = {"cpu-resource": 0, "mem-resource": 0}
    if "qos-switch" not in cfg or cfg["qos-switch"] == "false":
        logger.info(
            "Ignore calculate prophet daemon resource usage since qos-switch set to false"
        )
        return ret

    prophet_daemon_services = ["node-exporter", "job-exporter", "log-manager"]
    prophet_source_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "../../../src")
    # Captures the template text between
    # {%- if cluster_cfg['cluster']['common']['qos-switch'] == "true" %}
    # and the matching {%- endif %}.
    start_match = r"{%-?\s*if\s*cluster_cfg\['cluster'\]\['common'\]\['qos-switch'\][^}]+%}"
    end_match = r"{%-?\s*endif\s*%}"
    regex = re.compile("{}(.*?){}".format(start_match, end_match),
                       flags=re.DOTALL)

    for prophet_daemon in prophet_daemon_services:
        deploy_template_path = os.path.join(
            prophet_source_path,
            "{0}/deploy/{0}.yaml.template".format(prophet_daemon))
        if not os.path.exists(deploy_template_path):
            # daemon not deployed in this source tree; nothing to count
            continue
        template = read_template(deploy_template_path)
        match = regex.search(template)
        if not match:
            logger.warning("Could not find resource request for service %s",
                           prophet_daemon)
            continue
        resources = yaml.load(match.group(1), yaml.SafeLoader)["resources"]
        # Prefer explicit requests; fall back to limits (previously two
        # duplicated branches with identical accumulation logic).
        spec = resources.get("requests", resources.get("limits"))
        if spec is None:
            logger.warning("Could not find resource request for PAI daemon %s",
                           prophet_daemon)
            continue
        ret["cpu-resource"] += parse_quantity(spec.get("cpu", 0))
        # bytes -> MiB
        ret["mem-resource"] += parse_quantity(spec.get("memory", 0)) / 1024 / 1024
    return ret
def k8s_container_resource_requirements(container) -> Dict[str, int]:
    """
    returns dict in format: {"req_cpu": 1, "req_mem": 1, "lim_cpu": 2, "lim_mem": 2}

    Missing sections default to 0. Values come straight from parse_quantity,
    so they may be Decimal rather than int despite the annotation. Returns
    None (after logging) on unexpected errors, preserving the original
    best-effort contract.
    """
    try:
        # Original code reused one cpu/memory pair across both sections and
        # ended with a no-op max(cpu, cpu); distinct locals are clearer and
        # produce the same result.
        req_cpu = req_mem = lim_cpu = lim_mem = 0
        if not container.resources:
            return {
                "req_cpu": req_cpu,
                "req_mem": req_mem,
                "lim_cpu": lim_cpu,
                "lim_mem": lim_mem
            }
        limits = container.resources.limits
        if limits and limits.get('cpu'):
            lim_cpu = parse_quantity(limits['cpu'])
        if limits and limits.get('memory'):
            lim_mem = parse_quantity(limits['memory'])
        requests = container.resources.requests
        if requests and requests.get('cpu'):
            req_cpu = parse_quantity(requests['cpu'])
        if requests and requests.get('memory'):
            req_mem = parse_quantity(requests['memory'])
        return {
            "req_cpu": req_cpu,
            "req_mem": req_mem,
            "lim_cpu": lim_cpu,
            "lim_mem": lim_mem
        }
    except Exception:
        logger.exception('Error getting resource requirements for container')
def get_k8s_cluster_info(working_dir, dns_prefix, location):
    """Inventory an aks-engine-style cluster via its generated kubeconfig.

    Classifies nodes by name ("opmaster"/"opworker" substrings), computes
    usable resources per node (allocatable minus 2 vcores and 8 GiB reserved
    for system use), derives a per-unit SKU from the first worker seen, and
    collects master external/internal IPs.

    NOTE(review): the nesting below is reconstructed from a whitespace-mangled
    source; the sku block is taken to run only for the first worker — confirm
    against the original file.
    """
    kube_config_path = "{0}/_output/{1}/kubeconfig/kubeconfig.{2}.json".format(
        working_dir, dns_prefix, location)
    master_string = "opmaster"
    worker_string = "opworker"
    config.load_kube_config(config_file=kube_config_path)
    api_instance = client.CoreV1Api()
    pretty = 'true'
    timeout_seconds = 56
    master = dict()
    worker = dict()
    sku = None              # per-unit resource profile, derived from first worker
    gpu_enable = False      # True once any worker reports GPUs
    master_ip = None        # prefers ExternalIP, falls back to first address
    master_ip_internal = None
    worker_count = 0
    worker_with_gpu = 0
    try:
        api_response = api_instance.list_node(pretty=pretty,
                                              timeout_seconds=timeout_seconds)
        for node in api_response.items:
            # GPU count is optional; CPU-only nodes lack the key entirely.
            gpu_resource = 0
            if 'nvidia.com/gpu' in node.status.allocatable:
                gpu_resource = int(
                    parse_quantity(node.status.allocatable['nvidia.com/gpu']))
            if master_string in node.metadata.name:
                # Reserve 2 vcores and 8 GiB (memory converted bytes -> MiB).
                master[node.metadata.name] = {
                    "cpu-resource":
                    int(parse_quantity(node.status.allocatable['cpu'])) - 2,
                    "mem-resource":
                    int(
                        parse_quantity(node.status.allocatable['memory']) /
                        1024 / 1024) - 8 * 1024,
                    "gpu-resource": gpu_resource,
                }
                master[node.metadata.name]["hostname"] = node.metadata.name
                for address in node.status.addresses:
                    if address.type == "Hostname":
                        continue
                    # First non-hostname address is the fallback master_ip;
                    # an ExternalIP, if present, overrides it.
                    if master_ip == None:
                        master_ip = address.address
                    if address.type == "ExternalIP":
                        master_ip = address.address
                    if address.type == "InternalIP":
                        master[node.metadata.name]["ip"] = address.address
                        master_ip_internal = address.address
            elif worker_string in node.metadata.name:
                worker[node.metadata.name] = {
                    "cpu-resource":
                    int(parse_quantity(node.status.allocatable['cpu'])) - 2,
                    "mem-resource":
                    int(
                        parse_quantity(node.status.allocatable['memory']) /
                        1024 / 1024) - 8 * 1024,
                    "gpu-resource": gpu_resource,
                }
                if sku is None:
                    # Derive per-unit SKU from this (first) worker: per-GPU
                    # units when GPUs exist, otherwise per-vcore units.
                    sku = dict()
                    if gpu_resource != 0:
                        sku["gpu_resource"] = worker[
                            node.metadata.name]["gpu-resource"]
                        sku["mem-unit"] = int(
                            worker[node.metadata.name]["mem-resource"] /
                            worker[node.metadata.name]["gpu-resource"])
                        sku["cpu-unit"] = int(
                            worker[node.metadata.name]["cpu-resource"] /
                            worker[node.metadata.name]["gpu-resource"])
                    else:
                        sku["cpu_resource"] = worker[
                            node.metadata.name]["cpu-resource"]
                        sku["mem-unit"] = int(
                            worker[node.metadata.name]["mem-resource"] /
                            worker[node.metadata.name]["cpu-resource"])
                worker_count = worker_count + 1
                if worker[node.metadata.name]["gpu-resource"] != 0:
                    worker_with_gpu = worker_with_gpu + 1
                    gpu_enable = True
                worker[node.metadata.name]["hostname"] = node.metadata.name
                for address in node.status.addresses:
                    if address.type == "Hostname":
                        continue
                    if address.type == "InternalIP":
                        worker[node.metadata.name]["ip"] = address.address
    except ApiException as e:
        logger.error("Exception when calling CoreV1Api->list_node: %s\n" % e)
    return {
        "master": master,
        "worker": worker,
        "sku": sku,
        "gpu": gpu_enable,
        # gpu-ready means every worker (not just some) exposes GPUs
        "gpu-ready": worker_count == worker_with_gpu,
        "master_ip": master_ip,
        "master_internal": master_ip_internal,
        "working_dir": "{0}/{1}".format(working_dir, TEMPORARY_DIR_NAME),
        "kube_config":
        "{0}/_output/{1}/kubeconfig/kubeconfig.{2}.json".format(
            working_dir, dns_prefix, location)
    }
def parse_kubernetes_value(val: str) -> str:
    """Normalize a Kubernetes quantity (e.g. "100Mi", "250m") and render
    the parsed value back as a plain decimal string."""
    quantity = parse_quantity(val)
    return str(quantity)
# Fetch live node usage via a raw call to the metrics.k8s.io endpoint
# (the typed client does not wrap it), decoding the JSON payload by hand.
api_client = client.ApiClient()
raw_resp = api_client.call_api('/apis/metrics.k8s.io/v1beta1/nodes/', 'GET',
                               _preload_content=False)
response_metrics = json.loads(raw_resp[0].data.decode('utf-8'))

# List the nodes and clear the LED matrix before redrawing.
nodes = v1.list_node()
row = 0
unicorn.off()
for node in nodes.items:
    # Allocatable capacity for this node.
    node_name = node.metadata.name
    mem_total = utils.parse_quantity(node.status.allocatable["memory"])
    cpu_total = utils.parse_quantity(node.status.allocatable["cpu"])
    # Match this node's entry in the metrics payload by name.
    usage = next(m for m in response_metrics["items"]
                 if m["metadata"]["name"] == node_name)
    mem_used = utils.parse_quantity(usage["usage"]["memory"])
    cpu_used = utils.parse_quantity(usage["usage"]["cpu"])
    # Scale usage ratios to the LED strip width.
    cpu_leds = round((cpu_used / cpu_total) * width)
    mem_leds = round((mem_used / mem_total) * width)
    drawBarRed(cpu_leds, row)
    drawBarGreen(mem_leds, row + 1)
    # Advance three rows so a blank row separates nodes.
    # NOTE: this layout only works with exactly three nodes.
    row = row + 3
def parse_cpu_quantity(q):
    """Convert a CPU quantity to millicores; passes None through unchanged."""
    return None if q is None else parse_quantity(q) * 1000
def parse_memory_quantity(q):
    """Convert a memory quantity to MiB; passes None through unchanged."""
    return None if q is None else parse_quantity(q) / MEBIBYTE