def __get_gpu_info(self):
    # Convert a raw byte count into a human-readable string.
    def parse_unit(val, scale=1000):
        unit_ls = ['B', 'KB', 'MB', 'GB']
        unit_lv = 0
        while val >= scale:
            val /= scale
            unit_lv += 1
            if unit_lv == len(unit_ls) - 1:
                break
        return '{:.2f} {}'.format(val, unit_ls[unit_lv])

    sum_info = []
    process_ls = []
    nv.nvmlInit()
    gpu_num = nv.nvmlDeviceGetCount()
    # Iterate over every GPU on the host
    for gpu_idx in range(gpu_num):
        h = nv.nvmlDeviceGetHandleByIndex(gpu_idx)
        dev_name = nv.nvmlDeviceGetName(h).decode()
        mem_info = nv.nvmlDeviceGetMemoryInfo(h)
        raw_total_mem = mem_info.total
        total_mem = parse_unit(raw_total_mem, 1024)
        raw_used_mem = mem_info.used
        used_mem = parse_unit(raw_used_mem, 1024)
        gpu_util = '{:.2f}'.format(nv.nvmlDeviceGetUtilizationRates(h).gpu)
        gpu_mem_util = '{:.2f}'.format(raw_used_mem * 100 / raw_total_mem)

        sum_info.append({
            'gpu_idx': str(gpu_idx),
            'dev_name': dev_name,
            'total_mem': total_mem,
            'used_mem': used_mem,
            'gpu_util': gpu_util,
            'gpu_mem_util': gpu_mem_util,
        })

        # Collect per-process GPU usage for both compute ('C') and graphics ('G') contexts
        for process_type, get_procs in (
                ('C', nv.nvmlDeviceGetComputeRunningProcesses),
                ('G', nv.nvmlDeviceGetGraphicsRunningProcesses)):
            for obj in get_procs(h):
                process_pid = obj.pid
                process_raw_gpu_mem = obj.usedGpuMemory
                process_name = nv.nvmlSystemGetProcessName(process_pid).decode()
                ctan_name = self.get_ctan_name_by_pid(process_pid)
                process_ls.append({
                    'gpu_idx': str(gpu_idx),
                    'dev_name': dev_name,
                    'process_pid': str(process_pid),
                    'process_type': process_type,
                    'process_name': process_name,
                    'process_gpu_mem': parse_unit(process_raw_gpu_mem, 1024),
                    'ctan_name': ctan_name,
                })
    return sum_info, process_ls
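# NOTE: get_ctan_name_by_pid() is referenced above and below but is not shown in
# this section. The method below is a minimal sketch of one possible
# implementation, assuming `self.containers` is a docker-py ContainerCollection
# (docker.from_env().containers). It resolves a host PID to a container name by
# matching the PID against each running container's process table; a PID that
# belongs to no container yields an empty string. This is an illustrative
# assumption, not necessarily the original implementation.
def get_ctan_name_by_pid(self, pid):
    pid = str(pid)
    for ctan in self.containers.list():
        # docker-py: Container.top() returns {'Titles': [...], 'Processes': [[...], ...]}
        top = ctan.top()
        try:
            pid_col = top['Titles'].index('PID')
        except (KeyError, ValueError):
            continue
        if any(proc[pid_col] == pid for proc in top.get('Processes') or []):
            return ctan.name
    return ''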
def __get_ctan_verbose_stats(self, name):
    # Safely walk a chain of nested keys, returning `default` on any failure.
    def graceful_chain_get(d, *args, default=None):
        t = d
        for a in args:
            try:
                t = t[a]
            except (KeyError, ValueError, TypeError, AttributeError):
                return default
        return t

    # Compute the container's CPU usage percentage from the current and previous samples.
    def calculate_cpu_percent2(d, previous_cpu_total=None, previous_cpu_system=None):
        cpu_percent = 0.0
        cpu_total = float(d["cpu_stats"]["cpu_usage"]["total_usage"])
        if previous_cpu_total is None:
            previous_cpu_total = cpu_total
        cpu_delta = cpu_total - previous_cpu_total
        cpu_system = float(d["cpu_stats"]["system_cpu_usage"])
        if previous_cpu_system is None:
            previous_cpu_system = cpu_system
        system_delta = cpu_system - previous_cpu_system
        online_cpus = d["cpu_stats"].get(
            "online_cpus", len(d["cpu_stats"]["cpu_usage"]["percpu_usage"]))
        if system_delta > 0.0:
            cpu_percent = (cpu_delta / system_delta) * online_cpus * 100.0
        return cpu_percent, cpu_total, cpu_system

    # Sum block-device I/O.
    def calculate_blkio_bytes(d):
        """
        :param d: raw stats dict from the Docker API
        :return: (read_bytes, wrote_bytes), ints
        """
        bytes_stats = graceful_chain_get(d, "blkio_stats", "io_service_bytes_recursive")
        if not bytes_stats:
            return 0, 0
        r = 0
        w = 0
        for s in bytes_stats:
            if s["op"] == "Read":
                r += s["value"]
            elif s["op"] == "Write":
                w += s["value"]
        return r, w

    # Sum network traffic across all interfaces.
    def calculate_network_bytes(d):
        """
        :param d: raw stats dict from the Docker API
        :return: (received_bytes, transmitted_bytes), ints
        """
        networks = graceful_chain_get(d, "networks")
        if not networks:
            return 0, 0
        r = 0
        t = 0
        for if_name, data in networks.items():
            r += data["rx_bytes"]
            t += data["tx_bytes"]
        return r, t

    def calculate_mem_bytes(d):
        mem_limit = d['memory_stats']['limit']
        mem_usage = d['memory_stats']['usage']
        return mem_usage, mem_limit

    # Convert a raw byte count into a human-readable string.
    def parse_unit(val, scale=1000):
        unit_ls = ['B', 'KB', 'MB', 'GB']
        unit_lv = 0
        while val >= scale:
            val /= scale
            unit_lv += 1
            if unit_lv == len(unit_ls) - 1:
                break
        return '{:.2f} {}'.format(val, unit_ls[unit_lv])

    # Lazily open a streaming stats generator for this container.
    if name not in self.user_stats_stream:
        ctan = self.containers.get(name)
        self.user_stats_stream[name] = ctan.stats(decode=True)

    # Pull the next sample from the stats stream.
    if self.containers.get(name).status == 'running':
        raw_stats = next(self.user_stats_stream[name])
        pre_cpu_stats = self.pre_cpu_stats[name]
    else:
        return None

    # cpu
    cpu_percent, cpu_total, cpu_system = calculate_cpu_percent2(
        raw_stats, pre_cpu_stats[0], pre_cpu_stats[1])
    self.pre_cpu_stats[name] = [cpu_total, cpu_system]  # remember this sample for the next delta
    # blk
    read_blk, write_blk = calculate_blkio_bytes(raw_stats)
    # net
    read_net, write_net = calculate_network_bytes(raw_stats)
    # mem
    mem_usage, mem_limit = calculate_mem_bytes(raw_stats)

    # GPU memory attributed to this container's compute processes
    gpu_all_mem, gpu_used_mem, gpu_used_pcnt = 0, 0, 0
    gpu_num = nv.nvmlDeviceGetCount()
    for gpu_idx in range(gpu_num):
        h = nv.nvmlDeviceGetHandleByIndex(gpu_idx)
        running_process_obj_ls = nv.nvmlDeviceGetComputeRunningProcesses(h)
        for obj in running_process_obj_ls:
            process_pid = obj.pid
            process_raw_gpu_mem = obj.usedGpuMemory
            ctan_name = self.get_ctan_name_by_pid(process_pid)
            if ctan_name == name:
                gpu_used_mem += process_raw_gpu_mem
                gpu_all_mem += nv.nvmlDeviceGetMemoryInfo(h).total

    ret_dt = {
        'id': raw_stats['id'],
        'pid': str(raw_stats['pids_stats']['current']),
        'cpu_percent': '{:.2f}'.format(cpu_percent),
        'read_blk': parse_unit(read_blk),
        'write_blk': parse_unit(write_blk),
        'read_net': parse_unit(read_net),
        'write_net': parse_unit(write_net),
        'mem_usage': parse_unit(mem_usage, scale=1024),
        'mem_limit': parse_unit(mem_limit, scale=1024),
        'mem_usage_pcnt': '{:.2f}'.format(mem_usage / mem_limit * 100),
        'gpu_mem_usage': parse_unit(gpu_used_mem, 1024),
        'gpu_mem_limit': parse_unit(gpu_all_mem, 1024),
        # Guard against division by zero when the container holds no GPU memory.
        'gpu_mem_usage_pcnt': '{:.2f}'.format(
            gpu_used_mem / gpu_all_mem * 100 if gpu_all_mem else 0),
    }
    return ret_dt
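# The two methods above assume a monitor class that owns a Docker client and
# the NVML bindings. Below is a minimal sketch of that surrounding scaffolding;
# the class name `Monitor` and the __init__ details are assumptions inferred
# only from the attributes referenced above (self.containers,
# self.user_stats_stream, self.pre_cpu_stats), not the original source.
import docker
import pynvml as nv


class Monitor:
    """Skeleton of the class the methods above are assumed to live in."""

    def __init__(self):
        nv.nvmlInit()                                   # initialise NVML once per process
        self.containers = docker.from_env().containers  # docker-py ContainerCollection
        self.user_stats_stream = {}  # container name -> streaming stats generator
        # container name -> [prev_cpu_total, prev_cpu_system]; must be seeded
        # (e.g. with [None, None]) before __get_ctan_verbose_stats(name) is
        # first called, since that method indexes it directly.
        self.pre_cpu_stats = {}

    # __get_gpu_info(), __get_ctan_verbose_stats() and get_ctan_name_by_pid()
    # from above would be defined here.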