def cb():
    src_dict = {}
    # NVML reports PCIe throughput in KB/s; divide by 1024 for MB/s
    src_dict["pci-tx"] = [pynvml.nvmlDeviceGetPcieThroughput(
        gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES) / 1024
        for i in range(ngpus)]
    src_dict["pci-rx"] = [pynvml.nvmlDeviceGetPcieThroughput(
        gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES) / 1024
        for i in range(ngpus)]
    source.data.update(src_dict)
def cb():
    nonlocal last_time
    now = time.time()
    src_dict = {"time": [now * 1000]}
    gpu_tot = 0
    mem_tot = 0
    tx_tot = 0
    rx_tot = 0
    for i in range(ngpus):
        gpu = pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu
        mem = pynvml.nvmlDeviceGetMemoryInfo(gpu_handles[i]).used
        # NVML reports PCIe throughput in KB/s; multiply by 1024 for B/s
        tx = (pynvml.nvmlDeviceGetPcieThroughput(
            gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES) * 1024)
        rx = (pynvml.nvmlDeviceGetPcieThroughput(
            gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES) * 1024)
        gpu_tot += gpu
        mem_tot += mem / (1024 * 1024)  # bytes -> MiB
        rx_tot += rx
        tx_tot += tx
        src_dict["gpu-" + str(i)] = [gpu]
        src_dict["memory-" + str(i)] = [mem]
    src_dict["gpu-total"] = [gpu_tot / ngpus]
    src_dict["memory-total"] = [(mem_tot / gpu_mem_sum) * 100]
    src_dict["tx-total"] = [tx_tot]
    src_dict["rx-total"] = [rx_tot]
    # Stream the new sample, keeping at most 1000 points of history
    source.stream(src_dict, 1000)
    last_time = now
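# Both callbacks above close over surrounding state: `ngpus`,
# `gpu_handles`, a Bokeh `source`, and (for the second) `gpu_mem_sum`
# and `last_time`. Below is a minimal sketch of that setup for the
# streaming callback; the names are taken from the snippets above, but
# the initialization itself is an assumption, not the original code.
# (In the original, `last_time` lives in an enclosing function, which
# is what makes the `nonlocal` declaration legal.)

import time

import pynvml
from bokeh.models import ColumnDataSource

pynvml.nvmlInit()
ngpus = pynvml.nvmlDeviceGetCount()
gpu_handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(ngpus)]

# Total memory across all GPUs in MiB, for the "memory-total" percentage
gpu_mem_sum = sum(
    pynvml.nvmlDeviceGetMemoryInfo(h).total for h in gpu_handles
) / (1024 * 1024)

last_time = time.time()
# stream() requires every column it writes to exist up front
source = ColumnDataSource({
    "time": [], "gpu-total": [], "memory-total": [],
    "tx-total": [], "rx-total": [],
    **{"gpu-" + str(i): [] for i in range(ngpus)},
    **{"memory-" + str(i): [] for i in range(ngpus)},
})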
def test_nvmlDeviceGetPcieThroughput(ngpus, handles):
    for i in range(ngpus):
        tx_bytes_tp = pynvml.nvmlDeviceGetPcieThroughput(
            handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES)
        assert tx_bytes_tp >= 0
        rx_bytes_tp = pynvml.nvmlDeviceGetPcieThroughput(
            handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES)
        assert rx_bytes_tp >= 0
        count_tp = pynvml.nvmlDeviceGetPcieThroughput(
            handles[i], pynvml.NVML_PCIE_UTIL_COUNT)
        assert count_tp >= 0
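# `ngpus` and `handles` look like pytest fixtures. A plausible
# conftest.py sketch follows; this is an assumption for illustration,
# the real fixtures may differ:

import pynvml
import pytest


@pytest.fixture(scope="module")
def ngpus():
    pynvml.nvmlInit()
    yield pynvml.nvmlDeviceGetCount()
    pynvml.nvmlShutdown()


@pytest.fixture(scope="module")
def handles(ngpus):
    return [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(ngpus)]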
def pci(doc):
    # Sample current PCIe throughput (NVML reports KB/s; /1024 -> MB/s)
    pci_tx = [pynvml.nvmlDeviceGetPcieThroughput(
        gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES) / 1024
        for i in range(ngpus)]
    pci_rx = [pynvml.nvmlDeviceGetPcieThroughput(
        gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES) / 1024
        for i in range(ngpus)]

    left = list(range(ngpus))
    right = [l + 0.8 for l in left]
    # One shared source drives both figures; with the two separate
    # sources the original created, the callback only refreshed the
    # second one, so the TX plot never updated.
    source = ColumnDataSource(
        {"left": left, "right": right, "pci-tx": pci_tx, "pci-rx": pci_rx})
    mapper = LinearColorMapper(
        palette=all_palettes['RdYlBu'][4], low=0, high=5000)

    tx_fig = figure(title="TX Bytes [MB/s]",
                    sizing_mode="stretch_both", y_range=[0, 5000])
    tx_fig.quad(
        source=source, left="left", right="right", bottom=0, top="pci-tx",
        color={"field": "pci-tx", "transform": mapper}
    )

    rx_fig = figure(title="RX Bytes [MB/s]",
                    sizing_mode="stretch_both", y_range=[0, 5000])
    rx_fig.quad(
        source=source, left="left", right="right", bottom=0, top="pci-rx",
        color={"field": "pci-rx", "transform": mapper}
    )

    doc.title = "PCI Throughput"
    doc.add_root(column(tx_fig, rx_fig, sizing_mode="stretch_both"))

    def cb():
        src_dict = {}
        src_dict["pci-tx"] = [pynvml.nvmlDeviceGetPcieThroughput(
            gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES) / 1024
            for i in range(ngpus)]
        src_dict["pci-rx"] = [pynvml.nvmlDeviceGetPcieThroughput(
            gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES) / 1024
            for i in range(ngpus)]
        source.data.update(src_dict)

    doc.add_periodic_callback(cb, 200)
def pci(doc):
    # Use device-0 to get "upper bound"
    pci_gen = pynvml.nvmlDeviceGetMaxPcieLinkGeneration(gpu_handles[0])
    pci_width = pynvml.nvmlDeviceGetMaxPcieLinkWidth(gpu_handles[0])
    pci_bw = {
        # Keys = PCIe generation, Values = max per-lane BW (per direction)
        # in GB/s [Note: using specs at
        # https://en.wikipedia.org/wiki/PCI_Express]
        1: (250.0 / 1024.0),
        2: (500.0 / 1024.0),
        3: (985.0 / 1024.0),
        4: (1969.0 / 1024.0),
        5: (3938.0 / 1024.0),
        6: (7877.0 / 1024.0),
    }
    # Max PCIe throughput = per-lane BW * link width
    max_rxtx_tp = pci_width * pci_bw[pci_gen]

    pci_tx = [
        pynvml.nvmlDeviceGetPcieThroughput(gpu_handles[i],
                                           pynvml.NVML_PCIE_UTIL_TX_BYTES)
        / (1024.0 * 1024.0)  # Convert KB/s -> GB/s
        for i in range(ngpus)
    ]
    pci_rx = [
        pynvml.nvmlDeviceGetPcieThroughput(gpu_handles[i],
                                           pynvml.NVML_PCIE_UTIL_RX_BYTES)
        / (1024.0 * 1024.0)  # Convert KB/s -> GB/s
        for i in range(ngpus)
    ]

    left = list(range(ngpus))
    right = [l + 0.8 for l in left]
    source = ColumnDataSource({
        "left": left,
        "right": right,
        "pci-tx": pci_tx,
        "pci-rx": pci_rx
    })
    mapper = LinearColorMapper(palette=all_palettes["RdYlBu"][4],
                               low=0, high=max_rxtx_tp)

    tx_fig = figure(title="TX Bytes [GB/s]", sizing_mode="stretch_both",
                    y_range=[0, max_rxtx_tp])
    tx_fig.quad(
        source=source,
        left="left",
        right="right",
        bottom=0,
        top="pci-tx",
        color={"field": "pci-tx", "transform": mapper},
    )
    tx_fig.toolbar_location = None

    rx_fig = figure(title="RX Bytes [GB/s]", sizing_mode="stretch_both",
                    y_range=[0, max_rxtx_tp])
    rx_fig.quad(
        source=source,
        left="left",
        right="right",
        bottom=0,
        top="pci-rx",
        color={"field": "pci-rx", "transform": mapper},
    )
    rx_fig.toolbar_location = None

    doc.title = "PCI Throughput"
    doc.add_root(column(tx_fig, rx_fig, sizing_mode="stretch_both"))

    def cb():
        src_dict = {}
        src_dict["pci-tx"] = [
            pynvml.nvmlDeviceGetPcieThroughput(
                gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES)
            / (1024.0 * 1024.0)  # Convert KB/s -> GB/s
            for i in range(ngpus)
        ]
        src_dict["pci-rx"] = [
            pynvml.nvmlDeviceGetPcieThroughput(
                gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES)
            / (1024.0 * 1024.0)  # Convert KB/s -> GB/s
            for i in range(ngpus)
        ]
        source.data.update(src_dict)

    doc.add_periodic_callback(cb, 200)
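# A quick sanity check on the ceiling computed above (link parameters
# assumed for illustration): a Gen3 x16 device gives
#     max_rxtx_tp = 16 * (985.0 / 1024.0) ~= 15.39 GB/s per direction,
# so the y-ranges and color mapper track the link's theoretical peak
# instead of the hard-coded 5000 MB/s used in the earlier version.
assert abs(16 * (985.0 / 1024.0) - 15.39) < 0.01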
def _get_data(self):
    data = {}

    if self.deviceCount:
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            brand = pynvml.nvmlDeviceGetBrand(handle)
            brands = ['Unknown', 'Quadro', 'Tesla', 'NVS', 'Grid',
                      'GeForce', 'Titan']

            ### Get data ###
            ## Memory usage
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            except Exception as e:
                self.debug(str(e))
                mem = None
            ## ECC errors
            try:
                eccErrors = {}
                eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
                memErrorType = ['ERROR_TYPE_CORRECTED',
                                'ERROR_TYPE_UNCORRECTED']
                memoryLocationType = ['L1_CACHE', 'L2_CACHE',
                                      'DEVICE_MEMORY', 'REGISTER_FILE',
                                      'TEXTURE_MEMORY']
                # Fresh dicts per iteration; the original reused one
                # _memError/_eccCounter dict, which aliased every
                # location to the values of the last one read.
                for memoryLocation in range(5):
                    _eccCounter = {}
                    for eccCounter in range(2):
                        _memError = {}
                        for memError in range(2):
                            _memError[memErrorType[memError]] = \
                                pynvml.nvmlDeviceGetMemoryErrorCounter(
                                    handle, memError, eccCounter,
                                    memoryLocation)
                        _eccCounter[eccCounterType[eccCounter]] = _memError
                    eccErrors[memoryLocationType[memoryLocation]] = _eccCounter
            except Exception as e:
                self.debug(str(e))
                eccErrors = None
            ## Temperature
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
            except Exception as e:
                self.debug(str(e))
                temp = None
            ## Fan
            try:
                fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
            except Exception as e:
                self.debug(str(e))
                fanspeed = None
            ## Power
            try:
                power = pynvml.nvmlDeviceGetPowerUsage(handle)
            except Exception as e:
                self.debug(str(e))
                power = None
            ## GPU and Memory Utilization
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                gpu_util = util.gpu
                mem_util = util.memory
            except Exception as e:
                self.debug(str(e))
                gpu_util = None
                mem_util = None
            ## PCI Express Bandwidth Utilization
            try:
                pcie_tx = pynvml.nvmlDeviceGetPcieThroughput(
                    handle, pynvml.NVML_PCIE_UTIL_TX_BYTES)
                pcie_rx = pynvml.nvmlDeviceGetPcieThroughput(
                    handle, pynvml.NVML_PCIE_UTIL_RX_BYTES)
            except Exception as e:
                self.debug(str(e))
                pcie_tx = None
                pcie_rx = None

            ### Packing data ###
            self.debug("Device", gpuIdx, ":", str(name))
            data["device_name_" + gpuIdx] = name
            self.debug("Brand:", str(brands[brand]))
            self.debug(str(name), "Temp :", str(temp))
            data["device_temp_" + gpuIdx] = temp
            # Guard against a failed memory query so the legacy
            # fallback below can still fill these keys in
            mem_total = mem.total if mem is not None else None
            mem_used = mem.used if mem is not None else None
            mem_free = mem.free if mem is not None else None
            self.debug(str(name), "Mem total :", str(mem_total), 'bytes')
            data["device_mem_total_" + gpuIdx] = mem_total
            self.debug(str(name), "Mem used :", str(mem_used), 'bytes')
            data["device_mem_used_" + gpuIdx] = mem_used
            self.debug(str(name), "Mem free :", str(mem_free), 'bytes')
            data["device_mem_free_" + gpuIdx] = mem_free
            self.debug(str(name), "Utilization GPU :", str(gpu_util), '%')
            data["device_util_gpu_" + gpuIdx] = gpu_util
            self.debug(str(name), "Utilization MEM :", str(mem_util), '%')
            data["device_util_mem_" + gpuIdx] = mem_util
            # nvmlDeviceGetPcieThroughput returns KB/s, not a percentage
            self.debug(str(name), "Throughput PCIE TX :", str(pcie_tx), 'KB/s')
            data["device_util_pcie_tx_" + gpuIdx] = pcie_tx
            self.debug(str(name), "Throughput PCIE RX :", str(pcie_rx), 'KB/s')
            data["device_util_pcie_rx_" + gpuIdx] = pcie_rx
            self.debug(str(name), "Fan speed :", str(fanspeed), '%')
            data["device_fanspeed_" + gpuIdx] = fanspeed
            # nvmlDeviceGetPowerUsage reports milliwatts
            self.debug(str(name), "Power Usage :", str(power), 'mW')
            data["device_power_" + gpuIdx] = power
            self.debug(str(name), "ECC errors:", str(eccErrors))
            if eccErrors is not None:
                # Flatten the nested dict into one metric per
                # location/counter/error-type combination
                for location in memoryLocationType:
                    for counter, counterName in (
                            ('VOLATILE_ECC', 'VOLATILE'),
                            ('AGGREGATE_ECC', 'AGGREGATE')):
                        for error, errorName in (
                                ('ERROR_TYPE_CORRECTED', 'CORRECTED'),
                                ('ERROR_TYPE_UNCORRECTED', 'UNCORRECTED')):
                            key = "device_ecc_errors_{0}_{1}_{2}_{3}".format(
                                location, counterName, errorName, gpuIdx)
                            data[key] = eccErrors[location][counter][error]
            else:
                data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_"
                     + gpuIdx] = None

    ## Get unit (S-class Nvidia cards) data
    if self.unitCount:
        for i in range(self.unitCount):
            gpuIdx = str(i)
            handle = pynvml.nvmlUnitGetHandleByIndex(i)

            try:
                fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
                fan_speed = fan.speed  # Fan speed (RPM)
                fan_state = fan.state  # Whether fan is working properly
            except Exception as e:
                self.debug(str(e))
                fan_speed = None
                fan_state = None
            try:
                psu = pynvml.nvmlUnitGetPsuInfo(handle)
                psu_current = psu.current  # PSU current (A)
                psu_power = psu.power      # PSU power draw (W)
                psu_state = psu.state      # The power supply state
                psu_voltage = psu.voltage  # PSU voltage (V)
            except Exception as e:
                self.debug(str(e))
                psu_current = None
                psu_power = None
                psu_state = None
                psu_voltage = None
            try:
                temp_intake = pynvml.nvmlUnitGetTemperature(handle, 0)   # Intake temperature (C)
                temp_exhaust = pynvml.nvmlUnitGetTemperature(handle, 1)  # Exhaust temperature (C)
                temp_board = pynvml.nvmlUnitGetTemperature(handle, 2)    # Board temperature (C)
            except Exception as e:
                self.debug(str(e))
                temp_intake = None
                temp_exhaust = None
                temp_board = None

            self.debug('Unit fan speed:', str(fan_speed))
            data["unit_fan_speed_" + gpuIdx] = fan_speed
            self.debug('Unit fan state:', str(fan_state))
            data["unit_fan_state_" + gpuIdx] = fan_state
            self.debug('Unit PSU current:', str(psu_current))
            data["unit_psu_current_" + gpuIdx] = psu_current
            self.debug('Unit PSU power:', str(psu_power))
            data["unit_psu_power_" + gpuIdx] = psu_power
            self.debug('Unit PSU state:', str(psu_state))
            data["unit_psu_state_" + gpuIdx] = psu_state
            self.debug('Unit PSU voltage:', str(psu_voltage))
            data["unit_psu_voltage_" + gpuIdx] = psu_voltage
            self.debug('Unit temp intake:', str(temp_intake))
            data["unit_temp_intake_" + gpuIdx] = temp_intake
            self.debug('Unit temp exhaust:', str(temp_exhaust))
            data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust
            self.debug('Unit temp board:', str(temp_board))
            data["unit_temp_board_" + gpuIdx] = temp_board

    ## Get data via legacy mode
    if self.legacy:
        try:
            # Requires: from subprocess import Popen, PIPE
            output, error = Popen(
                [
                    "nvidia-settings",
                    "-c", ":0",
                    "-q", "GPUUtilization",
                    "-q", "GPUCurrentClockFreqs",
                    "-q", "GPUCoreTemp",
                    "-q", "TotalDedicatedGPUMemory",
                    "-q", "UsedDedicatedGPUMemory"
                ],
                shell=False, stdout=PIPE, stderr=PIPE).communicate()
            output = repr(str(output))
            if len(output) < 800:
                raise Exception(
                    'Error in fetching data from nvidia-settings ' + output)
            self.debug(str(error), output)
        except Exception as e:
            self.error(str(e))
            self.error('Setting legacy mode to False')
            self.legacy = False
            return data

        # Requires: from re import findall
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            if data["device_temp_" + gpuIdx] is None:
                coreTemp = findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)',
                                   output)[i][1]
                try:
                    data["device_temp_" + gpuIdx] = int(coreTemp)
                    self.debug('Using legacy temp for GPU {0}: {1}'.format(
                        gpuIdx, coreTemp))
                except Exception as e:
                    self.debug(str(e), "skipping device_temp_" + gpuIdx)
            if data["device_mem_used_" + gpuIdx] is None:
                memUsed = findall(
                    r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)',
                    output)[i][1]
                try:
                    data["device_mem_used_" + gpuIdx] = int(memUsed)
                    self.debug('Using legacy mem_used for GPU {0}: {1}'.format(
                        gpuIdx, memUsed))
                except Exception as e:
                    self.debug(str(e), "skipping device_mem_used_" + gpuIdx)
            if data["device_util_gpu_" + gpuIdx] is None:
                gpu_util = findall(
                    r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                    output)[i][1]
                try:
                    data["device_util_gpu_" + gpuIdx] = int(gpu_util)
                    self.debug('Using legacy load_gpu for GPU {0}: {1}'.format(
                        gpuIdx, gpu_util))
                except Exception as e:
                    self.debug(str(e), "skipping device_util_gpu_" + gpuIdx)
            if data["device_util_mem_" + gpuIdx] is None:
                mem_util = findall(
                    r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                    output)[i][2]
                try:
                    data["device_util_mem_" + gpuIdx] = int(mem_util)
                    self.debug('Using legacy load_mem for GPU {0}: {1}'.format(
                        gpuIdx, mem_util))
                except Exception as e:
                    self.debug(str(e), "skipping device_util_mem_" + gpuIdx)

    return data
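# _get_data above is written as a method of a netdata-style Service
# class: it relies on self.debug/self.error, self.deviceCount,
# self.unitCount, and self.legacy. The stub below is a hypothetical
# harness for running it standalone; the attribute names are taken
# from the method itself, everything else is assumed.

import pynvml


class _FakeService(object):
    def __init__(self):
        pynvml.nvmlInit()
        self.deviceCount = pynvml.nvmlDeviceGetCount()
        self.unitCount = pynvml.nvmlUnitGetCount()
        self.legacy = False  # skip the nvidia-settings fallback

    def debug(self, *args):
        print("DEBUG:", " ".join(str(a) for a in args))

    def error(self, *args):
        print("ERROR:", " ".join(str(a) for a in args))

    # Reuse the module-level function above as the method
    _get_data = _get_data


if __name__ == "__main__":
    print(_FakeService()._get_data())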
def log_system(log_file, process_pids=None):
    """ Logs system utilization metrics to log file """
    # log cpu util (the two messages assume a 48-core machine)
    cpu_util = psutil.cpu_percent()
    cpu_util_ind = psutil.cpu_percent(percpu=True)
    ts = time.time()
    key = "INFO"
    message = "CPU util: {}% -- Individual utils 1-24: {}".format(
        cpu_util, cpu_util_ind[:24])
    write_to_log(log_file, (ts, key, message))
    message = "CPU util: {}% -- Individual utils 25-48: {}".format(
        cpu_util, cpu_util_ind[24:])
    write_to_log(log_file, (ts, key, message))

    # log GPU util and memory
    max_gpu_util = 0
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
        for idx in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
            board_num = pynvml.nvmlDeviceGetBoardId(handle)
            name = "GPU {}: {} (ID {})".format(
                idx, pynvml.nvmlDeviceGetName(handle).decode("utf-8"),
                board_num)
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            fan_util = pynvml.nvmlDeviceGetFanSpeed(handle)
            # nvmlDeviceGetPcieThroughput expects an NVML_PCIE_UTIL_*
            # counter as its second argument (the PCIe replay counter
            # the original passed here is an unrelated error counter).
            # It returns KB/s; sum both directions for a single figure.
            pcie_util = (
                pynvml.nvmlDeviceGetPcieThroughput(
                    handle, pynvml.NVML_PCIE_UTIL_TX_BYTES)
                + pynvml.nvmlDeviceGetPcieThroughput(
                    handle, pynvml.NVML_PCIE_UTIL_RX_BYTES))
            gpu_util = util.gpu
            mem_util = util.memory
            message = "{}: Kernel:{}% Mem:{}% Fan:{}% PCIe: {}MB/s".format(
                name, gpu_util, mem_util, fan_util,
                round(pcie_util / 1000, 1))
            ts = time.time()
            key = "INFO"
            write_to_log(log_file, (ts, key, message))
            if gpu_util > max_gpu_util:
                max_gpu_util = gpu_util
    except pynvml.NVMLError as error:
        print(error)

    # log memory util
    mem_util = psutil.virtual_memory()
    used = round(mem_util.used / 1e+9, 2)
    total = round(mem_util.total / 1e+9, 2)
    ts = time.time()
    key = "INFO"
    message = "Memory util: {}% ({}/{}GB)".format(
        round(used / total * 100, 2), used, total)
    write_to_log(log_file, (ts, key, message))

    # log status of tracked processes
    pid_statuses = []
    warning = False
    if process_pids is not None:
        for key in process_pids:
            pid = process_pids[key]
            try:
                # Signal 0 performs error checking only: it raises
                # OSError if the pid no longer exists (Unix only)
                os.kill(pid, 0)
                status = "running"
            except OSError:
                status = "stopped"
                warning = True
            pid_statuses.append("{} ({}): {}\n".format(key, pid, status))
        ts = time.time()
        key = "INFO"
        if warning:
            key = "WARNING"
        write_to_log(log_file, (ts, key, pid_statuses))

    last_log_time = time.time()
    return last_log_time, max_gpu_util
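# log_system assumes pynvml.nvmlInit() has already been called and that
# a write_to_log(path, record) helper exists. Below is a minimal sketch
# of that helper plus a call site, under those assumptions: the record
# layout (ts, key, message) is taken from the calls above, while the
# file format itself is invented for illustration.

import os
import time

import psutil
import pynvml


def write_to_log(log_file, record):
    # Append one "timestamp [LEVEL] message" line per record
    ts, key, message = record
    with open(log_file, "a") as f:
        f.write("{:.3f} [{}] {}\n".format(ts, key, message))


pynvml.nvmlInit()
last_log_time, max_gpu_util = log_system(
    "system.log", process_pids={"trainer": os.getpid()})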