def get(index):
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
    except pynvml.NVMLError_GpuIsLost:
        return None
    memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return dict(
        nvmlDeviceGetName=pynvml.nvmlDeviceGetName(handle).decode('utf-8'),
        nvmlDeviceGetMemoryInfo=dict(
            total=memory_info.total,
            free=memory_info.free,
            used=memory_info.used,
        ),
        nvmlDeviceGetUtilizationRates=get_utilization_rates(handle),
        nvmlDeviceGetFanSpeed=get_fan_speed(handle),
        nvmlDeviceGetTemperature=pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU),
        nvmlDeviceGetTemperatureThreshold=dict(
            slowdown=pynvml.nvmlDeviceGetTemperatureThreshold(
                handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SLOWDOWN),
            shutdown=pynvml.nvmlDeviceGetTemperatureThreshold(
                handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN),
        ),
        nvmlDeviceGetPowerManagementLimit=pynvml.nvmlDeviceGetPowerManagementLimit(handle),
        nvmlDeviceGetPowerUsage=pynvml.nvmlDeviceGetPowerUsage(handle),
    )
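# A minimal usage sketch for the snippet above (not part of the original).
# It assumes pynvml is installed, that get_utilization_rates() and
# get_fan_speed() are helpers defined elsewhere in the same module, and that
# the caller owns NVML initialization/shutdown.
import pynvml

pynvml.nvmlInit()
try:
    info = get(0)  # returns None if the GPU at index 0 has fallen off the bus
    if info is not None:
        print(info['nvmlDeviceGetTemperature'])  # degrees Celsius
finally:
    pynvml.nvmlShutdown()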
def collect_metrics(self):
    """
    Collect NVIDIA GPU metrics (e.g. temperature, power consumption, fan speed, etc.)
    """
    data_list = []
    for gpu_num in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(gpu_num)
        device_name = DEVICE_NAME_FORMAT % gpu_num
        power_usage = float(nvmlDeviceGetPowerUsage(handle)) / 1000.0
        fan_speed = nvmlDeviceGetFanSpeed(handle)
        temperature = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
        data_list.append({
            'measurement': device_name,
            'tags': {
                'host': 'minar',
                'gpu': device_name
            },
            'fields': {
                'power_usage': power_usage,
                'fan_speed': fan_speed,
                'temperature': temperature
            }
        })
    time.sleep(PERIOD_SECS)
    return data_list
def _get_gpu_usage(gpu_count):
    import pynvml
    gpus = []
    for i in range(gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            temp = pynvml.nvmlDeviceGetTemperature(handle,
                                                   pynvml.NVML_TEMPERATURE_GPU)
            try:
                power_usage = (pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0) / \
                    (pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) / 1000.0) * 100
            except pynvml.NVMLError as e:
                logger.error(
                    "Couldn't extract power usage due to NVML exception: {}".format(str(e)))
                power_usage = -9999
            gpus.append((handle, util.gpu, util.memory,
                         (memory.used / float(memory.total)) * 100, temp, power_usage))
        except pynvml.NVMLError as e:
            logger.error(
                "Couldn't extract gpu usage information due to NVML exception: {}".format(str(e)))
            return None
    return gpus
def temperatures():
    ret = {}
    for i in range(nv.nvmlDeviceGetCount()):
        hdl = nv.nvmlDeviceGetHandleByIndex(i)
        temp = nv.nvmlDeviceGetTemperature(hdl, nv.NVML_TEMPERATURE_GPU)
        ret[i] = temp
    return ret
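# Hedged usage sketch (not from the original): temperatures() above assumes
# pynvml has been imported as `nv` in the same module and that nvmlInit()
# has already been called by the time it runs.
import pynvml as nv

nv.nvmlInit()
try:
    print(temperatures())  # e.g. {0: 54, 1: 61} in degrees Celsius
finally:
    nv.nvmlShutdown()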
def device_status(device_index):
    handle = nv.nvmlDeviceGetHandleByIndex(device_index)
    device_name = nv.nvmlDeviceGetName(handle)
    device_name = device_name.decode('UTF-8')
    nv_procs = nv.nvmlDeviceGetComputeRunningProcesses(handle)
    utilization = nv.nvmlDeviceGetUtilizationRates(handle).gpu
    clock_mhz = nv.nvmlDeviceGetClockInfo(handle, nv.NVML_CLOCK_SM)
    temperature = nv.nvmlDeviceGetTemperature(handle, nv.NVML_TEMPERATURE_GPU)
    pids = []
    users = []
    dates = []
    cmd = None
    for nv_proc in nv_procs:
        pid = nv_proc.pid
        pids.append(pid)
        try:
            proc = psutil.Process(pid)
            users.append(proc.username())
            dates.append(proc.create_time())
            if cmd is None:
                cmd = parse_cmd_roughly(proc.cmdline())
        except psutil.NoSuchProcess:
            users.append('?')
    return {
        'type': device_name,
        'is_available': len(pids) == 0,
        'pids': ','.join([str(pid) for pid in pids]),
        'users': ','.join(users),
        'running_since': arrow.get(min(dates)).humanize() if len(dates) > 0 else None,
        'utilization': utilization,
        'clock_mhz': clock_mhz,
        'temperature': temperature,
        'cmd': cmd,
    }
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except:
        deviceCount = 0
    for device_id in xrange(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = psutil.Process(ps.pid).name()
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory',
                           ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'
    pynvml.nvmlShutdown()
    self.service_check('nvml.check', status, message=msg)
def getTemp(handle):
    try:
        temp = str(pynvml.nvmlDeviceGetTemperature(handle,
                                                   pynvml.NVML_TEMPERATURE_GPU))
    except pynvml.NVMLError as err:
        temp = handleError(err)
        PUSH_TO_CW = False
    return temp
def get_temperature(handle):
    temp = -1
    try:
        temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
    except Exception:
        pass
    return temp
def get_nvidia_gpu_temper(self, handle, sensor=0):
    '''
    Get the temperature of an NVIDIA GPU.
    :param handle: NVML device handle
    :param sensor: temperature sensor index (0 is the GPU die)
    :return: temperature in degrees Celsius
    '''
    temper = pynvml.nvmlDeviceGetTemperature(handle, sensor)
    return temper
def get_gpu_temperatures():
    nvmlInit()
    gpus = dict()
    for i in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(i)
        gpus[i] = int(nvmlDeviceGetTemperature(handle, 0))
    nvmlShutdown()
    return gpus
def _crawl_in_system(self):
    '''
    nvidia-smi returns the following sections:
    MEMORY, UTILIZATION, ECC, TEMPERATURE, POWER, CLOCK, COMPUTE, PIDS,
    PERFORMANCE, SUPPORTED_CLOCKS, PAGE_RETIREMENT, ACCOUNTING

    Currently, the following are requested based on dlaas requirements:
        utilization.gpu, utilization.memory,
        memory.total, memory.free, memory.used

    nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
        memory.total,memory.free,memory.used --format=csv,noheader,nounits
    '''
    if self._init_nvml() == -1:
        return
    self.inspect_arr = exec_dockerps()
    num_gpus = pynvml.nvmlDeviceGetCount()
    for gpuid in range(0, num_gpus):
        gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
        temperature = pynvml.nvmlDeviceGetTemperature(
            gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
        memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
        mem_total = memory.total / 1024 / 1024
        mem_used = memory.used / 1024 / 1024
        mem_free = memory.free / 1024 / 1024
        power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000
        power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(gpuhandle) / 1000
        util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
        util_gpu = util.gpu
        util_mem = util.memory
        entry = {
            'utilization': {'gpu': util_gpu, 'memory': util_mem},
            'memory': {'total': mem_total, 'free': mem_free, 'used': mem_used},
            'temperature': temperature,
            'power': {'draw': power_draw, 'limit': power_limit}
        }
        key = self._get_feature_key(gpuhandle, gpuid)
        if gpuid == num_gpus - 1:
            self._shutdown_nvml()
        yield (key, entry, 'gpu')
    return
def get_gpu_temp(self):
    try:
        gpu_handle = nvmlDeviceGetHandleByIndex(0)
        temp_data = nvmlDeviceGetTemperature(gpu_handle, 0)
        if temp_data:
            return temp_data
    except Exception:
        self.__logger.error("SensorsThread.get_gpu_temp Exception",
                            exc_info=True)
    return None
def stats(self):
    stats = {}
    for i in range(0, self.gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            in_use_by_us = gpu_in_use_by_this_process(handle)

            stats["gpu.{}.{}".format(i, "gpu")] = util.gpu
            stats["gpu.{}.{}".format(i, "memory")] = util.memory
            stats["gpu.{}.{}".format(i, "memoryAllocated")] = (
                memory.used / float(memory.total)) * 100
            stats["gpu.{}.{}".format(i, "temp")] = temp

            if in_use_by_us:
                stats["gpu.process.{}.{}".format(i, "gpu")] = util.gpu
                stats["gpu.process.{}.{}".format(i, "memory")] = util.memory
                stats["gpu.process.{}.{}".format(i, "memoryAllocated")] = (
                    memory.used / float(memory.total)) * 100
                stats["gpu.process.{}.{}".format(i, "temp")] = temp

            # Some GPUs don't provide information about power usage
            try:
                power_watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
                power_capacity_watts = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                    handle) / 1000.0
                power_usage = (power_watts / power_capacity_watts) * 100

                stats["gpu.{}.{}".format(i, "powerWatts")] = power_watts
                stats["gpu.{}.{}".format(i, "powerPercent")] = power_usage
                if in_use_by_us:
                    stats["gpu.process.{}.{}".format(i, "powerWatts")] = power_watts
                    stats["gpu.process.{}.{}".format(i, "powerPercent")] = power_usage
            except pynvml.NVMLError as err:
                pass
        except pynvml.NVMLError as err:
            pass
    if psutil:
        # net = psutil.net_io_counters()
        sysmem = psutil.virtual_memory()
        stats["cpu"] = psutil.cpu_percent()
        stats["memory"] = sysmem.percent
    return stats
def get_gpu_status(gpu_index=0):
    # init for getting
    N.nvmlInit()
    handle = N.nvmlDeviceGetHandleByIndex(gpu_index)

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)
    except N.NVMLError:
        memory = None
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except:
        power_limit = None

    # real gpu index
    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature': temperature,
        'utilization': utilization.gpu if utilization else None,
        'power': int(power / 1000) if power is not None else None,
        'enforced.power': int(power_limit / 1000) if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
    }
    # release resource
    N.nvmlShutdown()
    return GPUStat(gpu_info)
def gpu_info(gpu_handle, i: int = 0) -> List[Dict[str, Any]]:
    power = pynvml.nvmlDeviceGetPowerUsage(gpu_handle) / 1000
    temperature = pynvml.nvmlDeviceGetTemperature(gpu_handle,
                                                  pynvml.NVML_TEMPERATURE_GPU)
    free_memory = best_prefix(pynvml.nvmlDeviceGetMemoryInfo(gpu_handle).free)
    return [
        dict(full_text=f'GPU Power {power:.1f} W', name=f'gpu{i}_power'),
        dict(full_text=free_memory.format('GPU RAM {value:.1f} {unit}'),
             name=f'gpu{i}_free_memory'),
        dict(full_text=f'GPU Temp {temperature} ℃', name=f'gpu{i}_temperature'),
    ]
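# Hedged usage sketch (not from the original): gpu_info() above assumes a
# best_prefix() helper from its surrounding status-bar project that wraps a
# byte count and exposes a .format() method. This only shows how the blocks
# might be produced for GPU 0 once NVML has been initialised.
import pynvml

pynvml.nvmlInit()
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    for block in gpu_info(handle, 0):
        print(block['name'], '->', block['full_text'])
finally:
    pynvml.nvmlShutdown()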
def get_heat_realtime(free_gpus):
    free_gpus_temp = []
    N.nvmlInit()
    for index in free_gpus:
        handle = N.nvmlDeviceGetHandleByIndex(index)
        try:
            temperature = float(
                N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU))
        except N.NVMLError:
            temperature = float('inf')
        free_gpus_temp += [temperature]
    return free_gpus_temp
def get_temperature(self):
    if self.nvh is None:
        return None
    t = None
    try:
        t = nv.nvmlDeviceGetTemperature(self.nvh, nv.NVML_TEMPERATURE_GPU)
        self.temperature = t
    except:
        logger.error(
            f"[{self.pci_dev.slot_name}/{self.name}] get temperature failed !!")
    return t
def report(self):
    self.loss_report /= self.update_decay_steps
    self.acc_report /= self.update_decay_steps
    for acc_step, loss_step in zip(self.acc_report.tolist(),
                                   self.loss_report.tolist()):
        self.stats.train_record(acc_step, loss_step)
    infos = list(nvmlDeviceGetMemoryInfo(handler)
                 for handler in self.gpu_info_handler)
    tems = list(nvmlDeviceGetTemperature(handler, 0)
                for handler in self.gpu_info_handler)
    info_used = sum(info.used for info in infos)
    info_total = sum(info.total for info in infos)
    output_str = str.format(
        'Step: %6d, acc:%6.2f (%6.2f~%6.2f), loss:%5.2f (%5.2f~%5.2f), '
        'lr: %.4f, bc: %d/%d, bs: %5d, tks: %6d+%6d, t: %5.2f, m: %5.2f/%5.2f, tem: %sC'
        % (self.processed_steps,
           self.acc_report.mean() * 100, self.acc_report.min() * 100,
           self.acc_report.max() * 100,
           self.loss_report.mean(), self.loss_report.min(), self.loss_report.max(),
           self.lr,
           self.src_tokens.sum() + self.tgt_tokens.sum(),
           self.src_num_pad_tokens.sum() + self.tgt_num_pad_tokens.sum(),
           self.num_examples.sum(),
           self.src_tokens.sum(), self.tgt_tokens.sum(),
           self.time_sum,
           info_used / self.memory_unit, info_total / self.memory_unit,
           '/'.join(str(x) for x in tems))
    )
    self.stats.log_to_file(output_str)
    print(output_str)
    self.acc_report.fill(0)
    self.loss_report.fill(0)
    self.update_decay_steps.fill(0)
    self.src_tokens.fill(0)
    self.tgt_tokens.fill(0)
    self.src_num_pad_tokens.fill(0)
    self.tgt_num_pad_tokens.fill(0)
    self.num_examples.fill(0)
    self.time_sum = 0.0
    if max(info.used / info.total for info in infos) > self.gpu_memory_limit:
        torch.cuda.empty_cache()
    return
def get_frame(self):
    # Get CPU and GPU usage percentages
    self.cpu_usage_percent = psutil.cpu_percent()
    self.gpu_usage_percent = nvml.nvmlDeviceGetUtilizationRates(
        self.selected_gpu).gpu

    # Create a deep copy of template to work on
    frame = deepcopy(self.template)

    # Draw progress bar for each usage
    def getUsageLineColor(usage):
        if usage < 30.:
            return self.GREEN_COLOR
        elif usage < 70.:
            return self.YELLOW_COLOR
        else:
            return self.RED_COLOR

    cpu_line_length_px = int(self.cpu_usage_percent / 100. * self.MAX_LINE_LENGTH_PX)
    gpu_line_length_px = int(self.gpu_usage_percent / 100. * self.MAX_LINE_LENGTH_PX)
    frame[1:(cpu_line_length_px + 1),
          0:self.LINE_WIDTH, :] = getUsageLineColor(self.cpu_usage_percent)
    frame[18:(gpu_line_length_px + 18),
          0:self.LINE_WIDTH, :] = getUsageLineColor(self.gpu_usage_percent)

    # Color CPU and GPU text according to temperature
    def getCPUGPUTextColor(temp):
        if temp < 45.:
            return self.GREEN_COLOR
        elif temp < 74.:
            return self.YELLOW_COLOR
        else:
            return self.RED_COLOR

    self.cpu_temperature = self._get_cpu_temperature()
    self.gpu_temperature = nvml.nvmlDeviceGetTemperature(
        self.selected_gpu, nvml.NVML_TEMPERATURE_GPU)
    IMAGE_CHANNELS_AXIS = -1  # Last axis
    frame[(frame == self.CPU_TEXT_COLOR_VALUE).all(
        axis=IMAGE_CHANNELS_AXIS)] = getCPUGPUTextColor(self.cpu_temperature)
    frame[(frame == self.GPU_TEXT_COLOR_VALUE).all(
        axis=IMAGE_CHANNELS_AXIS)] = getCPUGPUTextColor(self.gpu_temperature)
    return frame.flatten()
def get_gpu_info(redis_con: redis.Redis,
                 gpus_compat: List[int]) -> Tuple[List[dict], bool, bool]:
    """Returns information about GPUs

    :param redis_con: an instance of Redis connection
    :param gpus_compat: a list of GPUs that are compatible with TensorFlow
    :return: Tuple that contains
        [0] list of dictionaries with information about every GPU
        [1] boolean that states if the available GPUs for training changed
        [2] boolean that states if the available GPUs for inference changed
    """
    if not GPU_MODE:
        return [], False, False
    gpu_data = []
    gpu_ok_train = []
    gpu_ok_inf = []
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
        except pynvml.NVMLError:
            util = -1
        try:
            mem_use = pynvml.nvmlDeviceGetMemoryInfo(handle).used
        except pynvml.NVMLError:
            mem_use = -1
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
        except pynvml.NVMLError:
            temp = -1
        mem_tot = int(redis_con.lindex(RKEY_HWINFO_GPU_MEM_AVAIL, i))
        if i in gpus_compat and mem_tot - mem_use >= int(os.environ['TRAINING_GPU_MIN_MEM']):
            gpu_ok_train.append(i)
        if i in gpus_compat and mem_tot - mem_use >= int(os.environ['INFERENCE_GPU_MIN_MEM']):
            gpu_ok_inf.append(i)
        gpu_data.append({
            'name': redis_con.lindex(RKEY_HWINFO_GPU_NAME, i),
            'index': i,
            'tf_compatibility': i in gpus_compat,
            'utilization': util,
            'memory': {
                'total': mem_tot,
                'used': mem_use
            },
            'temperature': temp
        })
    gpus_train_change = db_update_available_gpus(
        redis_con, os.environ['REDIS_KEY_TRAINING_GPUS_OK'], gpu_ok_train)
    gpus_inf_change = db_update_available_gpus(
        redis_con, os.environ['REDIS_KEY_INFERENCE_GPUS_OK'], gpu_ok_inf)
    return gpu_data, gpus_train_change, gpus_inf_change
def run(self):
    while True:
        temp = str(pynvml.nvmlDeviceGetTemperature(self.handle,
                                                   pynvml.NVML_TEMPERATURE_GPU))
        name = str(pynvml.nvmlDeviceGetName(self.handle))
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(self.handle)
        total = str(round(meminfo.total / 1024**3, 2))
        used = str(round(meminfo.used / 1024**3, 2))
        free = str(round(meminfo.free / 1024**3, 2))
        data = [{
            'temp': temp,
            'name': name,
            'total': total,
            'used': used,
            'free': free
        }]
        self.update_data.emit(data)
        time.sleep(1)
def get_gpu_stat(handle):
    ret = {}
    # get temperature
    try:
        ret['temp'] = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except:
        ret['temp'] = None
    # get power usage
    try:
        ret['watt'] = N.nvmlDeviceGetPowerUsage(handle) / 1000
    except:
        ret['watt'] = None
    ret['fan'] = 0
    # return information gathered
    # print("temp: {0}, watt: {1}".format(ret['temp'], ret['watt']))
    return ret
def collect_via_pynvml(self, stats_config):
    """
    Use pynvml python binding to collect metrics
    :param stats_config:
    :return:
    """
    try:
        NVML_TEMPERATURE_GPU = 0
        pynvml.nvmlInit()
        device_count = pynvml.nvmlDeviceGetCount()
        for device_index in xrange(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
            memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)
            metrics = {
                'memory.total': memoryInfo.total / 1024 / 1024,
                'memory.used': memoryInfo.used / 1024 / 1024,
                'memory.free': memoryInfo.free / 1024 / 1024,
                'utilization.gpu': utilizationRates.gpu,
                'utilization.memory': utilizationRates.memory,
                'temperature.gpu':
                    pynvml.nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
            }
            for stat_name in stats_config[1:]:
                metric = metrics.get(stat_name)
                if metric:
                    metric_name = 'gpu_{index}.{stat_name}'.format(
                        index=str(device_index), stat_name=stat_name)
                    self.publish(metric_name, metric)
    finally:
        pynvml.nvmlShutdown()
def check(self, instance):
    try:
        pynvml.nvmlInit()
        deviceCount = pynvml.nvmlDeviceGetCount()
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            # utilization info
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            # memory info
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            # temperature info
            self.gauge('nvml.temp.', temp, tags=d_tags)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = psutil.Process(ps.pid).name()
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory',
                           ps.usedGpuMemory, tags=p_tags)
        status = AgentCheck.OK
        msg = u'Ok'
    except:
        status = AgentCheck.CRITICAL
        msg = u'Error'
    finally:
        pynvml.nvmlShutdown()
        self.service_check('nvml.check', status, message=msg)
def GpuInfo(self):
    """
    Get GPU usage information
    :return: list of per-GPU usage dicts
    """
    gpuInfoList = list()
    for gpuIndex, gpuHandle in enumerate(self.__gpuHandleList):
        gpuPercent = pynvml.nvmlDeviceGetUtilizationRates(gpuHandle).gpu
        memTotal = format(
            pynvml.nvmlDeviceGetMemoryInfo(gpuHandle).total / self.MB, ".2f")
        memUsed = format(
            pynvml.nvmlDeviceGetMemoryInfo(gpuHandle).used / self.MB, ".2f")
        temp = pynvml.nvmlDeviceGetTemperature(gpuHandle,
                                               pynvml.NVML_TEMPERATURE_GPU)
        gpuInfoList.append(
            dict(Index=gpuIndex, Percent=gpuPercent, TotalMem=memTotal,
                 UsedMem=memUsed, Temp=temp))
    return gpuInfoList
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported

    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000 if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
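# Hedged sketch (not part of the original): get_gpu_info() above leans on a
# few module-level names (pynvml imported as N, psutil, os, an MB constant and
# a _decode helper). A minimal set of definitions could look like this.
import os
import psutil
import pynvml as N

MB = 1024 * 1024  # bytes-to-MiB divisor assumed by the snippet


def _decode(b):
    # some pynvml versions return bytes; normalise to str
    return b.decode() if isinstance(b, bytes) else b


if __name__ == '__main__':
    N.nvmlInit()
    try:
        print(get_gpu_info(N.nvmlDeviceGetHandleByIndex(0)))
    finally:
        N.nvmlShutdown()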
def get_gpu_pid_info():
    """Retrieves the process IDs of processes running on the GPU."""
    gpus = []
    device_count = -1
    try:
        nvmlInit()
        device_count = nvmlDeviceGetCount()
        gpus = [{}] * device_count
        for i in range(device_count):
            gpus[i] = {'id': i}
            handle = nvmlDeviceGetHandleByIndex(i)
            device_name = nvmlDeviceGetName(handle)
            gpus[i]['name'] = device_name
            try:
                util = nvmlDeviceGetUtilizationRates(handle)
                gpus[i]['utilization'] = util.gpu
            except NVMLError as err:
                print(f'Error while reading GPU utilization for GPU {i}: {err}',
                      file=sys.stderr)
            try:
                mem_info = nvmlDeviceGetMemoryInfo(handle)
                gpus[i]['mem_total'] = mem_info.total
                gpus[i]['mem_used'] = mem_info.used
            except NVMLError as err:
                print(f'Error while reading memory utilization for GPU {i}: {err}',
                      file=sys.stderr)
            try:
                fan_speed = nvmlDeviceGetFanSpeed(handle)
                gpus[i]['fan_speed'] = fan_speed
            except NVMLError as err:
                print(f'Error while reading fan speed for GPU {i}: {err}',
                      file=sys.stderr)
            try:
                temp = nvmlDeviceGetTemperature(handle, 0)
                gpus[i]['temp'] = temp
            except NVMLError as err:
                print(f'Error while reading temperature for GPU {i}: {err}',
                      file=sys.stderr)
            try:
                power_usage = nvmlDeviceGetPowerUsage(handle)
                gpus[i]['power_usage'] = round(power_usage / 1000.)
            except NVMLError as err:
                print(f'Error while reading power usage for GPU {i}: {err}',
                      file=sys.stderr)
            try:
                power_limit = nvmlDeviceGetEnforcedPowerLimit(handle)
                gpus[i]['power_limit'] = round(power_limit / 1000.)
            except NVMLError as err:
                print(f'Error while reading power limit for GPU {i}: {err}',
                      file=sys.stderr)
            gpus[i]['processes'] = []
            try:
                processes = nvmlDeviceGetComputeRunningProcesses(handle)
                for process in processes:
                    process_name = nvmlSystemGetProcessName(process.pid).decode()
                    gpus[i]['processes'].append({'pid': process.pid,
                                                 'name': process_name})
            except NVMLError as err:
                print(f'Error while reading processes for GPU {i}: {err}',
                      file=sys.stderr)
    except NVMLError as err:
        print(f'Error while reading GPU information: {err}', file=sys.stderr)
    nvmlShutdown()
    return gpus, device_count
def _get_full_status_nvml():
    devices_status = []
    devices_full_status = []
    for handle in _static_info['private']['gpu']['handles']:
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        process_info = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        devices_status.append({
            'utilization': {
                'gpu': util.gpu,
                'memory': util.memory
            },
            'memory': {
                'percent': int(1000.0 * mem_info.used / mem_info.total) / 10.0
            },
            'processes': len(process_info)
        })
        with _process_info_lock:
            process_list = []
            for p in process_info:
                info = _process_info[p.pid]
                info['gpu_memory'] = p.usedGpuMemory
                process_list.append(info)
        process_list.sort(key=lambda i: i['gpu_memory'] or 0, reverse=True)
        full_status = {
            'memory': {
                'free': mem_info.free,
                'used': mem_info.used
            },
            'process_list': process_list
        }
        try:
            full_status['fan_speed'] = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['temperature'] = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['performance'] = pynvml.nvmlDeviceGetPerformanceState(handle)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['power'] = {
                'usage': pynvml.nvmlDeviceGetPowerUsage(handle),
                'limit': pynvml.nvmlDeviceGetPowerManagementLimit(handle)
            }
        except pynvml.NVMLError_NotSupported:
            pass
        devices_full_status.append(full_status)
    status = {
        'basic': {
            'devices': devices_status
        },
        'full': {
            'devices': devices_full_status
        }
    }
    return status
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    gpus_in_use = 0
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except:
        deviceCount = 0
    for device_id in xrange(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle,
                                                   pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            gpus_in_use += 1 if util_rate.memory > 50.0 else 0
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.debug('nvml.util.encoder %s' % long(util_encoder[0]))
            self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.debug('nvml.util.decoder %s' % long(util_decoder[0]))
            self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            self.gauge('nvml.process.count', len(cps), d_tags)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['pname'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                p_tags['puser'] = self.get_process_owner(ps.pid)
                docker_name, docker_image = self.get_container_name(ps.pid)
                p_tags['docker_image'] = docker_image
                p_tags['docker_name'] = docker_name
                p_tags = self._dict2list(p_tags)
                print p_tags
                self.gauge('nvml.process.used_gpu_memory',
                           ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
    self.gauge('nvml.gpus_in_use_count', gpus_in_use)
    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'
    pynvml.nvmlShutdown()
    self.service_check('nvml.check', status, message=msg)
def do_GET(self):
    # checks if the server is alive
    if self.path == '/test':
        send_header(self)
        self.wfile.write(bytes('passed<br>', 'utf-8'))
        self.wfile.write(bytes('server is responding', 'utf-8'))
    # returns the running processes
    if self.path == '/runningProcesses':
        send_header(self)
        # send response:
        if modules['psutil']:
            for proc in psutil.process_iter():
                try:
                    pinfo = proc.as_dict(attrs=['pid', 'name'])
                except psutil.NoSuchProcess:
                    pass
                print(pinfo)
                self.wfile.write(bytes(str(pinfo), 'utf-8'))
        else:
            self.wfile.write(bytes('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8'))
    # returns the CPU utilization and number of cores
    elif self.path == '/cpuInfo':
        send_header(self)
        # get CPU info
        cpuInfo = {}
        if modules['psutil']:
            cpuInfo['CPU Utilization'] = int(psutil.cpu_percent())
            cpuInfo['CPU Cores'] = int(psutil.cpu_count())
        else:
            cpuInfo['Missing Python module'] = 'I am sorry but the Python module psutil is not installed. Therefore the number of CPU cores cannot be shown.'
        json_dump = json.dumps(cpuInfo)
        self.wfile.write(bytes(json_dump, 'utf-8'))
        # get GPU info
        if modules['pynvml']:
            try:
                pynvml.nvmlInit()
                gpus = pynvml.nvmlDeviceGetCount()
            except:
                gpus = 0
                self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8'))
        else:
            gpus = 0
            self.wfile.write(bytes('I am sorry but the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8'))
        for i in range(gpus):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            self.wfile.write(bytes("<br>GPU " + str(i + 1) + ": " + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8'))
            try:
                self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, 0)) + '°C', 'utf-8'))
            except:
                self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8'))
            try:
                gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8'))
                self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free / gpu_mem.total * 100)) + '%', 'utf-8'))
            except:
                self.wfile.write(bytes('<br>Could not retrieve memory information', 'utf-8'))
        if gpus > 0:
            try:
                pynvml.nvmlShutdown()
            except:
                pass
    elif self.path == '/availableComputers':
        send_header(self)
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('google.com', 0))
        global myownsocket
        myownsocket = s.getsockname()[0]
        port = 8003
        available_computers = []
        for i in range(1, 256):
            host = '192.168.178.' + str(i)
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.settimeout(0.2)
            try:
                alive = sock.connect_ex((host, port))
            except:
                alive = -1
            if alive == 0:
                print('available')
                available_computers.append(host)
            else:
                print('not available')
            print(host)
        self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8'))
        cmd_txt = """@echo off
call "C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat"
echo ##### start_rendering
xsibatch -render "Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn" -frames #1#-#2# -pass "BEAUTY" -skip on -verbose on
echo ##### rendering_done
"""
        self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8'))
        self.wfile.write(bytes('<table border="1">\n', 'utf-8'))
        self.wfile.write(bytes('<tr>\n', 'utf-8'))
        self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8'))
        available_cpus = {}
        for host in available_computers:
            available_cpus[host] = abs(get_cpu_cores(host))
        total_cpus = sum(available_cpus.values())
        frame_list = {}
        start_frame = 0
        for host in available_computers:
            start_frame += 1
            frame_list[host] = [start_frame]
            start_frame = start_frame + int(100 * (available_cpus[host] / total_cpus))
            if start_frame > 100:
                start_frame = 100
            frame_list[host].append(start_frame)
        index = 0
        for host in available_computers:
            index += 1
            self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
            self.wfile.write(bytes(host, 'utf-8'))
            self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
            self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
            self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
            self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
            self.wfile.write(bytes('</tr>', 'utf-8'))
        index = 2
        self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
        self.wfile.write(bytes(host, 'utf-8'))
        self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
        self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
        self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
        self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
        self.wfile.write(bytes('</tr>', 'utf-8'))
        self.wfile.write(bytes('</table>\n', 'utf-8'))
        self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8'))
        self.wfile.write(bytes('</form>\n', 'utf-8'))
        self.wfile.write(bytes('</body>\n', 'utf-8'))
        self.wfile.write(bytes('</html>\n', 'utf-8'))
    elif self.path == '/execute_job':
        send_header(self)
        parsed = urlparse(self.path)
        parameters = parse_qs(parsed.query)
    elif '/submit_job' in self.path:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        parsed = urlparse(self.path)
        parameters = parse_qs(parsed.query)
        # print(parsed)
        print(parameters)
        self.wfile.write(bytes('<body>', 'utf-8'))
        for index in range(1, 100):
            if not parameters.get('host' + str(index)).strip():
                pass
            elif not parameters.get('start' + str(index)).strip():
                pass
            elif not parameters.get('end' + str(index)).strip():
                pass
            elif parameters.get('command'):
                cmd_txt = parameters['command'][0].replace('#1#', parameters['start' + str(index)][0].strip())
                cmd_txt = cmd_txt.replace('#2#', parameters['end' + str(index)][0].strip())
                self.wfile.write(bytes(escape(cmd_txt), 'utf-8'))
                self.wfile.write(bytes('<br>', 'utf-8'))
                print(cmd_txt)
        self.wfile.write(bytes('</body></html>', 'utf-8'))
    elif '/shutdown' in self.path:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        self.wfile.write(bytes("Server will be shut down now......", 'utf-8'))
        server.shutdown()
        sys.exit()
    else:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        self.wfile.write(bytes("<br>", 'utf-8'))
        self.wfile.write(bytes(self.path, 'utf-8'))
        print(self.path)
def step(self):
    valuesDict = {}
    valuesDict['table'] = self._tableName
    cpu = valuesDict['cpu'] = psutil.cpu_percent(interval=0)
    mem = valuesDict['mem'] = psutil.virtual_memory().percent
    swap = valuesDict['swap'] = psutil.swap_memory().percent
    # some code examples:
    # https://github.com/ngi644/datadog_nvml/blob/master/nvml.py
    if self.doGpu:
        for i in self.gpusToUse:
            try:
                handle = nvmlDeviceGetHandleByIndex(i)
                memInfo = nvmlDeviceGetMemoryInfo(handle)
                valuesDict["gpuMem_%d" % i] = \
                    float(memInfo.used) * 100. / float(memInfo.total)
                util = nvmlDeviceGetUtilizationRates(handle)
                valuesDict["gpuUse_%d" % i] = util.gpu
                temp = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
                valuesDict["gpuTem_%d" % i] = temp
            except NVMLError as err:
                handle = nvmlDeviceGetHandleByIndex(i)
                msg = "Device %d -> %s not supported\n" \
                      "Remove device %d from FORM" % \
                      (i, nvmlDeviceGetName(handle), i)
                errorWindow(None, msg)
    if self.doNetwork:
        try:
            # measure a short interval
            pnic_before = psutil.net_io_counters(pernic=True)[self.nif]
            time.sleep(self.samplingTime)  # sec
            pnic_after = psutil.net_io_counters(pernic=True)[self.nif]
            bytes_sent = pnic_after.bytes_sent - pnic_before.bytes_sent
            bytes_recv = pnic_after.bytes_recv - pnic_before.bytes_recv
            valuesDict["%s_send" % self.nif] = \
                bytes_sent * self.samplingTime / 1048576
            valuesDict["%s_recv" % self.nif] = \
                bytes_recv * self.samplingTime / 1048576
        except:
            msg = "cannot get information of network interface %s" % self.nif
    if self.doDiskIO:
        try:
            # measure a short interval
            disk_before = psutil.disk_io_counters(perdisk=False)
            time.sleep(self.samplingTime)  # sec
            disk_after = psutil.disk_io_counters(perdisk=False)
            bytes_read = disk_after.read_bytes - disk_before.read_bytes
            bytes_write = disk_after.write_bytes - disk_before.write_bytes
            valuesDict["disk_read"] = self.samplingTime * bytes_read / self.mega
            valuesDict["disk_write"] = self.samplingTime * bytes_write / self.mega
        except:
            msg = "cannot get information of disk usage "
    if self.cpuAlert < 100 and cpu > self.cpuAlert:
        self.warning("CPU allocation =%f." % cpu)
        self.cpuAlert = cpu
    if self.memAlert < 100 and mem > self.memAlert:
        self.warning("Memory allocation =%f." % mem)
        self.memAlert = mem
    if self.swapAlert < 100 and swap > self.swapAlert:
        self.warning("SWAP allocation =%f." % swap)
        self.swapAlert = swap
    sqlCommand = "INSERT INTO %(table)s ("
    for label in self.labelList:
        sqlCommand += "%s, " % label
    # remove last comma
    sqlCommand = sqlCommand[:-2]
    sqlCommand += ") VALUES("
    for label in self.labelList:
        sqlCommand += "%" + "(%s)f, " % label
    # remove last comma
    sqlCommand = sqlCommand[:-2]
    sqlCommand += ");"
    sql = sqlCommand % valuesDict
    try:
        self.cur.execute(sql)
    except Exception as e:
        print("ERROR: saving one data point (monitor). I continue")
    # Return finished = True if all protocols have finished
    finished = []
    for prot in self.protocols:
        updatedProt = getUpdatedProtocol(prot)
        finished.append(updatedProt.getStatus() != STATUS_RUNNING)
    return all(finished)
def info_refresh(self):
    try:
        stat = open("/proc/stat")
        self.statlines = stat.read().splitlines()[1:-1]
        stat.close()
    except IOError:
        print("Problem opening /proc/stat, exiting..")
        pynvml.nvmlShutdown()
        quit()

    for i in range(self.corecount):
        for j in self.statlines[i].split()[1:]:  # remove cpu#
            self.total[i] += int(j)
        self.idle[i] = int(self.statlines[i].split()[4])

    for i in range(self.corecount):
        if (self.total[i] - self.prev_total[i]) == 0:
            self.prev_idle[i] = self.idle[i]
            self.prev_total[i] = self.total[i]
            break
        self.cpu_prog_bars[i].set_fraction(
            1 - ((self.idle[i] - self.prev_idle[i]) /
                 (self.total[i] - self.prev_total[i])))
        self.prev_idle[i] = self.idle[i]
        self.prev_total[i] = self.total[i]
        self.idle[i] = 0
        self.total[i] = 0

    for i in range(self.deviceCount):
        util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handles[i])
        temp = pynvml.nvmlDeviceGetTemperature(self.gpu_handles[i],
                                               pynvml.NVML_TEMPERATURE_GPU)
        memInfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handles[i])
        (encoder_util, sPeriod) = pynvml.nvmlDeviceGetEncoderUtilization(self.gpu_handles[i])
        (decoder_util, sPeriod) = pynvml.nvmlDeviceGetDecoderUtilization(self.gpu_handles[i])
        mem_total = memInfo.total / 1024 / 1024
        mem_used = memInfo.used / 1024 / 1024
        self.gpu_prog_bars[i*6].set_text("GPU: %d%%" % util.gpu)
        self.gpu_prog_bars[i*6].set_fraction(util.gpu / 100)
        ########
        self.util_history.append(util.gpu)
        self.util_graph.queue_draw()
        self.temp_history.append(temp)
        self.temp_graph.queue_draw()
        ########
        self.gpu_prog_bars[i*6 + 1].set_text("Memory Utilization: %d%%" % util.memory)
        self.gpu_prog_bars[i*6 + 1].set_fraction(util.memory / 100)
        self.gpu_prog_bars[i*6 + 4].set_text("Encoder: %d%%" % encoder_util)
        self.gpu_prog_bars[i*6 + 5].set_text("Decoder: %d%%" % decoder_util)
        self.gpu_prog_bars[i*6 + 4].set_fraction(encoder_util / 100)
        self.gpu_prog_bars[i*6 + 5].set_fraction(decoder_util / 100)
        self.gpu_prog_bars[i*6 + 2].set_text("Memory Usage: %d MiB/%d MiB" % (mem_used, mem_total))
        self.gpu_prog_bars[i*6 + 2].set_fraction(mem_used / mem_total)
        self.gpu_prog_bars[i*6 + 3].set_text("Temperature: %d °C" % temp)
        if temp > 100:
            temp = 100
        elif temp < 0:
            temp = 0
        self.gpu_prog_bars[i*6 + 3].set_fraction(temp / 100)

    # --proc--
    procs = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.gpu_handles[0])
    proc_liststore = Gtk.ListStore(int, str, int)
    for p in procs:
        pid = p.pid
        try:
            path = pynvml.nvmlSystemGetProcessName(p.pid).decode('utf-8')
        except:
            self.exit()
        if p.usedGpuMemory == None:
            mem = 0
        else:
            mem = p.usedGpuMemory / 1024 / 1024
        proc_liststore.append([pid, path, mem])
    self.tree.set_model(proc_liststore)
    return True
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except:
        deviceCount = 0
    for device_id in xrange(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle,
                                                   pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.info('nvml.util.encoder %s' % long(util_encoder[0]))
            self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.info('nvml.util.decoder %s' % long(util_decoder[0]))
            self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = psutil.Process(ps.pid).name()
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory',
                           ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'
    pynvml.nvmlShutdown()
    self.service_check('nvml.check', status, message=msg)
def _get_data(self):
    data = {}

    if self.deviceCount:
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            brand = pynvml.nvmlDeviceGetBrand(handle)

            ### Get data ###
            ## Memory usage
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            except Exception as e:
                self.debug(str(e))
                mem = None
            ## ECC errors
            try:
                _memError = {}
                _eccCounter = {}
                eccErrors = {}
                eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
                memErrorType = ['ERROR_TYPE_CORRECTED', 'ERROR_TYPE_UNCORRECTED']
                memoryLocationType = ['L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY',
                                      'REGISTER_FILE', 'TEXTURE_MEMORY']
                for memoryLocation in range(5):
                    for eccCounter in range(2):
                        for memError in range(2):
                            _memError[memErrorType[memError]] = \
                                pynvml.nvmlDeviceGetMemoryErrorCounter(
                                    handle, memError, eccCounter, memoryLocation)
                        _eccCounter[eccCounterType[eccCounter]] = _memError
                    eccErrors[memoryLocationType[memoryLocation]] = _eccCounter
            except Exception as e:
                self.debug(str(e))
                eccErrors = None
            ## Temperature
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            except Exception as e:
                self.debug(str(e))
                temp = None
            ## Fan
            try:
                fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
            except Exception as e:
                self.debug(str(e))
                fanspeed = None
            ## GPU and Memory Utilization
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                gpu_util = util.gpu
                mem_util = util.memory
            except Exception as e:
                self.debug(str(e))
                gpu_util = None
                mem_util = None
            ## Encoder Utilization
            try:
                encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                enc_util = encoder[0]
            except Exception as e:
                self.debug(str(e))
                enc_util = None
            ## Decoder Utilization
            try:
                decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                dec_util = decoder[0]
            except Exception as e:
                self.debug(str(e))
                dec_util = None
            ## Clock frequencies
            try:
                clock_core = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS)
                clock_sm = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM)
                clock_mem = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor
            except Exception as e:
                self.debug(str(e))
                clock_core = None
                clock_sm = None
                clock_mem = None

            ### Packing data ###
            self.debug("Device", gpuIdx, ":", str(name))
            data["device_name_" + gpuIdx] = name
            self.debug("Brand:", str(brand))
            self.debug(str(name), "Temp :", str(temp))
            data["device_temp_" + gpuIdx] = temp
            self.debug(str(name), "Mem total :", str(mem.total), 'bytes')
            data["device_mem_total_" + gpuIdx] = mem.total
            self.debug(str(name), "Mem used :", str(mem.used), 'bytes')
            data["device_mem_used_" + gpuIdx] = mem.used
            self.debug(str(name), "Mem free :", str(mem.free), 'bytes')
            data["device_mem_free_" + gpuIdx] = mem.free
            self.debug(str(name), "Load GPU :", str(gpu_util), '%')
            data["device_load_gpu_" + gpuIdx] = gpu_util
            self.debug(str(name), "Load MEM :", str(mem_util), '%')
            data["device_load_mem_" + gpuIdx] = mem_util
            self.debug(str(name), "Load ENC :", str(enc_util), '%')
            data["device_load_enc_" + gpuIdx] = enc_util
            self.debug(str(name), "Load DEC :", str(dec_util), '%')
            data["device_load_dec_" + gpuIdx] = dec_util
            self.debug(str(name), "Core clock:", str(clock_core), 'MHz')
            data["device_core_clock_" + gpuIdx] = clock_core
            self.debug(str(name), "SM clock :", str(clock_sm), 'MHz')
            data["device_sm_clock_" + gpuIdx] = clock_sm
            self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz')
            data["device_mem_clock_" + gpuIdx] = clock_mem
            self.debug(str(name), "Fan speed :", str(fanspeed), '%')
            data["device_fanspeed_" + gpuIdx] = fanspeed
            self.debug(str(name), "ECC errors:", str(eccErrors))
            if eccErrors is not None:
                data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = \
                    eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_L1_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = \
                    eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_L1_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = \
                    eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_L1_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = \
                    eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_L2_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = \
                    eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_L2_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = \
                    eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_L2_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = \
                    eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_L2_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = \
                    eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = \
                    eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = \
                    eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = \
                    eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = \
                    eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_REGISTER_FILE_VOLATILE_CORRECTED_" + gpuIdx] = \
                    eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_REGISTER_FILE_VOLATILE_UNCORRECTED_" + gpuIdx] = \
                    eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_REGISTER_FILE_AGGREGATE_CORRECTED_" + gpuIdx] = \
                    eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_REGISTER_FILE_AGGREGATE_UNCORRECTED_" + gpuIdx] = \
                    eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = \
                    eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = \
                    eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = \
                    eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = \
                    eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
            else:
                data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = None

    ## Get unit (S-class Nvidia cards) data
    if self.unitCount:
        for i in range(self.unitCount):
            gpuIdx = str(i)
            handle = pynvml.nvmlUnitGetHandleByIndex(i)

            try:
                fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
                fan_speed = fan.speed  # Fan speed (RPM)
                fan_state = fan.state  # Flag that indicates whether fan is working properly
            except Exception as e:
                self.debug(str(e))
                fan_speed = None
                fan_state = None
            try:
                psu = pynvml.nvmlUnitGetPsuInfo(handle)
                psu_current = psu.current  # PSU current (A)
                psu_power = psu.power      # PSU power draw (W)
                psu_state = psu.state      # The power supply state
                psu_voltage = psu.voltage  # PSU voltage (V)
            except Exception as e:
                self.debug(str(e))
                psu_current = None
                psu_power = None
                psu_state = None
                psu_voltage = None
            try:
                temp_intake = pynvml.nvmlUnitGetTemperature(handle, 0)   # Temperature at intake in C
                temp_exhaust = pynvml.nvmlUnitGetTemperature(handle, 1)  # Temperature at exhaust in C
                temp_board = pynvml.nvmlUnitGetTemperature(handle, 2)    # Temperature on board in C
            except Exception as e:
                self.debug(str(e))
                temp_intake = None
                temp_exhaust = None
                temp_board = None

            self.debug('Unit fan speed:', str(fan_speed))
            data["unit_fan_speed_" + gpuIdx] = fan_speed
            self.debug('Unit fan state:', str(fan_state))
            data["unit_fan_state_" + gpuIdx] = fan_state
            self.debug('Unit PSU current:', str(psu_current))
            data["unit_psu_current_" + gpuIdx] = psu_current
            self.debug('Unit PSU power:', str(psu_power))
            data["unit_psu_power_" + gpuIdx] = psu_power
            self.debug('Unit PSU state:', str(psu_state))
            data["unit_psu_state_" + gpuIdx] = psu_state
            self.debug('Unit PSU voltage:', str(psu_voltage))
            data["unit_psu_voltage_" + gpuIdx] = psu_voltage
            self.debug('Unit temp intake:', str(temp_intake))
            data["unit_temp_intake_" + gpuIdx] = temp_intake
            self.debug('Unit temp exhaust:', str(temp_exhaust))
            data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust
            self.debug('Unit temp board:', str(temp_board))
            data["unit_temp_board_" + gpuIdx] = temp_board

    ## Get data via legacy mode
    if self.legacy:
        try:
            output, error = Popen(
                ["nvidia-settings", "-c", ":0",
                 "-q", "GPUUtilization",
                 "-q", "GPUCurrentClockFreqs",
                 "-q", "GPUCoreTemp",
                 "-q", "TotalDedicatedGPUMemory",
                 "-q", "UsedDedicatedGPUMemory"],
                shell=False, stdout=PIPE, stderr=PIPE).communicate()
            output = repr(str(output))
            if len(output) < 800:
                raise Exception('Error in fetching data from nvidia-settings ' + output)
            self.debug(str(error), output)
        except Exception as e:
            self.error(str(e))
            self.error('Setting legacy mode to False')
            self.legacy = False
            return data
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            if data["device_temp_" + gpuIdx] is None:
                coreTemp = findall('GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)', output)[i][1]
                try:
                    data["device_temp_" + gpuIdx] = int(coreTemp)
                    self.debug('Using legacy temp for GPU {0}: {1}'.format(gpuIdx, coreTemp))
                except Exception as e:
                    self.debug(str(e), "skipping device_temp_" + gpuIdx)
            if data["device_mem_used_" + gpuIdx] is None:
                memUsed = findall('UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)', output)[i][1]
                try:
                    data["device_mem_used_" + gpuIdx] = int(memUsed)
                    self.debug('Using legacy mem_used for GPU {0}: {1}'.format(gpuIdx, memUsed))
                except Exception as e:
                    self.debug(str(e), "skipping device_mem_used_" + gpuIdx)
            if data["device_load_gpu_" + gpuIdx] is None:
                gpu_util = findall('(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][1]
                try:
                    data["device_load_gpu_" + gpuIdx] = int(gpu_util)
                    self.debug('Using legacy load_gpu for GPU {0}: {1}'.format(gpuIdx, gpu_util))
                except Exception as e:
                    self.debug(str(e), "skipping device_load_gpu_" + gpuIdx)
            if data["device_load_mem_" + gpuIdx] is None:
                mem_util = findall('(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][2]
                try:
                    data["device_load_mem_" + gpuIdx] = int(mem_util)
                    self.debug('Using legacy load_mem for GPU {0}: {1}'.format(gpuIdx, mem_util))
                except Exception as e:
                    self.debug(str(e), "skipping device_load_mem_" + gpuIdx)
            if data["device_core_clock_" + gpuIdx] is None:
                clock_core = findall('GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', output)[i][1]
                try:
                    data["device_core_clock_" + gpuIdx] = int(clock_core)
                    self.debug('Using legacy core_clock for GPU {0}: {1}'.format(gpuIdx, clock_core))
                except Exception as e:
                    self.debug(str(e), "skipping device_core_clock_" + gpuIdx)
            if data["device_mem_clock_" + gpuIdx] is None:
                clock_mem = findall('GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', output)[i][2]
                try:
                    data["device_mem_clock_" + gpuIdx] = int(clock_mem)
                    self.debug('Using legacy mem_clock for GPU {0}: {1}'.format(gpuIdx, clock_mem))
                except Exception as e:
                    self.debug(str(e), "skipping device_mem_clock_" + gpuIdx)

    return data
def gpu_temp(handle=None, deviceID=0, cmap='cool', **kwargs):
    temp = nvmlDeviceGetTemperature(handle, deviceID)
    norm = min(max(temp - 30., 0.), 25.) / 25.
    return plt.get_cmap(cmap)(norm, bytes=True)[:3]
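# Hedged usage sketch (not from the original): gpu_temp() above maps a GPU
# temperature onto an RGB triple via a matplotlib colormap by normalising the
# 30-55 °C range to 0-1. The `deviceID` argument is passed to NVML as the
# temperature-sensor index (0 == NVML_TEMPERATURE_GPU); matplotlib and pynvml
# imports are assumed to exist in the snippet's module.
from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex

nvmlInit()
try:
    rgb = gpu_temp(handle=nvmlDeviceGetHandleByIndex(0), deviceID=0)
    print(rgb)  # an (R, G, B) tuple of uint8 values
finally:
    nvmlShutdown()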