def __check_gpu(self):
    """Check if the process list contains GPU processes and determine
    if GPUs exist. Add GPU processes to the processes list if required."""
    if not self.exp.meta_data.plugin_list._contains_gpu_processes():
        return

    try:
        import pynvml as pv
    except ImportError:
        logging.debug("pyNVML module not found")
        raise Exception("pyNVML module not found")

    try:
        pv.nvmlInit()
        count = int(pv.nvmlDeviceGetCount())
        logging.debug("%s GPUs have been found.", count)

        if not self.exp.meta_data.get('test_state'):
            for i in range(count):
                handle = pv.nvmlDeviceGetHandleByIndex(i)
                if pv.nvmlDeviceGetComputeRunningProcesses(handle):
                    raise Exception("Unfortunately, GPU %i is busy. Try "
                                    "resubmitting the job to the queue." % i)
    except Exception as e:
        raise Exception("Unable to run GPU plugins: %s" % str(e))

    self.__set_gpu_processes(count)
def _get_container_id(self, gpuhandle):
    cont_ids = []
    pids = []
    try:
        proc_objs = pynvml.nvmlDeviceGetComputeRunningProcesses(gpuhandle)
        if not proc_objs:
            return ['NA.NA']
        for proc_obj in proc_objs:
            pids.append(proc_obj.pid)
        for pid in pids:
            cont_ids.append(self._get_containerid_from_pid(pid))
        return cont_ids
    except pynvml.NVMLError as err:
        logger.debug('Failed to get pid on gpu: %s', err)
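# The _get_containerid_from_pid helper called above is not part of this
# snippet. A minimal sketch of one plausible implementation, assuming a
# cgroup-v1 layout where a docker container id appears in /proc/<pid>/cgroup;
# the parsing below is illustrative, not the original author's code.
def _get_containerid_from_pid(self, pid):
    try:
        with open('/proc/{}/cgroup'.format(pid)) as f:
            for line in f:
                # typical line: "12:devices:/docker/<64-hex-container-id>"
                parts = line.strip().split('/')
                if len(parts) > 1 and parts[-1]:
                    return parts[-1][:12]  # short container id
    except (IOError, OSError):
        pass
    return 'NA.NA'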
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
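# Both get_gpu_info variants above rely on a module-level preamble that is
# not shown here. A minimal sketch of the assumed names, inferred from how
# they are used (N is the pynvml module, MB converts bytes to MBytes, and
# _decode normalises the bytes-vs-str return types of NVML queries):
import os
import psutil
import pynvml as N

MB = 1024 * 1024

def _decode(b):
    if isinstance(b, bytes):
        return b.decode()  # pynvml may return bytes on Python 3
    return b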
def main():
    import pynvml
    pynvml.nvmlInit()
    # the argument passed here is the GPU id
    handle2 = pynvml.nvmlDeviceGetHandleByIndex(2)
    handle3 = pynvml.nvmlDeviceGetHandleByIndex(3)
    # meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    # print(meminfo.used)

    parser = argparse.ArgumentParser(
        description='simple 3D convolution for action recognition')
    parser.add_argument('--batch', type=int, default=128)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--videos', type=str, default='UCF101',
                        help='directory where videos are stored')
    parser.add_argument('--nclass', type=int, default=101)
    parser.add_argument('--output', type=str, required=True)
    parser.add_argument('--color', type=bool, default=False)
    parser.add_argument('--skip', type=bool, default=True)
    parser.add_argument('--depth', type=int, default=10)
    parser.add_argument('--dataset', type=str, default='ucf101')
    args = parser.parse_args()

    img_rows, img_cols, frames = 64, 64, args.depth
    channel = 3 if args.color else 1
    fname_npz = 'dataset_{}_{}_{}_{}.npz'.format(
        args.dataset, args.nclass, args.depth, args.skip)

    vid3d = videoto3d.Videoto3D(img_rows, img_cols, frames, args.dataset)
    nb_classes = args.nclass
    if os.path.exists(fname_npz):
        loadeddata = np.load(fname_npz)
        X, Y = loadeddata["X"], loadeddata["Y"]
    else:
        x, y = loaddata(args.videos, vid3d, args.nclass, args.output,
                        args.dataset, frames, args.color, args.skip)
        X = x.reshape((x.shape[0], img_rows, img_cols, frames, channel))
        Y = np_utils.to_categorical(y, nb_classes)
        X = X.astype('float32')
        np.savez(fname_npz, X=X, Y=Y)
        print('Saved dataset to dataset.npz.')
    print('X_shape:{}\nY_shape:{}'.format(X.shape, Y.shape))

    # Define model
    # conv3D + Relu + Conv3D + Softmax + Pooling3D + DropOut
    input_x = Input(shape=(img_rows, img_cols, frames, channel))

    # Earlier masking experiment, kept commented out:
    # maskLayer = Conv3D(64 * frames, kernel_size=(3, 3, 2), padding='same')(convLayer)
    # maskLayer = Lambda(mean_filter)(maskLayer)  # [None, 1, 64]; each point is a mask of an 8x8 input region
    # # maskLayer = BatchNormalization()(maskLayer)
    # maskLayer = Lambda(K.sigmoid)(maskLayer)
    # # maskLayer = ReLU()(maskLayer)
    # # maskLayer = Lambda(bi_trans, arguments={'th': 0.5})(maskLayer)
    # maskLayer = Reshape((8, 8, frames, 1))(maskLayer)
    # # maskLayer = Lambda(normalize)(maskLayer)
    # maskLayerForLoss = maskLayer
    # maskLayer = Lambda(repeat_filter, arguments={'rep': 8, 'axis': 1})(maskLayer)
    # maskLayer = Lambda(repeat_filter, arguments={'rep': 8, 'axis': 2})(maskLayer)
    # # maskLayer = Lambda(repeat_filter, arguments={'rep': frames, 'axis': 3})(maskLayer)
    # maskLayer = Lambda(repeat_filter, arguments={'rep': channel, 'axis': 4})(maskLayer)
    # convLayer = Multiply()([maskLayer, input_x])

    # C3D-conv1
    convLayer = Conv3D(32, kernel_size=(3, 3, 3), padding='same')(input_x)
    convLayer = ReLU()(convLayer)
    convLayer = Conv3D(32, kernel_size=(3, 3, 3), padding='same')(convLayer)
    convLayer = Softmax()(convLayer)
    convLayer = MaxPooling3D(pool_size=(3, 3, 3), padding='same')(convLayer)
    convLayer = Dropout(0.25)(convLayer)

    # C3D-conv2
    convLayer = Conv3D(64, kernel_size=(3, 3, 3), padding='same')(convLayer)
    convLayer = ReLU()(convLayer)
    convLayer = Conv3D(64, kernel_size=(3, 3, 3), padding='same')(convLayer)
    convLayer = Softmax()(convLayer)
    convLayer = MaxPooling3D(pool_size=(3, 3, 3), padding='same')(convLayer)
    convLayer = Dropout(0.25)(convLayer)

    fc1 = Flatten()(convLayer)
    fc = Dense(512, activation='sigmoid')(fc1)
    fc = Dropout(0.5)(fc)
    dense_out = Dense(nb_classes, activation='softmax')(fc)
    dense_out_converse = Dense(nb_classes)(fc)
    model = Model(input_x, [dense_out, dense_out_converse])

    # loss of 2 parts
    losses = {'dense_2': K.categorical_crossentropy, 'dense_3': unlikely_loss}
    lossWeights = {'dense_2': 1, 'dense_3': 1}
    model.compile(loss=losses, loss_weights=lossWeights,
                  optimizer=Adam(lr=0.001), metrics=['accuracy'])
    model.summary()
    plot_model(model, show_shapes=True,
               to_file=os.path.join(args.output, 'model.png'))

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.1, random_state=43)
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train, Y_train, test_size=0.1, random_state=43)

    # history = model.fit_generator(
    #     myGenerator(X_train, X_test, Y_train, Y_test, nb_classes, args.batch),
    #     samples_per_epoch=X_train.shape[0], epochs=args.epoch, verbose=1,
    #     callbacks=callbacks_list, shuffle=True)

    # Check GPU status: once a GPU is available, set the environment
    # variable and break; if none of the GPUs is free, sleep for 2 seconds
    # and retry.
    cnt = 0
    while True:
        cnt += 1
        processinfo = pynvml.nvmlDeviceGetComputeRunningProcesses(handle2)
        if len(processinfo) == 0:
            os.environ['CUDA_VISIBLE_DEVICES'] = '2'
            print('GPU 2 is available, use GPU 2\n')
            break
        processinfo = pynvml.nvmlDeviceGetComputeRunningProcesses(handle3)
        if len(processinfo) == 0:
            os.environ['CUDA_VISIBLE_DEVICES'] = '3'
            print('GPU 3 is available, use GPU 3\n')
            break
        sleep(2)
        print('\rretry time: {}'.format(cnt), end='')

    history = model.fit(X_train, [Y_train, Y_train],
                        validation_data=(X_val, [Y_val, Y_val]),
                        batch_size=args.batch,
                        epochs=args.epoch, verbose=1, shuffle=True)

    model_json = model.to_json()
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    with open(os.path.join(
            args.output,
            '{}_{}_{}_ucf101_3dcnnmodel.json'.format(
                current_time, nb_classes, args.depth)), 'w') as json_file:
        json_file.write(model_json)
    model.save_weights(os.path.join(
        args.output,
        '{}_{}_{}_ucf101_3dcnnmodel.hd5'.format(
            current_time, nb_classes, args.depth)))

    loss = model.evaluate(X_test, [Y_test, Y_test], verbose=0)
    print('Test loss:', loss)
    plot_history(history, args.output)
    save_history(history, args.output)
def _get_gpu_process(gpu_device):
    return len(nvmlDeviceGetComputeRunningProcesses(gpu_device))
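# A brief usage sketch for the helper above. It assumes the NVML functions
# were imported unqualified, as the bare nvmlDeviceGetComputeRunningProcesses
# call suggests; device index 0 is an arbitrary example.
from pynvml import (nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetComputeRunningProcesses)

nvmlInit()
try:
    handle = nvmlDeviceGetHandleByIndex(0)
    print('compute processes on GPU 0:', _get_gpu_process(handle))
finally:
    nvmlShutdown()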
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        deviceCount = 0
    for device_id in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.info('nvml.util.encoder %s' % int(util_encoder[0]))
            self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.info('nvml.util.decoder %s' % int(util_decoder[0]))
            self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = psutil.Process(ps.pid).name()
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory',
                           ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))

    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'
    pynvml.nvmlShutdown()

    self.service_check('nvml.check', status, message=msg)
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of a specific process"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
        process['command'] = os.path.basename(ps_process.cmdline()[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
        process['pid'] = nv_process.pid
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    processes = []
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
        'processes': processes,
    }
    return gpu_info
def get_infos():
    """Get all information about all your graphics cards.

    Returns:
        dict: The result is a dict with three keys: count, driver_version
            and devices:
            count: number of GPUs found
            driver_version: the version of the system's graphics driver
            devices: a list whose items are Device namedtuples with
                11 fields (for example id, name and fan_speed). Note that
                the process field is itself a list of Process namedtuples,
                each with 11 fields.
    """
    infos = {}
    Device = namedtuple(
        "Device",
        [
            "id", "name", "free", "used", "total", "gpu_util",
            "temperature", "fan_speed", "power_usage", "power_state",
            "process",
        ],
    )
    Process = namedtuple(
        "Process",
        [
            "pid", "memory_percent", "status", "username", "num_threads",
            "cpu_num", "cpu_percent", "name", "cmdline", "used_gpu_mem",
            "create_time",
        ],
    )
    driver_version = pynvml.nvmlSystemGetDriverVersion().decode()
    device_count = pynvml.nvmlDeviceGetCount()
    devices = []
    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle).decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        # Which processes are using the GPU
        processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        process_info = []
        for p in processes:
            # append a Process namedtuple to process_info
            pid = p.pid
            used_gpu_mem = p.usedGpuMemory
            p = psutil.Process(pid=pid)
            _ = p.cpu_percent()
            time.sleep(0.05)
            process_info.append(
                Process(
                    pid=pid,
                    memory_percent=p.memory_percent(),
                    status=p.status(),
                    username=p.username(),
                    num_threads=p.num_threads(),
                    cpu_num=p.cpu_num(),
                    cpu_percent=p.cpu_percent(),
                    name=p.name(),
                    cmdline=" ".join(p.cmdline()),
                    used_gpu_mem=used_gpu_mem,
                    create_time=p.create_time(),
                ))
        try:
            fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported:
            fan_speed = None
        power_usage = pynvml.nvmlDeviceGetPowerUsage(
            handle)  # power usage in milliwatts (mW)
        power_state = pynvml.nvmlDeviceGetPowerState(handle)
        temperature = pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU)
        # Volatile GPU-Util in the output of nvidia-smi
        gpu_util = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
        devices.append(
            Device(
                id=i,
                name=name,
                free=mem_info.free,
                used=mem_info.used,
                total=mem_info.total,
                gpu_util=gpu_util,
                temperature=temperature,
                fan_speed=fan_speed,
                power_usage=power_usage,
                power_state=power_state,
                process=process_info,
            ))
    infos["count"] = device_count
    infos["driver_version"] = driver_version
    infos["devices"] = devices
    return infos
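# A brief usage sketch for get_infos(), assuming NVML has been initialised
# beforehand; the field names follow the namedtuples defined above.
import pynvml

pynvml.nvmlInit()
try:
    infos = get_infos()
    print('driver:', infos['driver_version'])
    for dev in infos['devices']:
        print('GPU {}: {} ({} / {} bytes used, {}% util)'.format(
            dev.id, dev.name, dev.used, dev.total, dev.gpu_util))
finally:
    pynvml.nvmlShutdown()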
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if drivers are not TCC this will be None.
        usedmem = nv_process.usedGpuMemory // MB if \
            nv_process.usedGpuMemory else None
        process['gpu_memory_usage'] = usedmem
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
    except N.NVMLError:
        utilization_enc = None  # Not supported
    try:
        utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
    except N.NVMLError:
        utilization_dec = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'utilization.enc': utilization_enc[0] if utilization_enc else None,
        'utilization.dec': utilization_dec[0] if utilization_dec else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
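# GPUStatCollection.clean_processes, called at the end of get_gpu_info, is
# defined elsewhere in the collection class. A sketch consistent with how the
# global_processes cache is used above (drop cached psutil handles whose pids
# no longer exist); treat this as an assumption, not a verbatim copy:
@staticmethod
def clean_processes():
    for pid in list(GPUStatCollection.global_processes.keys()):
        if not psutil.pid_exists(pid):
            del GPUStatCollection.global_processes[pid]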
def _get_data(self):
    data = {}

    if self.deviceCount:
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            brand = pynvml.nvmlDeviceGetBrand(handle)
            brands = ['Unknown', 'Quadro', 'Tesla', 'NVS', 'Grid', 'GeForce']

            ### Get data ###
            ## Memory usage
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            except Exception as e:
                self.debug(str(e))
                mem = None
            ## Mapd occupancy
            try:
                mapd_occu = 0
                procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for p in procs:
                    pid = str(p.pid)
                    with open("/host/proc/" + pid + "/cmdline") as pid_file:
                        for line in pid_file:
                            if line.find("mapd_server") != -1:
                                mapd_occu = p.usedGpuMemory
            except Exception as e:
                mapd_occu = 0
            ## ECC errors
            try:
                eccErrors = {}
                eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
                memErrorType = ['ERROR_TYPE_CORRECTED',
                                'ERROR_TYPE_UNCORRECTED']
                memoryLocationType = ['L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY',
                                      'REGISTER_FILE', 'TEXTURE_MEMORY']
                # build fresh dicts per location/counter so that entries do
                # not alias each other
                for locIdx, location in enumerate(memoryLocationType):
                    _eccCounter = {}
                    for counterIdx, counter in enumerate(eccCounterType):
                        _memError = {}
                        for errIdx, memError in enumerate(memErrorType):
                            _memError[memError] = \
                                pynvml.nvmlDeviceGetMemoryErrorCounter(
                                    handle, counterIdx, errIdx, locIdx)
                        _eccCounter[counter] = _memError
                    eccErrors[location] = _eccCounter
            except Exception as e:
                self.debug(str(e))
                eccErrors = None
            ## Temperature
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
            except Exception as e:
                self.debug(str(e))
                temp = None
            ## Fan
            try:
                fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
            except Exception as e:
                self.debug(str(e))
                fanspeed = None
            ## Utilization
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                gpu_util = util.gpu
                mem_util = util.memory
            except Exception as e:
                self.debug(str(e))
                gpu_util = None
                mem_util = None
            ## Clock frequencies
            try:
                clock_core = pynvml.nvmlDeviceGetClockInfo(
                    handle, pynvml.NVML_CLOCK_GRAPHICS)
                clock_sm = pynvml.nvmlDeviceGetClockInfo(
                    handle, pynvml.NVML_CLOCK_SM)
                clock_mem = pynvml.nvmlDeviceGetClockInfo(
                    handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor
            except Exception as e:
                self.debug(str(e))
                clock_core = None
                clock_sm = None
                clock_mem = None

            ### Packing data ###
            self.debug("Device", gpuIdx, ":", str(name))
            data["device_name_" + gpuIdx] = name
            self.debug("Brand:", str(brands[brand]))
            ## pack mapd occupation
            data['mapd_occupancy_' + gpuIdx] = str(mapd_occu)
            self.debug(str(name), "Temp :", str(temp))
            data["device_temp_" + gpuIdx] = temp
            self.debug(str(name), "Mem total :", str(mem.total), 'bytes')
            data["device_mem_total_" + gpuIdx] = mem.total
            self.debug(str(name), "Mem used :", str(mem.used), 'bytes')
            data["device_mem_used_" + gpuIdx] = mem.used
            self.debug(str(name), "Mem free :", str(mem.free), 'bytes')
            data["device_mem_free_" + gpuIdx] = mem.free
            self.debug(str(name), "Load GPU :", str(gpu_util), '%')
            data["device_load_gpu_" + gpuIdx] = gpu_util
            self.debug(str(name), "Load MEM :", str(mem_util), '%')
            data["device_load_mem_" + gpuIdx] = mem_util
            self.debug(str(name), "Core clock:", str(clock_core), 'MHz')
            data["device_core_clock_" + gpuIdx] = clock_core
            self.debug(str(name), "SM clock :", str(clock_sm), 'MHz')
            data["device_sm_clock_" + gpuIdx] = clock_sm
            self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz')
            data["device_mem_clock_" + gpuIdx] = clock_mem
            self.debug(str(name), "Fan speed :", str(fanspeed), '%')
            data["device_fanspeed_" + gpuIdx] = fanspeed
            self.debug(str(name), "ECC errors:", str(eccErrors))
            if eccErrors is not None:
                # emit one metric per location / counter / error kind,
                # e.g. device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_0
                for location in memoryLocationType:
                    for counter in eccCounterType:
                        short_counter = counter.replace('_ECC', '')
                        for memError in memErrorType:
                            kind = memError.replace('ERROR_TYPE_', '')
                            key = "device_ecc_errors_{}_{}_{}_{}".format(
                                location, short_counter, kind, gpuIdx)
                            data[key] = eccErrors[location][counter][memError]
            else:
                data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_"
                     + gpuIdx] = None

    ## Get unit (S-class Nvidia cards) data
    if self.unitCount:
        for i in range(self.unitCount):
            gpuIdx = str(i)
            handle = pynvml.nvmlUnitGetHandleByIndex(i)
            try:
                fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
                fan_speed = fan.speed  # fan speed (RPM)
                fan_state = fan.state  # whether the fan is working properly
            except Exception as e:
                self.debug(str(e))
                fan_speed = None
                fan_state = None
            try:
                psu = pynvml.nvmlUnitGetPsuInfo(handle)
                psu_current = psu.current  # PSU current (A)
                psu_power = psu.power      # PSU power draw (W)
                psu_state = psu.state      # the power supply state
                psu_voltage = psu.voltage  # PSU voltage (V)
            except Exception as e:
                self.debug(str(e))
                psu_current = None
                psu_power = None
                psu_state = None
                psu_voltage = None
            try:
                temp_intake = pynvml.nvmlUnitGetTemperature(
                    handle, 0)  # temperature at intake in C
                temp_exhaust = pynvml.nvmlUnitGetTemperature(
                    handle, 1)  # temperature at exhaust in C
                temp_board = pynvml.nvmlUnitGetTemperature(
                    handle, 2)  # temperature on board in C
            except Exception as e:
                self.debug(str(e))
                temp_intake = None
                temp_exhaust = None
                temp_board = None
            self.debug('Unit fan speed:', str(fan_speed))
            data["unit_fan_speed_" + gpuIdx] = fan_speed
            self.debug('Unit fan state:', str(fan_state))
            data["unit_fan_state_" + gpuIdx] = fan_state
            self.debug('Unit PSU current:', str(psu_current))
            data["unit_psu_current_" + gpuIdx] = psu_current
            self.debug('Unit PSU power:', str(psu_power))
            data["unit_psu_power_" + gpuIdx] = psu_power
            self.debug('Unit PSU state:', str(psu_state))
            data["unit_psu_state_" + gpuIdx] = psu_state
            self.debug('Unit PSU voltage:', str(psu_voltage))
            data["unit_psu_voltage_" + gpuIdx] = psu_voltage
            self.debug('Unit temp intake:', str(temp_intake))
            data["unit_temp_intake_" + gpuIdx] = temp_intake
            self.debug('Unit temp exhaust:', str(temp_exhaust))
            data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust
            self.debug('Unit temp board:', str(temp_board))
            data["unit_temp_board_" + gpuIdx] = temp_board

    ## Get data via legacy mode
    if self.legacy:
        try:
            output, error = Popen(
                ["nvidia-settings", "-c", ":0",
                 "-q", "GPUUtilization",
                 "-q", "GPUCurrentClockFreqs",
                 "-q", "GPUCoreTemp",
                 "-q", "TotalDedicatedGPUMemory",
                 "-q", "UsedDedicatedGPUMemory"],
                shell=False, stdout=PIPE, stderr=PIPE).communicate()
            output = repr(str(output))
            if len(output) < 800:
                raise Exception(
                    'Error in fetching data from nvidia-settings ' + output)
            self.debug(str(error), output)
        except Exception as e:
            self.error(str(e))
            self.error('Setting legacy mode to False')
            self.legacy = False
            return data
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            if data["device_temp_" + gpuIdx] is None:
                coreTemp = findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)',
                                   output)[i][1]
                try:
                    data["device_temp_" + gpuIdx] = int(coreTemp)
                    self.debug('Using legacy temp for GPU {0}: {1}'.format(
                        gpuIdx, coreTemp))
                except Exception as e:
                    self.debug(str(e), "skipping device_temp_" + gpuIdx)
            if data["device_mem_used_" + gpuIdx] is None:
                memUsed = findall(
                    r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)',
                    output)[i][1]
                try:
                    data["device_mem_used_" + gpuIdx] = int(memUsed)
                    self.debug('Using legacy mem_used for GPU {0}: {1}'.format(
                        gpuIdx, memUsed))
                except Exception as e:
                    self.debug(str(e), "skipping device_mem_used_" + gpuIdx)
            if data["device_load_gpu_" + gpuIdx] is None:
                gpu_util = findall(
                    r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                    output)[i][1]
                try:
                    data["device_load_gpu_" + gpuIdx] = int(gpu_util)
                    self.debug('Using legacy load_gpu for GPU {0}: {1}'.format(
                        gpuIdx, gpu_util))
                except Exception as e:
                    self.debug(str(e), "skipping device_load_gpu_" + gpuIdx)
            if data["device_load_mem_" + gpuIdx] is None:
                mem_util = findall(
                    r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                    output)[i][2]
                try:
                    data["device_load_mem_" + gpuIdx] = int(mem_util)
                    self.debug('Using legacy load_mem for GPU {0}: {1}'.format(
                        gpuIdx, mem_util))
                except Exception as e:
                    self.debug(str(e), "skipping device_load_mem_" + gpuIdx)
            if data["device_core_clock_" + gpuIdx] is None:
                clock_core = findall(
                    r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)',
                    output)[i][1]
                try:
                    data["device_core_clock_" + gpuIdx] = int(clock_core)
                    self.debug(
                        'Using legacy core_clock for GPU {0}: {1}'.format(
                            gpuIdx, clock_core))
                except Exception as e:
                    self.debug(str(e), "skipping device_core_clock_" + gpuIdx)
            if data["device_mem_clock_" + gpuIdx] is None:
                clock_mem = findall(
                    r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)',
                    output)[i][2]
                try:
                    data["device_mem_clock_" + gpuIdx] = int(clock_mem)
                    self.debug(
                        'Using legacy mem_clock for GPU {0}: {1}'.format(
                            gpuIdx, clock_mem))
                except Exception as e:
                    self.debug(str(e), "skipping device_mem_clock_" + gpuIdx)

    return data
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        deviceCount = 0
    # Number of active GPUs
    self.gauge('nvml.gpus.number', deviceCount)
    for device_id in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetTemperature:{}'.format(err))
        # power info
        try:
            pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
            self.gauge('nvml.power.', pwr, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetPowerUsage:{}'.format(err))
        # fan info
        try:
            fan = pynvml.nvmlDeviceGetFanSpeed(handle)
            self.gauge('nvml.fan.', fan, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetFanSpeed:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
            self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
            self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory',
                           ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        # Clocks throttling info
        # Divide by the mask so that the value is either 0 or 1 per GPU
        try:
            throttle_reasons = \
                pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle)
            self.gauge('nvml.throttle.appsettings',
                       (throttle_reasons &
                        pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting) /
                       pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting,
                       tags=d_tags)
            self.gauge('nvml.throttle.display',
                       (throttle_reasons &
                        GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS) /
                       GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS,
                       tags=d_tags)
            self.gauge('nvml.throttle.hardware',
                       (throttle_reasons &
                        pynvml.nvmlClocksThrottleReasonHwSlowdown) /
                       pynvml.nvmlClocksThrottleReasonHwSlowdown,
                       tags=d_tags)
            self.gauge('nvml.throttle.idle',
                       (throttle_reasons &
                        pynvml.nvmlClocksThrottleReasonGpuIdle) /
                       pynvml.nvmlClocksThrottleReasonGpuIdle,
                       tags=d_tags)
            self.gauge('nvml.throttle.power.hardware',
                       (throttle_reasons &
                        GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE) /
                       GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE,
                       tags=d_tags)
            self.gauge('nvml.throttle.power.software',
                       (throttle_reasons &
                        pynvml.nvmlClocksThrottleReasonSwPowerCap) /
                       pynvml.nvmlClocksThrottleReasonSwPowerCap,
                       tags=d_tags)
            self.gauge('nvml.throttle.syncboost',
                       (throttle_reasons & GPU_THROTTLE_SYNCBOOST) /
                       GPU_THROTTLE_SYNCBOOST,
                       tags=d_tags)
            self.gauge('nvml.throttle.temp.hardware',
                       (throttle_reasons &
                        GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE) /
                       GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE,
                       tags=d_tags)
            self.gauge('nvml.throttle.temp.software',
                       (throttle_reasons &
                        GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE) /
                       GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE,
                       tags=d_tags)
            self.gauge('nvml.throttle.unknown',
                       (throttle_reasons &
                        pynvml.nvmlClocksThrottleReasonUnknown) /
                       pynvml.nvmlClocksThrottleReasonUnknown,
                       tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                'nvmlDeviceGetCurrentClocksThrottleReasons:{}'.format(err))

    if msg_list:
        status = AgentCheck.CRITICAL
        msg = ','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = 'Ok'
    pynvml.nvmlShutdown()

    self.service_check('nvml.check', status, message=msg)
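# The GPU_THROTTLE_* names used in the check above are module-level constants
# that are not shown in this snippet. A sketch of plausible definitions,
# assuming they mirror the clocksThrottleReasons bitmask values from NVML's
# nvml.h (newer pynvml releases expose most of these directly as
# nvmlClocksThrottleReason* attributes):
GPU_THROTTLE_SYNCBOOST = 0x0000000000000010
GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE = 0x0000000000000020    # SwThermalSlowdown
GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE = 0x0000000000000040    # HwThermalSlowdown
GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE = 0x0000000000000080  # HwPowerBrakeSlowdown
GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS = 0x0000000000000100      # DisplayClockSetting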
    mem_used = '{:<6}'.format(str(mem_info.used // mega) + 'M')
    show_str_lst.append(' ' + mem_used + '/' + mem_total)
    # temperature
    card_temp = ' ' + str(
        pml.nvmlDeviceGetTemperature(handle, pml.NVML_TEMPERATURE_GPU)) + 'C'
    show_str_lst.append(card_temp)
    # utilization
    card_util_ratio = ' {:>3}'.format(
        pml.nvmlDeviceGetUtilizationRates(handle).gpu) + '%'
    show_str_lst.append(card_util_ratio)
    # per-process usage
    p_str = ''
    procs = pml.nvmlDeviceGetComputeRunningProcesses(handle)
    for j, p in enumerate(procs):
        # pid = ' ' + str(p.pid) + ' '
        pid = '{:<7}'.format(p.pid)
        p_name = bytes.decode(pml.nvmlSystemGetProcessName(p.pid))
        p_name = ' {:<10} '.format(p_name)
        p_mem_used = ' ' + str(p.usedGpuMemory // mega) + 'M'
        pc = psutil.Process(p.pid)  # use the current process, not procs[0]
        p_user = '******'.format(pc.username())  # format string redacted in source
        p_str = ' ' + pid + p_name + p_user + p_mem_used
        if j == 0:
            show_str_lst.append(p_str)
        else:
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        deviceCount = 0
    for device_id in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # power info
        try:
            pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
            self.gauge('nvml.power.', pwr, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetPowerUsage:{}'.format(err))
        # fan info
        try:
            fan = pynvml.nvmlDeviceGetFanSpeed(handle)
            self.gauge('nvml.fan.', fan, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetFanSpeed:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
            self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
            self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory',
                           ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))

    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'
    pynvml.nvmlShutdown()

    self.service_check('nvml.check', status, message=msg)
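# The _dict2list helper used by the checks above is not included in these
# snippets. A sketch of one plausible implementation, assuming the usual
# Datadog "key:value" tag convention:
def _dict2list(self, tags=None):
    # render {'name': 'gpu0', 'pid': 42} as ['name:gpu0', 'pid:42']
    return ['{}:{}'.format(k, v) for k, v in (tags or {}).items()]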
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of a specific process"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
        process['pid'] = nv_process.pid
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        minor = int(N.nvmlDeviceGetMinorNumber(handle))
    except N.NVMLError:
        minor = None  # Not supported
    try:
        bus_id = _decode(N.nvmlDeviceGetPciInfo(handle).busId)
    except N.NVMLError:
        bus_id = None  # Not supported
    try:
        serial = _decode(N.nvmlDeviceGetSerial(handle))
    except N.NVMLError:
        serial = None  # Not supported
    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    processes = []
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'minor': minor,
        'bus_id': bus_id,
        'serial': serial,
        'temperature_gpu': temperature,
        'utilization_gpu': utilization.gpu if utilization else None,
        'power_draw': int(power / 1000) if power is not None else None,
        'power_limit': int(power_limit / 1000)
        if power_limit is not None else None,
        'memory_free': int(memory.free) if memory else None,
        'memory_used': int(memory.used) if memory else None,
        'memory_total': int(memory.total) if memory else None,
        'memory_utilization': utilization.memory if utilization else None,
        'processes': processes,
    }
    return gpu_info
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of a specific process"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
        process['pid'] = nv_process.pid
        # For docker: resolve the container id and owner of the process
        cmd = 'cat /proc/{}/cgroup'.format(nv_process.pid)
        ret = subprocess.check_output(cmd.split())
        container_id = str(ret).split('/')[2][:12]
        process['container_id'] = container_id
        cmd = 'docker ps -a'
        ret = subprocess.check_output(cmd.split())
        docker_data = str(ret).split('\\n')[1:-1]
        for personal in docker_data:
            personal_data = personal.split()
            if container_id == personal_data[0]:
                process['container_user_name'] = personal_data[-1]
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    processes = []
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': int(power / 1000) if power is not None else None,
        'enforced.power.limit': int(power_limit / 1000)
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
        'processes': processes,
    }
    return gpu_info
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        process['username'] = ps_process.username()
        # _cmdline = ps_process.cmdline()
        # if not _cmdline:
        #     process['command'] = '?'
        #     process['full_command'] = ['?']
        # else:
        #     process['command'] = os.path.basename(_cmdline[0])
        #     process['full_command'] = _cmdline
        # process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        # process['cpu_percent'] = ps_process.cpu_percent()
        # process['cpu_memory_usage'] = \
        #     round((ps_process.memory_percent() / 100.0) *
        #           psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    # try:
    #     temperature = N.nvmlDeviceGetTemperature(
    #         handle, N.NVML_TEMPERATURE_GPU)
    # except N.NVMLError:
    #     temperature = None  # Not supported
    # try:
    #     fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    # except N.NVMLError:
    #     fan_speed = None  # Not supported
    # try:
    #     memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    # except N.NVMLError:
    #     memory = None  # Not supported
    # try:
    #     utilization = N.nvmlDeviceGetUtilizationRates(handle)
    # except N.NVMLError:
    #     utilization = None  # Not supported
    # try:
    #     power = N.nvmlDeviceGetPowerUsage(handle)
    # except N.NVMLError:
    #     power = None
    # try:
    #     power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    # except N.NVMLError:
    #     power_limit = None

    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            # process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        # 'temperature.gpu': temperature,
        # 'fan.speed': fan_speed,
        # 'utilization.gpu': utilization.gpu if utilization else None,
        # 'power.draw': power // 1000 if power is not None else None,
        # 'enforced.power.limit': power_limit // 1000
        # if power_limit is not None else None,
        # Convert bytes into MBytes
        # 'memory.used': memory.used // MB if memory else None,
        # 'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
def pids(handle):
    return [
        p.pid for p in pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    ]
sysdata['ssd1_exist'] = False
procs = deviceCount * [None]
gpu_error = deviceCount * [False]
for i in range(deviceCount):
    try:
        handle = nvmlDeviceGetHandleByIndex(i)
        name = nvmlDeviceGetName(handle)
        gpudata[i]['name'] = name.decode('utf-8')
        memInfo = nvmlDeviceGetMemoryInfo(handle)
        gpudata[i]['mem_free'] = toMB(memInfo.total - memInfo.used)
        gpudata[i]['mem_total'] = toMB(memInfo.total)
        gpudata[i]['mem_usage'] = memInfo.used / memInfo.total * 100
        procs_prefilter = nvmlDeviceGetComputeRunningProcesses(handle)
        # for unknown reasons, nvmlDeviceGetComputeRunningProcesses
        # sometimes returns nonexistent processes on 3090 GPUs
        procs[i] = []
        gpudata[i]['procs'] = []
        for p in procs_prefilter:
            try:
                P = psutil.Process(p.pid)
                procs[i].append(p)
                gpudata[i]['procs'].append((p.pid, ) + getprocinfo(P))
            except psutil.NoSuchProcess:
                pass
    except Exception as e:
        gpu_error[i] = True
        print('Unable to access GPU device (id: %d)' % i)
        print(e)
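# toMB and getprocinfo above are helpers defined elsewhere in this script.
# A sketch of plausible definitions, inferred only from how they are used
# here (byte counts become MB; getprocinfo returns a tuple appended after
# the pid). The exact fields of getprocinfo are an assumption.
def toMB(num_bytes):
    return num_bytes // (1024 * 1024)

def getprocinfo(p):
    # p is a psutil.Process; return an illustrative (name, username) pair
    with p.oneshot():
        return (p.name(), p.username())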
def get_gpu_info(handle, mig_handle=None):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        # TODO: ps_process is being cached, but the dict below is not.
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if drivers are not TCC this will be None.
        usedmem = nv_process.usedGpuMemory // MB if \
            nv_process.usedGpuMemory else None
        process['gpu_memory_usage'] = usedmem
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(mig_handle if mig_handle else handle))
    uuid = _decode(N.nvmlDeviceGetUUID(mig_handle if mig_handle else handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError as e:
        log.add_exception("temperature", e)
        temperature = None  # Not supported
    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError as e:
        log.add_exception("fan_speed", e)
        fan_speed = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(
            mig_handle if mig_handle else handle)  # in Bytes
    except N.NVMLError as e:
        log.add_exception("memory", e)
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError as e:
        log.add_exception("utilization", e)
        utilization = None  # Not supported
    try:
        utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
    except N.NVMLError as e:
        log.add_exception("utilization_enc", e)
        utilization_enc = None  # Not supported
    try:
        utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
    except N.NVMLError as e:
        log.add_exception("utilization_dec", e)
        utilization_dec = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError as e:
        log.add_exception("power", e)
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError as e:
        log.add_exception("power_limit", e)
        power_limit = None
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(
            mig_handle if mig_handle else handle)
    except N.NVMLError as e:
        log.add_exception("compute_processes", e)
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
            mig_handle if mig_handle else handle)
    except N.NVMLError as e:
        log.add_exception("graphics_processes", e)
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        # A single process might run in both graphics and compute mode;
        # display the process only once.
        seen_pids = set()
        for nv_process in nv_comp_processes + nv_graphics_processes:
            if nv_process.pid in seen_pids:
                continue
            seen_pids.add(nv_process.pid)
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass
            except FileNotFoundError:
                # Ignore the exception which probably has occurred
                # from psutil, due to a non-existent PID (see #95).
                # The exception should have been translated, but
                # there appears to be a bug of psutil. It is unlikely
                # FileNotFoundError is thrown in different situations.
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = str(N.nvmlDeviceGetIndex(handle))
    if mig_handle:
        index += ':' + str(N.nvmlDeviceGetIndex(mig_handle))

    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'utilization.enc': utilization_enc[0] if utilization_enc else None,
        'utilization.dec': utilization_dec[0] if utilization_dec else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
def getprocs():
    for i in range(nv.nvmlDeviceGetCount()):
        hdl = nv.nvmlDeviceGetHandleByIndex(i)
        for p in nv.nvmlDeviceGetComputeRunningProcesses(hdl):
            yield p.pid
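# A usage sketch for the generator above, assuming `import pynvml as nv` and
# an NVML session opened around the iteration:
import pynvml as nv

nv.nvmlInit()
try:
    print('pids using any GPU:', sorted(set(getprocs())))
finally:
    nv.nvmlShutdown()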
import pynvml as nv

nv.nvmlInit()
for i in range(nv.nvmlDeviceGetCount()):
    hndl = nv.nvmlDeviceGetHandleByIndex(i)
    if not nv.nvmlDeviceGetComputeRunningProcesses(hndl):
        visible_device = str(i)
        break
nv.nvmlShutdown()

import tensorflow as tf
import os

try:
    os.environ["CUDA_VISIBLE_DEVICES"] = visible_device
except NameError:
    # visible_device is never bound if every GPU had running processes
    print('No available gpu')
    exit()

import numpy as np
np.set_printoptions(precision=4, suppress=True, linewidth=100)
import tensorflow.contrib.layers as ly
from mnist import read_data_sets
from mnist import dense_to_one_hot

clabel = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
dataset = read_data_sets('MNIST_data', one_hot=False, validation_size=0,
                         clabel=clabel)
n_rdims = 19
n_classes = 20
n_features = 32
# power state
power_used = nvmlDeviceGetPowerUsage(handle) / 1000
power_limit = nvmlDeviceGetPowerManagementDefaultLimit(handle) / 1000
power_used = int(power_used)
power_limit = int(power_limit)
power_rate = int(power_used / power_limit * 100)
msg = pack_msg([power_used, power_limit], 'W')
info['Power Util'] = getBar(power_rate, msg)

# fan speed, temperature
fan_speed = nvmlDeviceGetFanSpeed(handle)
temp = nvmlDeviceGetTemperature(handle, 0)
msg = f"{temp}C"
info['Fan Speed'] = getBar(fan_speed, msg)

message = [f"{k} \t{v}" for k, v in info.items()]
print('\n'.join(message))

# graphics processes
graphic_processes = nvmlDeviceGetGraphicsRunningProcesses(handle)
header = "\n=== Graphic Processes ==="
show_process(header, graphic_processes)

# compute processes
compute_processes = nvmlDeviceGetComputeRunningProcesses(handle)
header = "\n=== Compute Processes ==="
show_process(header, compute_processes)

nvmlShutdown()
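# show_process, pack_msg and getBar above are helpers from the surrounding
# script that are not included in this snippet. A sketch of what show_process
# might look like, given that it receives a header string and a list of NVML
# process structs; the layout is illustrative only.
def show_process(header, processes):
    print(header)
    if not processes:
        print('(none)')
        return
    for p in processes:
        used = p.usedGpuMemory // (1024 * 1024) if p.usedGpuMemory else 0
        print('pid {:<8} {:>6} MiB'.format(p.pid, used))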
def _summary(self):
    summary = []
    summary.append("GPU running Processes:")
    initGPU()
    try:
        gpusToUse = [int(n) for n in (self.gpusToUse.get()).split()]
        for i in gpusToUse:
            handle = nvmlDeviceGetHandleByIndex(i)
            cps = nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                msg = " %d) " % i + psutil.Process(ps.pid).name()
                msg += " (mem =%.2f MB)" % (float(ps.usedGpuMemory) / 1048576.)
                summary.append(msg)
    except NVMLError as err:
        summary.append(str(err))
    return summary