Example #1
    def __check_gpu(self):
        """ Check if the process list contains GPU processes and determine if
        GPUs exists. Add GPU processes to the processes list if required."""
        if not self.exp.meta_data.plugin_list._contains_gpu_processes():
            return

        try:
            import pynvml as pv
        except ImportError:
            logging.debug("pyNVML module not found")
            raise Exception("pyNVML module not found")

        try:
            pv.nvmlInit()
            count = int(pv.nvmlDeviceGetCount())
            logging.debug("%s GPUs have been found.", count)

            if not self.exp.meta_data.get('test_state'):
                for i in range(count):
                    handle = pv.nvmlDeviceGetHandleByIndex(i)
                    if pv.nvmlDeviceGetComputeRunningProcesses(handle):
                        raise Exception("Unfortunately, GPU %i is busy. Try \
                            resubmitting the job to the queue." % i)
        except Exception as e:
            raise Exception("Unable to run GPU plugins: %s", e.message)
        self.__set_gpu_processes(count)
Example #2
    def _get_container_id(self, gpuhandle):
        cont_ids = []
        pids = []
        try:
            proc_objs = pynvml.nvmlDeviceGetComputeRunningProcesses(gpuhandle)
            if not proc_objs:
                return ['NA.NA']
            for proc_obj in proc_objs:
                pids.append(proc_obj.pid)
            for pid in pids:
                cont_ids.append(self._get_containerid_from_pid(pid))
            return cont_ids
        except pynvml.NVMLError as err:
            logger.debug('Failed to get pid on gpu: %s', err)
Example #3
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info
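These get_gpu_info snippets rely on a few names defined elsewhere in their module: N (the pynvml bindings), MB (a bytes-to-megabytes divisor), and _decode (a bytes-to-str helper). A minimal sketch of that assumed context, just to make the snippets readable on their own (the original definitions may differ):

import os
import psutil
import pynvml as N   # NVML bindings aliased as N in the snippets

MB = 1024 * 1024     # divisor used to convert bytes to MBytes

def _decode(b):
    # Newer pynvml releases may already return str instead of bytes.
    if isinstance(b, bytes):
        return b.decode()
    return b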
Example #4
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info
Example #5
def main():
    import pynvml
    pynvml.nvmlInit()
    # The index argument here is the GPU id
    handle2 = pynvml.nvmlDeviceGetHandleByIndex(2)
    handle3 = pynvml.nvmlDeviceGetHandleByIndex(3)
    # meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)

    # print(meminfo.used)

    parser = argparse.ArgumentParser(
        description='simple 3D convolution for action recognition')
    parser.add_argument('--batch', type=int, default=128)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--videos',
                        type=str,
                        default='UCF101',
                        help='directory where videos are stored')
    parser.add_argument('--nclass', type=int, default=101)
    parser.add_argument('--output', type=str, required=True)
    parser.add_argument('--color', type=bool, default=False)
    parser.add_argument('--skip', type=bool, default=True)
    parser.add_argument('--depth', type=int, default=10)
    parser.add_argument('--dataset', type=str, default='ucf101')
    args = parser.parse_args()

    img_rows, img_cols, frames = 64, 64, args.depth
    channel = 3 if args.color else 1
    fname_npz = 'dataset_{}_{}_{}_{}.npz'.format(args.dataset, args.nclass,
                                                 args.depth, args.skip)

    vid3d = videoto3d.Videoto3D(img_rows, img_cols, frames, args.dataset)
    nb_classes = args.nclass

    if os.path.exists(fname_npz):
        loadeddata = np.load(fname_npz)
        X, Y = loadeddata["X"], loadeddata["Y"]
    else:
        x, y = loaddata(args.videos, vid3d, args.nclass, args.output,
                        args.dataset, frames, args.color, args.skip)
        X = x.reshape((x.shape[0], img_rows, img_cols, frames, channel))
        Y = np_utils.to_categorical(y, nb_classes)

        X = X.astype('float32')
        np.savez(fname_npz, X=X, Y=Y)
        print('Saved dataset to dataset.npz.')
    print('X_shape:{}\nY_shape:{}'.format(X.shape, Y.shape))

    # Define model

    # conv3D + Relu + Conv3D + Softmax + Pooling3D + DropOut
    input_x = Input(shape=(img_rows, img_cols, frames, channel))

    #
    # # C3D-conv1
    # convLayer = Conv3D(32, kernel_size= (3, 3, 3),padding='same')(input_x)
    # convLayer = ReLU()(convLayer)
    #
    # convLayer = Conv3D(32, kernel_size= (3, 3, 3), padding='same')(convLayer)
    # convLayer = Softmax()(convLayer)
    # convLayer = MaxPooling3D(pool_size=(3,3,3), padding='same')(convLayer)
    # convLayer = Dropout(0.25)(convLayer)
    #
    # # C3D-conv2
    # convLayer = Conv3D(64, kernel_size= (3, 3, 3),padding='same')(convLayer)
    # convLayer = ReLU()(convLayer)
    #
    # convLayer = Conv3D(64, kernel_size= (3, 3, 3), padding='same')(convLayer)
    # convLayer = Softmax()(convLayer)
    # convLayer = MaxPooling3D(pool_size=(3,3,3), padding='same')(convLayer)
    # convLayer = Dropout(0.25)(convLayer)
    #
    #
    # maskLayer = Conv3D(64*frames, kernel_size=(3,3,2), padding='same')(convLayer)
    # maskLayer = Lambda(mean_filter)(maskLayer)      # [None,1, 64], each point represent a mask of input region of 8x8 points
    # # maskLayer = BatchNormalization()(maskLayer)
    # maskLayer = Lambda(K.sigmoid)(maskLayer)
    # # maskLayer = ReLU()(maskLayer)
    # # maskLayer = Lambda(bi_trans, arguments={'th':0.5})(maskLayer)
    # maskLayer = Reshape(( 8, 8, frames, 1))(maskLayer)  #reshape_filter(maskLayer, shape=[None,8,8,1,1])
    # # maskLayer = Lambda(normalize)(maskLayer)
    # maskLayerForLoss = maskLayer
    # maskLayer = Lambda(repeat_filter,arguments={'rep':8, 'axis':1})(maskLayer)
    # maskLayer = Lambda(repeat_filter,arguments={'rep':8, 'axis':2})(maskLayer)
    # # maskLayer = Lambda(repeat_filter,arguments={'rep':frames, 'axis':3})(maskLayer)
    # maskLayer = Lambda(repeat_filter,arguments={'rep':channel, 'axis':4})(maskLayer)
    #
    # # maskLayer = Lambda(repeat_filter,arguments={'rep':2, 'axis':3})(maskLayer)
    # # maskLayer = Lambda(repeat_filter,arguments={'rep':64, 'axis':4})(maskLayer)
    #
    #
    # convLayer = Multiply()([maskLayer,input_x])
    #
    #

    # C3D-conv1
    convLayer = Conv3D(32, kernel_size=(3, 3, 3), padding='same')(input_x)
    convLayer = ReLU()(convLayer)

    convLayer = Conv3D(32, kernel_size=(3, 3, 3), padding='same')(convLayer)
    convLayer = Softmax()(convLayer)
    convLayer = MaxPooling3D(pool_size=(3, 3, 3), padding='same')(convLayer)
    convLayer = Dropout(0.25)(convLayer)

    # C3D-conv2
    convLayer = Conv3D(64, kernel_size=(3, 3, 3), padding='same')(convLayer)
    convLayer = ReLU()(convLayer)

    convLayer = Conv3D(64, kernel_size=(3, 3, 3), padding='same')(convLayer)
    convLayer = Softmax()(convLayer)
    convLayer = MaxPooling3D(pool_size=(3, 3, 3), padding='same')(convLayer)
    convLayer = Dropout(0.25)(convLayer)

    fc1 = Flatten()(convLayer)

    fc = Dense(512, activation='sigmoid')(fc1)
    fc = Dropout(0.5)(fc)
    dense_out = Dense(nb_classes, activation='softmax')(fc)
    dense_out_converse = Dense(nb_classes)(fc)

    # model = Model(input_x, [dense_out, dense_out_converse])
    model = Model(input_x, [dense_out, dense_out_converse])

    # loss of 2 parts
    losses = {'dense_2': K.categorical_crossentropy, 'dense_3': unlikely_loss}
    lossWeights = {'dense_2': 1, 'dense_3': 1}
    model.compile(loss=losses,
                  loss_weights=lossWeights,
                  optimizer=Adam(lr=0.001),
                  metrics=['accuracy'])
    # model.compile(loss=categorical_crossentropy, optimizer=Adam(lr=0.001),metrics=['accuracy'])
    model.summary()
    plot_model(model,
               show_shapes=True,
               to_file=os.path.join(args.output, 'model.png'))

    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.1,
                                                        random_state=43)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train,
                                                      Y_train,
                                                      test_size=0.1,
                                                      random_state=43)

    # history = model.fit_generator(myGenerator(X_train, X_test, Y_train, Y_test, nb_classes, args.batch),
    #                               samples_per_epoch=X_train.shape[0], epochs=args.epoch, verbose=1,
    #                               callbacks=callbacks_list,
    # shuffle=True)
    # Check GPU status; once a GPU is available, set CUDA_VISIBLE_DEVICES and break.
    # If none of the GPUs are ready, sleep for 2 seconds and retry.
    cnt = 0
    while True:
        cnt += 1
        processinfo = pynvml.nvmlDeviceGetComputeRunningProcesses(handle2)
        if len(processinfo) == 0:
            os.environ['CUDA_VISIBLE_DEVICES'] = '2'
            print('GPU 2 is available, use GPU 2\n')
            break
        processinfo = pynvml.nvmlDeviceGetComputeRunningProcesses(handle3)
        if len(processinfo) == 0:
            os.environ['CUDA_VISIBLE_DEVICES'] = '3'
            print('GPU 3 is available, use GPU 3\n')
            break
        sleep(2)
        print('\rretry time: {}'.format(cnt), end='')

    history = model.fit(X_train, [Y_train, Y_train],
                        validation_data=(X_val, [Y_val, Y_val]),
                        batch_size=args.batch,
                        epochs=args.epoch,
                        verbose=1,
                        shuffle=True)
    # history = model.fit(X_train, Y_train,
    #                     validation_data=(X_val, Y_val),
    #                     batch_size=args.batch,
    #                     epochs=args.epoch, verbose=1, shuffle=True)
    # loss, acc = model.evaluate(X_test, Y_test, verbose=0)
    model_json = model.to_json()
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    with open(
            os.path.join(
                args.output, '{}_{}_{}_ucf101_3dcnnmodel.json'.format(
                    current_time, nb_classes, args.depth)), 'w') as json_file:
        json_file.write(model_json)
    model.save_weights(
        os.path.join(
            args.output,
            '{}_{}_{}_ucf101_3dcnnmodel.hd5'.format(current_time, nb_classes,
                                                    args.depth)))
    loss = model.evaluate(X_test, [Y_test, Y_test], verbose=0)
    # loss, acc = model.evaluate(X_test, Y_test, verbose=0)
    print('Test loss:', loss)
    plot_history(history, args.output)
    save_history(history, args.output)

    print('Test loss:', loss)
Example #6
def _get_gpu_process(gpu_device):
    return len(nvmlDeviceGetComputeRunningProcesses(gpu_device))
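For context, this one-liner assumes NVML is already initialized and that nvmlDeviceGetComputeRunningProcesses was imported from pynvml. A minimal usage sketch (the index 0 is illustrative):

from pynvml import (nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetComputeRunningProcesses)

nvmlInit()
try:
    gpu_device = nvmlDeviceGetHandleByIndex(0)   # handle for GPU 0
    print(_get_gpu_process(gpu_device))          # number of compute processes on GPU 0
finally:
    nvmlShutdown()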
Example #7
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        for device_id in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.info('nvml.util.encoder %s' % int(util_encoder[0]))
                self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.info('nvml.util.decoder %s' % int(util_decoder[0]))
                self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #8
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
                process['command'] = os.path.basename(ps_process.cmdline()[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
                process['pid'] = nv_process.pid
                return process

            def _decode(b):
                if isinstance(b, bytes):
                    return b.decode()    # for python3, to unicode
                return b

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle) # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            processes = []
            try:
                nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None   # Not supported (in both cases)
            else:
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in (nv_comp_processes + nv_graphics_processes):
                    # TODO: could be more information such as system memory usage,
                    # CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                # Convert bytes into MBytes
                'memory.used': int(memory.used / 1024 / 1024) if memory else None,
                'memory.total': int(memory.total / 1024 / 1024) if memory else None,
                'processes': processes,
            }
            return gpu_info
Example #9
def get_infos():
    """Get all information about all your graphics cards.

    Returns:
        dict: The returned result is a dict with 3 keys: count, driver_version and devices:
            count: Number of GPUs found.
            driver_version: The version of the system's graphics driver.
            devices: A list whose items are Device namedtuples with 11 fields, for example id, name and fan_speed.
                     Note that the process field is a list of Process namedtuples, each with 11 fields.
    """

    infos = {}
    Device = namedtuple(
        "Device",
        [
            "id",
            "name",
            "free",
            "used",
            "total",
            "gpu_util",
            "temperature",
            "fan_speed",
            "power_usage",
            "power_state",
            "process",
        ],
    )
    Process = namedtuple(
        "Process",
        [
            "pid",
            "memory_percent",
            "status",
            "username",
            "num_threads",
            "cpu_num",
            "cpu_percent",
            "name",
            "cmdline",
            "used_gpu_mem",
            "create_time",
        ],
    )
    driver_version = pynvml.nvmlSystemGetDriverVersion().decode()
    device_count = pynvml.nvmlDeviceGetCount()
    devices = []
    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle).decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        power_usage = pynvml.nvmlDeviceGetPowerUsage(
            handle)  # Power usage in milliwatts mW
        processes = pynvml.nvmlDeviceGetComputeRunningProcesses(
            handle)  # Which processes are using the GPU
        # process_info = [(item.pid, item.usedGpuMemory) for item in process_info]
        process_info = []
        for p in processes:
            # append Process object to process_info
            pid = p.pid
            used_gpu_mem = p.usedGpuMemory
            p = psutil.Process(pid=pid)
            _ = p.cpu_percent()
            time.sleep(0.05)
            process_info.append(
                Process(
                    pid=pid,
                    memory_percent=p.memory_percent(),
                    status=p.status(),
                    username=p.username(),
                    num_threads=p.num_threads(),
                    cpu_num=p.cpu_num(),
                    cpu_percent=p.cpu_percent(),
                    name=p.name(),
                    cmdline=" ".join(p.cmdline()),
                    used_gpu_mem=used_gpu_mem,
                    create_time=p.create_time(),
                ))
        try:
            fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported as e:
            fan_speed = None
        power_usage = pynvml.nvmlDeviceGetPowerUsage(handle)
        power_state = pynvml.nvmlDeviceGetPowerState(handle)
        temperature = pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU)
        # Volatile Gpu-Util in output of nvidia-smi
        gpu_util = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
        devices.append(
            Device(
                id=i,
                name=name,
                free=mem_info.free,
                used=mem_info.used,
                total=mem_info.total,
                gpu_util=gpu_util,
                temperature=temperature,
                fan_speed=fan_speed,
                power_usage=power_usage,
                power_state=power_state,
                process=process_info,
            ))

    infos["count"] = device_count
    infos["driver_version"] = driver_version
    infos["devices"] = devices
    return infos
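A short usage sketch for get_infos (note that the function itself does not initialize NVML, so nvmlInit must be called first; the field names follow the namedtuples defined above):

import pynvml

pynvml.nvmlInit()
try:
    infos = get_infos()
    print('driver:', infos['driver_version'], 'gpus:', infos['count'])
    for device in infos['devices']:
        used_mb = device.used // (1024 * 1024)
        total_mb = device.total // (1024 * 1024)
        print('GPU {}: {} {}/{} MB, util {}%'.format(
            device.id, device.name, used_mb, total_mb, device.gpu_util))
finally:
    pynvml.nvmlShutdown()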
Example #10
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                # If drivers are not in TCC mode, this will be None.
                usedmem = nv_process.usedGpuMemory // MB if \
                          nv_process.usedGpuMemory else None
                process['gpu_memory_usage'] = usedmem
                process['cpu_percent'] = ps_process.cpu_percent()
                process['cpu_memory_usage'] = \
                    round((ps_process.memory_percent() / 100.0) *
                          psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
            except N.NVMLError:
                utilization_enc = None  # Not supported

            try:
                utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
            except N.NVMLError:
                utilization_dec = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    process['cpu_percent'] = cache_process.cpu_percent()

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'utilization.enc':
                    utilization_enc[0] if utilization_enc else None,
                'utilization.dec':
                    utilization_dec[0] if utilization_dec else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit':
                    power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
Example #11
    def _get_data(self):
        data = {}

        if self.deviceCount:
            for i in range(self.deviceCount):
                gpuIdx = str(i)
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                name = pynvml.nvmlDeviceGetName(handle)
                brand = pynvml.nvmlDeviceGetBrand(handle)
                brands = [
                    'Unknown', 'Quadro', 'Tesla', 'NVS', 'Grid', 'GeForce'
                ]

                ### Get data ###
                ## Memory usage
                try:
                    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                except Exception as e:
                    self.debug(str(e))
                    mem = None
                ## Mapd occupancy
                try:
                    mapd_occu = 0
                    procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                    for p in procs:
                        pid = str(p.pid)
                        with open("/host/proc/" + pid +
                                  "/cmdline") as pid_file:
                            for line in pid_file:
                                if line.find("mapd_server") != -1:
                                    mapd_occu = p.usedGpuMemory
                except Exception as e:
                    mapd_occu = 0

                ## ECC errors
                try:
                    _memError = {}
                    _eccCounter = {}
                    eccErrors = {}
                    eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
                    memErrorType = [
                        'ERROR_TYPE_CORRECTED', 'ERROR_TYPE_UNCORRECTED'
                    ]
                    memoryLocationType = [
                        'L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY',
                        'REGISTER_FILE', 'TEXTURE_MEMORY'
                    ]
                    for memoryLocation in range(5):
                        for eccCounter in range(2):
                            for memError in range(2):
                                _memError[memErrorType[
                                    memError]] = pynvml.nvmlDeviceGetMemoryErrorCounter(
                                        handle, eccCounter, memError,
                                        memoryLocation)
                            _eccCounter[eccCounterType[eccCounter]] = _memError
                        eccErrors[
                            memoryLocationType[memoryLocation]] = _eccCounter
                except Exception as e:
                    self.debug(str(e))
                    eccErrors = None

                ## Temperature
                try:
                    temp = pynvml.nvmlDeviceGetTemperature(
                        handle, pynvml.NVML_TEMPERATURE_GPU)
                except Exception as e:
                    self.debug(str(e))
                    temp = None

                ## Fan
                try:
                    fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
                except Exception as e:
                    self.debug(str(e))
                    fanspeed = None

                ## Utilization
                try:
                    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    gpu_util = util.gpu
                    mem_util = util.memory
                except Exception as e:
                    self.debug(str(e))
                    gpu_util = None
                    mem_util = None

                ## Clock frequencies
                try:
                    clock_core = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_GRAPHICS)
                    clock_sm = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_SM)
                    clock_mem = pynvml.nvmlDeviceGetClockInfo(
                        handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor
                except Exception as e:
                    self.debug(str(e))
                    clock_core = None
                    clock_sm = None
                    clock_mem = None

                ### Packing data ###
                self.debug("Device", gpuIdx, ":", str(name))
                data["device_name_" + gpuIdx] = name

                self.debug("Brand:", str(brands[brand]))

                ## pack mapd occupation
                data['mapd_occupancy_' + gpuIdx] = str(mapd_occu)

                self.debug(str(name), "Temp      :", str(temp))
                data["device_temp_" + gpuIdx] = temp

                self.debug(str(name), "Mem total :", str(mem.total), 'bytes')
                data["device_mem_total_" + gpuIdx] = mem.total

                self.debug(str(name), "Mem used  :", str(mem.used), 'bytes')
                data["device_mem_used_" + gpuIdx] = mem.used

                self.debug(str(name), "Mem free  :", str(mem.free), 'bytes')
                data["device_mem_free_" + gpuIdx] = mem.free

                self.debug(str(name), "Load GPU  :", str(gpu_util), '%')
                data["device_load_gpu_" + gpuIdx] = gpu_util

                self.debug(str(name), "Load MEM  :", str(mem_util), '%')
                data["device_load_mem_" + gpuIdx] = mem_util

                self.debug(str(name), "Core clock:", str(clock_core), 'MHz')
                data["device_core_clock_" + gpuIdx] = clock_core

                self.debug(str(name), "SM clock  :", str(clock_sm), 'MHz')
                data["device_sm_clock_" + gpuIdx] = clock_sm

                self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz')
                data["device_mem_clock_" + gpuIdx] = clock_mem

                self.debug(str(name), "Fan speed :", str(fanspeed), '%')
                data["device_fanspeed_" + gpuIdx] = fanspeed

                self.debug(str(name), "ECC errors:", str(eccErrors))
                if eccErrors is not None:
                    data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" +
                         gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"][
                             "ERROR_TYPE_CORRECTED"]
                    data["device_ecc_errors_L1_CACHE_VOLATILE_UNCORRECTED_" +
                         gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"][
                             "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_L1_CACHE_AGGREGATE_CORRECTED_" +
                         gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"][
                             "ERROR_TYPE_CORRECTED"]
                    data["device_ecc_errors_L1_CACHE_AGGREGATE_UNCORRECTED_" +
                         gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"][
                             "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_L2_CACHE_VOLATILE_CORRECTED_" +
                         gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"][
                             "ERROR_TYPE_CORRECTED"]
                    data["device_ecc_errors_L2_CACHE_VOLATILE_UNCORRECTED_" +
                         gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"][
                             "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_L2_CACHE_AGGREGATE_CORRECTED_" +
                         gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"][
                             "ERROR_TYPE_CORRECTED"]
                    data["device_ecc_errors_L2_CACHE_AGGREGATE_UNCORRECTED_" +
                         gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"][
                             "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_CORRECTED_"
                         + gpuIdx] = eccErrors["DEVICE_MEMORY"][
                             "VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_DEVICE_MEMORY_VOLATILE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"][
                            "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_CORRECTED_"
                         + gpuIdx] = eccErrors["DEVICE_MEMORY"][
                             "AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_DEVICE_MEMORY_AGGREGATE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["DEVICE_MEMORY"][
                            "AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_REGISTER_FILE_VOLATILE_CORRECTED_"
                         + gpuIdx] = eccErrors["REGISTER_FILE"][
                             "VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_REGISTER_FILE_VOLATILE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"][
                            "ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_REGISTER_FILE_AGGREGATE_CORRECTED_"
                         + gpuIdx] = eccErrors["REGISTER_FILE"][
                             "AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_REGISTER_FILE_AGGREGATE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["REGISTER_FILE"][
                            "AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                    data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_CORRECTED_"
                         + gpuIdx] = eccErrors["TEXTURE_MEMORY"][
                             "VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_TEXTURE_MEMORY_VOLATILE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["TEXTURE_MEMORY"][
                            "VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                    data[
                        "device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_CORRECTED_"
                        + gpuIdx] = eccErrors["TEXTURE_MEMORY"][
                            "AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                    data[
                        "device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_UNCORRECTED_"
                        + gpuIdx] = eccErrors["TEXTURE_MEMORY"][
                            "AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                else:
                    data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" +
                         gpuIdx] = None

        ## Get unit (S-class Nvidia cards) data
        if self.unitCount:
            for i in range(self.unitCount):
                gpuIdx = str(i)
                handle = pynvml.nvmlUnitGetHandleByIndex(i)

                try:
                    fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
                    fan_speed = fan.speed  # Fan speed (RPM)
                    fan_state = fan.state  # Flag that indicates whether fan is working properly
                except Exception as e:
                    self.debug(str(e))
                    fan_speed = None
                    fan_state = None

                try:
                    psu = pynvml.nvmlUnitGetPsuInfo(handle)
                    psu_current = psu.current  # PSU current (A)
                    psu_power = psu.power  # PSU power draw (W)
                    psu_state = psu.state  # The power supply state
                    psu_voltage = psu.voltage  # PSU voltage (V)
                except Exception as e:
                    self.debug(str(e))
                    psu_current = None
                    psu_power = None
                    psu_state = None
                    psu_voltage = None

                try:
                    temp_intake = pynvml.nvmlUnitGetTemperature(
                        handle, 0)  # Temperature at intake in C
                    temp_exhaust = pynvml.nvmlUnitGetTemperature(
                        handle, 1)  # Temperature at exhaust in C
                    temp_board = pynvml.nvmlUnitGetTemperature(
                        handle, 2)  # Temperature on board in C
                except Exception as e:
                    self.debug(str(e))
                    temp_intake = None
                    temp_exhaust = None
                    temp_board = None

                self.debug('Unit fan speed:', str(fan_speed))
                data["unit_fan_speed_" + gpuIdx] = fan_speed

                self.debug('Unit fan state:', str(fan_state))
                data["unit_fan_state_" + gpuIdx] = fan_state

                self.debug('Unit PSU current:', str(psu_current))
                data["unit_psu_current_" + gpuIdx] = psu_current

                self.debug('Unit PSU power:', str(psu_power))
                data["unit_psu_power_" + gpuIdx] = psu_power

                self.debug('Unit PSU state:', str(psu_state))
                data["unit_psu_state_" + gpuIdx] = psu_state

                self.debug('Unit PSU voltage:', str(psu_voltage))
                data["unit_psu_voltage_" + gpuIdx] = psu_voltage

                self.debug('Unit temp intake:', str(temp_intake))
                data["unit_temp_intake_" + gpuIdx] = temp_intake

                self.debug('Unit temp exhaust:', str(temp_exhaust))
                data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust

                self.debug('Unit temp board:', str(temp_board))
                data["unit_temp_board_" + gpuIdx] = temp_board

        ## Get data via legacy mode
        if self.legacy:
            try:
                output, error = Popen([
                    "nvidia-settings", "-c", ":0", "-q", "GPUUtilization",
                    "-q", "GPUCurrentClockFreqs", "-q", "GPUCoreTemp", "-q",
                    "TotalDedicatedGPUMemory", "-q", "UsedDedicatedGPUMemory"
                ],
                                      shell=False,
                                      stdout=PIPE,
                                      stderr=PIPE).communicate()
                output = repr(str(output))
                if len(output) < 800:
                    raise Exception(
                        'Error in fetching data from nvidia-settings ' +
                        output)
                self.debug(str(error), output)
            except Exception as e:
                self.error(str(e))
                self.error('Setting legacy mode to False')
                self.legacy = False
                return data
            for i in range(self.deviceCount):
                gpuIdx = str(i)
                if data["device_temp_" + gpuIdx] is None:
                    coreTemp = findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)',
                                       output)[i][1]
                    try:
                        data["device_temp_" + gpuIdx] = int(coreTemp)
                        self.debug('Using legacy temp for GPU {0}: {1}'.format(
                            gpuIdx, coreTemp))
                    except Exception as e:
                        self.debug(str(e), "skipping device_temp_" + gpuIdx)
                if data["device_mem_used_" + gpuIdx] is None:
                    memUsed = findall(
                        r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)',
                        output)[i][1]
                    try:
                        data["device_mem_used_" + gpuIdx] = int(memUsed)
                        self.debug(
                            'Using legacy mem_used for GPU {0}: {1}'.format(
                                gpuIdx, memUsed))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_mem_used_" + gpuIdx)
                if data["device_load_gpu_" + gpuIdx] is None:
                    gpu_util = findall(
                        r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                        output)[i][1]
                    try:
                        data["device_load_gpu_" + gpuIdx] = int(gpu_util)
                        self.debug(
                            'Using legacy load_gpu for GPU {0}: {1}'.format(
                                gpuIdx, gpu_util))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_load_gpu_" + gpuIdx)
                if data["device_load_mem_" + gpuIdx] is None:
                    mem_util = findall(
                        r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                        output)[i][2]
                    try:
                        data["device_load_mem_" + gpuIdx] = int(mem_util)
                        self.debug(
                            'Using legacy load_mem for GPU {0}: {1}'.format(
                                gpuIdx, mem_util))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_load_mem_" + gpuIdx)
                if data["device_core_clock_" + gpuIdx] is None:
                    clock_core = findall(
                        r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)',
                        output)[i][1]
                    try:
                        data["device_core_clock_" + gpuIdx] = int(clock_core)
                        self.debug(
                            'Using legacy core_clock for GPU {0}: {1}'.format(
                                gpuIdx, clock_core))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_core_clock_" + gpuIdx)
                if data["device_mem_clock_" + gpuIdx] is None:
                    clock_mem = findall(
                        r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)',
                        output)[i][2]
                    try:
                        data["device_mem_clock_" + gpuIdx] = int(clock_mem)
                        self.debug(
                            'Using legacy mem_clock for GPU {0}: {1}'.format(
                                gpuIdx, clock_mem))
                    except Exception as e:
                        self.debug(str(e),
                                   "skipping device_mem_clock_" + gpuIdx)

        return data
Example #12
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        # Number of active GPUs
        self.gauge('nvml.gpus.number', deviceCount)
        for device_id in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetTemperature:{}'.format(err))
            # power info
            try:
                pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
                self.gauge('nvml.power.', pwr, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetPowerUsage:{}'.format(err))
            # fan info
            try:
                fan = pynvml.nvmlDeviceGetFanSpeed(handle)
                self.gauge('nvml.fan.', fan, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetFanSpeed:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
                self.gauge('nvml.util.encoder', int(
                    util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
                self.gauge('nvml.util.decoder', int(
                    util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
            # Clocks throttling info
            # Divide by the mask so that the value is either 0 or 1 per GPU
            try:
                throttle_reasons = (
                    pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle))
                self.gauge('nvml.throttle.appsettings', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting) /
                    pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting,
                    tags=d_tags)
                self.gauge('nvml.throttle.display', (throttle_reasons &
                    GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS) /
                    GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS,
                    tags=d_tags)
                self.gauge('nvml.throttle.hardware', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonHwSlowdown) /
                    pynvml.nvmlClocksThrottleReasonHwSlowdown,
                    tags=d_tags)
                self.gauge('nvml.throttle.idle', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonGpuIdle) /
                    pynvml.nvmlClocksThrottleReasonGpuIdle,
                    tags=d_tags)
                self.gauge('nvml.throttle.power.hardware', (throttle_reasons &
                    GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE) /
                    GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE,
                    tags=d_tags)
                self.gauge('nvml.throttle.power.software', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonSwPowerCap) /
                    pynvml.nvmlClocksThrottleReasonSwPowerCap,
                    tags=d_tags)
                self.gauge('nvml.throttle.syncboost', (throttle_reasons &
                    GPU_THROTTLE_SYNCBOOST) / GPU_THROTTLE_SYNCBOOST,
                    tags=d_tags)
                self.gauge('nvml.throttle.temp.hardware', (throttle_reasons &
                    GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE) /
                    GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE,
                    tags=d_tags)
                self.gauge('nvml.throttle.temp.software', (throttle_reasons &
                    GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE) /
                    GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE,
                    tags=d_tags)
                self.gauge('nvml.throttle.unknown', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonUnknown) /
                    pynvml.nvmlClocksThrottleReasonUnknown,
                    tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetCurrentClocksThrottleReasons:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = ','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = 'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
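The throttle block above relies on module-level constants (GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS, GPU_THROTTLE_SYNCBOOST, and friends) that are defined elsewhere in that check. A minimal sketch of how they could be derived, assuming an older pynvml build that does not export every nvmlClocksThrottleReason* symbol; the hex fallbacks follow the bit layout in nvml.h:

import pynvml

# Fallback bitmask values (per nvml.h) for symbols that older pynvml
# releases do not expose; newer releases provide them directly.
GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS = getattr(
    pynvml, 'nvmlClocksThrottleReasonDisplayClockSetting', 0x100)
GPU_THROTTLE_SYNCBOOST = getattr(
    pynvml, 'nvmlClocksThrottleReasonSyncBoost', 0x10)
GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE = getattr(
    pynvml, 'nvmlClocksThrottleReasonHwPowerBrakeSlowdown', 0x80)
GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE = getattr(
    pynvml, 'nvmlClocksThrottleReasonHwThermalSlowdown', 0x40)
GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE = getattr(
    pynvml, 'nvmlClocksThrottleReasonSwThermalSlowdown', 0x20)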
Exemplo n.º 13
0
File: nvue.py Project: zyc4me/nvue
    mem_used = '{:<6}'.format(str(mem_info.used // mega) + 'M')
    show_str_lst.append('   ' + mem_used + '/' + mem_total)

    # temperature
    card_temp = ' ' + str(
        pml.nvmlDeviceGetTemperature(handle, pml.NVML_TEMPERATURE_GPU)) + 'C'
    show_str_lst.append(card_temp)

    # GPU utilization
    card_util_ratio = ' {:>3}'.format(
        pml.nvmlDeviceGetUtilizationRates(handle).gpu) + '%'
    show_str_lst.append(card_util_ratio)

    # per-process GPU usage
    p_str = ''
    procs = pml.nvmlDeviceGetComputeRunningProcesses(handle)
    for j, p in enumerate(procs):
        #pid = '  ' + str(p.pid) + ' '
        pid = '{:<7}'.format(p.pid)
        p_name = bytes.decode(pml.nvmlSystemGetProcessName(p.pid))
        p_name = ' {:<10} '.format(p_name)
        p_mem_used = ' ' + str(p.usedGpuMemory // mega) + 'M'

        pc = psutil.Process(p.pid)
        p_user = ' {:<10}'.format(pc.username())

        p_str = ' ' + pid + p_name + p_user + p_mem_used

        if j == 0:
            show_str_lst.append(p_str)
        else:
Exemplo n.º 14
0
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        for device_id in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # power info
            try:
                pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
                self.gauge('nvml.power.', pwr, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetPowerUsage:{}'.format(err))
            # fan info
            try:
                fan = pynvml.nvmlDeviceGetFanSpeed(handle)
                self.gauge('nvml.fan.', fan, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetFanSpeed:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
                self.gauge('nvml.util.encoder', int(
                    util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
                self.gauge('nvml.util.decoder', int(
                    util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
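Both check variants call a self._dict2list(tags) helper that is not shown here. A sketch of what such a method presumably does, assuming the usual Datadog "key:value" tag strings:

    def _dict2list(self, tags):
        # Assumed behaviour: flatten {'name': 'GeForce-0'} into
        # ['name:GeForce-0'], the tag format AgentCheck.gauge expects.
        return ['{}:{}'.format(k, v) for k, v in tags.items()]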
Exemplo n.º 15
0
    def get_gpu_info(handle):
        """Get one GPU information specified by nvml handle"""
        def get_process_info(nv_process):
            """Get the process information of a specific GPU process"""
            process = {}
            ps_process = psutil.Process(pid=nv_process.pid)
            process['username'] = ps_process.username()
            # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
            _cmdline = ps_process.cmdline()
            if not _cmdline:  # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                process['command'] = '?'
            else:
                process['command'] = os.path.basename(_cmdline[0])
            # Bytes to MBytes
            process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 /
                                              1024)
            process['pid'] = nv_process.pid
            return process

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()  # for python3, to unicode
            return b

        name = _decode(N.nvmlDeviceGetName(handle))
        uuid = _decode(N.nvmlDeviceGetUUID(handle))

        try:
            minor = int(N.nvmlDeviceGetMinorNumber(handle))
        except N.NVMLError:
            minor = None  # Not supported

        try:
            bus_id = _decode(N.nvmlDeviceGetPciInfo(handle).busId)
        except N.NVMLError:
            bus_id = None  # Not supported

        try:
            serial = _decode(N.nvmlDeviceGetSerial(handle))
        except N.NVMLError:
            serial = None  # Not supported

        try:
            temperature = N.nvmlDeviceGetTemperature(handle,
                                                     N.NVML_TEMPERATURE_GPU)
        except N.NVMLError:
            temperature = None  # Not supported

        try:
            memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
        except N.NVMLError:
            memory = None  # Not supported

        try:
            utilization = N.nvmlDeviceGetUtilizationRates(handle)
        except N.NVMLError:
            utilization = None  # Not supported

        try:
            power = N.nvmlDeviceGetPowerUsage(handle)
        except (N.NVMLError, N.NVMLError_NotSupported):
            power = None

        try:
            power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
        except (N.NVMLError, N.NVMLError_NotSupported):
            power_limit = None

        processes = []
        try:
            nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
        except N.NVMLError:
            nv_comp_processes = None  # Not supported
        try:
            nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
                handle)
        except N.NVMLError:
            nv_graphics_processes = None  # Not supported

        if nv_comp_processes is None and nv_graphics_processes is None:
            processes = None  # Not supported (in both cases)
        else:
            nv_comp_processes = nv_comp_processes or []
            nv_graphics_processes = nv_graphics_processes or []
            for nv_process in (nv_comp_processes + nv_graphics_processes):
                # TODO: could be more information such as system memory usage,
                # CPU percentage, create time etc.
                try:
                    process = get_process_info(nv_process)
                    processes.append(process)
                except psutil.NoSuchProcess:
                    # TODO: add some reminder for NVML broken context
                    # e.g. nvidia-smi reset  or  reboot the system
                    pass

        gpu_info = {
            'index': index,
            'uuid': uuid,
            'name': name,
            'minor': minor,
            'bus_id': bus_id,
            'serial': serial,
            'temperature_gpu': temperature,
            'utilization_gpu': utilization.gpu if utilization else None,
            'power_draw': int(power / 1000) if power is not None else None,
            'power_limit':
                int(power_limit / 1000) if power_limit is not None else None,
            'memory_free': int(memory.free) if memory else None,
            'memory_used': int(memory.used) if memory else None,
            'memory_total': int(memory.total) if memory else None,
            'memory_utilization': utilization.memory if utilization else None,
            'processes': processes,
        }
        return gpu_info
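get_gpu_info above works on one handle and reads index from its enclosing scope. A sketch of the surrounding driver loop, assuming get_gpu_info is nested inside it (the wrapper name new_query is hypothetical):

import pynvml as N

def new_query():
    """Initialise NVML, build one info dict per device, always shut down."""
    N.nvmlInit()
    try:
        gpu_list = []
        for index in range(N.nvmlDeviceGetCount()):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            # get_gpu_info is assumed to be defined in this scope so that it
            # can see `index` when building its result dict.
            gpu_list.append(get_gpu_info(handle))
        return gpu_list
    finally:
        N.nvmlShutdown()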
Exemplo n.º 16
0
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of a specific GPU process"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:  # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = int(nv_process.usedGpuMemory /
                                                  1024 / 1024)
                process['pid'] = nv_process.pid

                # For docker
                cmd = 'cat /proc/{}/cgroup'.format(nv_process.pid)
                ret = subprocess.check_output(cmd.split())
                container_id = str(ret).split('/')[2][:12]
                process['container_id'] = container_id

                cmd = 'docker ps -a'
                ret = subprocess.check_output(cmd.split())
                docker_data = str(ret).split('\\n')[1:-1]
                for personal in docker_data:
                    personal_data = personal.split()
                    if container_id == personal_data[0]:
                        process['container_user_name'] = personal_data[-1]

                return process

            def _decode(b):
                if isinstance(b, bytes):
                    return b.decode()  # for python3, to unicode
                return b

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            processes = []
            try:
                nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(
                    handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
                    handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None  # Not supported (in both cases)
            else:
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in (nv_comp_processes + nv_graphics_processes):
                    # TODO: could be more information such as system memory usage,
                    # CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': int(power / 1000) if power is not None else None,
                'enforced.power.limit':
                    int(power_limit / 1000) if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used':
                    int(memory.used / 1024 / 1024) if memory else None,
                'memory.total':
                    int(memory.total / 1024 / 1024) if memory else None,
                'processes': processes,
            }
            return gpu_info
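The container lookup above parses the default `docker ps -a` column layout by position, which is brittle. A sketch of a narrower query, not part of the original project, that asks Docker for just the id and name:

import subprocess

def container_name_for(short_id):
    """Resolve a 12-character container id to its name via `docker ps`."""
    out = subprocess.check_output(
        ['docker', 'ps', '--no-trunc', '--format', '{{.ID}} {{.Names}}'])
    for line in out.decode().splitlines():
        full_id, name = line.split(None, 1)
        if full_id.startswith(short_id):
            return name.strip()
    return None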
Exemplo n.º 17
0
File: core.py Project: tasksss/task
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]
                process['username'] = ps_process.username()

                # _cmdline = ps_process.cmdline()
                # if not _cmdline:

                #     process['command'] = '?'
                #     process['full_command'] = ['?']
                # else:
                #     process['command'] = os.path.basename(_cmdline[0])
                #     process['full_command'] = _cmdline

                # process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                # process['cpu_percent'] = ps_process.cpu_percent()
                # process['cpu_memory_usage'] = \
                #     round((ps_process.memory_percent() / 100.0) *
                #           psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            # try:
            #     temperature = N.nvmlDeviceGetTemperature(
            #         handle, N.NVML_TEMPERATURE_GPU
            #     )
            # except N.NVMLError:
            #     temperature = None  # Not supported

            # try:
            #     fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            # except N.NVMLError:
            #     fan_speed = None  # Not supported

            # try:
            #     memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            # except N.NVMLError:
            #     memory = None  # Not supported

            # try:
            #     utilization = N.nvmlDeviceGetUtilizationRates(handle)
            # except N.NVMLError:
            #     utilization = None  # Not supported

            # try:
            #     power = N.nvmlDeviceGetPowerUsage(handle)
            # except N.NVMLError:
            #     power = None

            # try:
            #     power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            # except N.NVMLError:
            #     power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    # process['cpu_percent'] = cache_process.cpu_percent()

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                # 'temperature.gpu': temperature,
                # 'fan.speed': fan_speed,
                # 'utilization.gpu': utilization.gpu if utilization else None,
                # 'power.draw': power // 1000 if power is not None else None,
                # 'enforced.power.limit': power_limit // 1000
                # if power_limit is not None else None,
                # Convert bytes into MBytes
                # 'memory.used': memory.used // MB if memory else None,
                # 'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
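GPUStatCollection.clean_processes() is called at the end but not shown. Presumably it evicts cached psutil handles for processes that have exited; a sketch under that assumption (psutil is already imported in this module):

    @staticmethod
    def clean_processes():
        # Drop cached psutil.Process objects whose pid no longer exists,
        # so the global_processes cache does not grow without bound.
        for pid in list(GPUStatCollection.global_processes.keys()):
            if not psutil.pid_exists(pid):
                del GPUStatCollection.global_processes[pid]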
Exemplo n.º 18
0
def pids(handle):
    return [
        p.pid for p in pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    ]
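A short usage sketch for the helper above; NVML still has to be initialised by the caller:

import pynvml

pynvml.nvmlInit()
try:
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        busy = pids(handle)
        print('GPU %d: %s' % (i, busy if busy else 'idle'))
finally:
    pynvml.nvmlShutdown()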
Exemplo n.º 19
0
            sysdata['ssd1_exist'] = False

        procs = deviceCount * [None]
        gpu_error = deviceCount * [False]
        for i in range(deviceCount):
            try:
                handle = nvmlDeviceGetHandleByIndex(i)
                name = nvmlDeviceGetName(handle)
                gpudata[i]['name'] = name.decode('utf-8')

                memInfo = nvmlDeviceGetMemoryInfo(handle)
                gpudata[i]['mem_free'] = toMB(memInfo.total - memInfo.used)
                gpudata[i]['mem_total'] = toMB(memInfo.total)
                gpudata[i]['mem_usage'] = memInfo.used / memInfo.total * 100

                procs_prefilter = nvmlDeviceGetComputeRunningProcesses(handle)
                # for unknown reasons, nvmlDeviceGetComputeRunningProcesses
                # sometimes returns nonexistent processes on 3090 GPUs
                procs[i] = []
                gpudata[i]['procs'] = []
                for p in procs_prefilter:
                    try:
                        P = psutil.Process(p.pid)
                        procs[i].append(p)
                        gpudata[i]['procs'].append((p.pid, ) + getprocinfo(P))
                    except psutil.NoSuchProcess:
                        pass
            except Exception as e:
                gpu_error[i] = True
                print('Unable to access GPU device (id: %d)' % i)
                print(e)
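getprocinfo(P) is an external helper in that project; judging by how its result is concatenated onto (p.pid,), it returns a tuple of process attributes. A purely hypothetical sketch:

import psutil

def getprocinfo(proc):
    """Return (name, username, rss_mb) for a psutil.Process (assumed fields)."""
    with proc.oneshot():
        return (proc.name(), proc.username(),
                proc.memory_info().rss // (1024 * 1024))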
Exemplo n.º 20
0
        def get_gpu_info(handle, mig_handle=None):
            """Get one GPU information specified by nvml handle"""         

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]

                # TODO: ps_process is being cached, but the dict below is not.
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                # if the driver is not in TCC mode this will be None.
                usedmem = nv_process.usedGpuMemory // MB if \
                          nv_process.usedGpuMemory else None
                process['gpu_memory_usage'] = usedmem
                process['cpu_percent'] = ps_process.cpu_percent()
                process['cpu_memory_usage'] = \
                    round((ps_process.memory_percent() / 100.0) *
                          psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process
                        
            name = _decode(N.nvmlDeviceGetName(mig_handle if mig_handle else handle))
            uuid = _decode(N.nvmlDeviceGetUUID(mig_handle if mig_handle else handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError as e:
                log.add_exception("temperature", e)
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError as e:
                log.add_exception("fan_speed", e)
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(mig_handle if mig_handle else handle)  # in Bytes
            except N.NVMLError as e:
                log.add_exception("memory", e)
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError as e:
                log.add_exception("utilization", e)
                utilization = None  # Not supported

            try:
                utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
            except N.NVMLError as e:
                log.add_exception("utilization_enc", e)
                utilization_enc = None  # Not supported

            try:
                utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
            except N.NVMLError as e:
                log.add_exception("utilization_dnc", e)
                utilization_dec = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError as e:
                log.add_exception("power", e)
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError as e:
                log.add_exception("power_limit", e)
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(mig_handle if mig_handle else handle)
            except N.NVMLError as e:
                log.add_exception("compute_processes", e)
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(mig_handle if mig_handle else handle)
            except N.NVMLError as e:
                log.add_exception("graphics_processes", e)
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                # A single process might run in both graphics and compute mode;
                # however, we will display the process only once.
                seen_pids = set()
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    if nv_process.pid in seen_pids:
                        continue
                    seen_pids.add(nv_process.pid)
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass
                    except FileNotFoundError:
                        # Ignore the exception, which has probably occurred
                        # from psutil, due to a non-existent PID (see #95).
                        # The exception should have been translated, but
                        # there appears to be a bug in psutil. It is unlikely
                        # FileNotFoundError is thrown in different situations.
                        pass


                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    process['cpu_percent'] = cache_process.cpu_percent()

            index = str(N.nvmlDeviceGetIndex(handle))
            if mig_handle:
                index += ':' + str(N.nvmlDeviceGetIndex(mig_handle))
            
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'utilization.enc':
                    utilization_enc[0] if utilization_enc else None,
                'utilization.dec':
                    utilization_dec[0] if utilization_dec else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
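get_gpu_info accepts an optional mig_handle; a sketch of how the MIG children of each physical device might be enumerated, assuming a pynvml build that exposes the MIG API:

import pynvml as N

def iter_mig_handles(handle):
    """Yield MIG device handles under a physical GPU, if any."""
    try:
        max_mig = N.nvmlDeviceGetMaxMigDeviceCount(handle)
    except N.NVMLError:
        return  # MIG API not available or GPU not MIG-capable
    for mig_index in range(max_mig):
        try:
            yield N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index)
        except N.NVMLError:
            continue  # unoccupied MIG slot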
Exemplo n.º 21
0
def getprocs():
    for i in range(nv.nvmlDeviceGetCount()):
        hdl = nv.nvmlDeviceGetHandleByIndex(i)
        for p in nv.nvmlDeviceGetComputeRunningProcesses(hdl):
            yield p.pid
Exemplo n.º 22
0
import pynvml as nv
nv.nvmlInit()
for i in range(nv.nvmlDeviceGetCount()):
    hndl = nv.nvmlDeviceGetHandleByIndex(i)
    if not nv.nvmlDeviceGetComputeRunningProcesses(hndl):
        visable_device = str(i)
        break
nv.nvmlShutdown()

import tensorflow as tf
import os
try:
    os.environ["CUDA_VISIBLE_DEVICES"] = visable_device
except NameError:
    print('No available gpu')
    exit()
import numpy as np
np.set_printoptions(precision=4, suppress=True, linewidth=100)
import tensorflow.contrib.layers as ly
from mnist import read_data_sets
from mnist import dense_to_one_hot

clabel = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
dataset = read_data_sets('MNIST_data',
                         one_hot=False,
                         validation_size=0,
                         clabel=clabel)

n_rdims = 19
n_classes = 20
n_features = 32
Exemplo n.º 23
0
        # power state
        power_used = nvmlDeviceGetPowerUsage(handle) / 1000
        power_limit = nvmlDeviceGetPowerManagementDefaultLimit(handle) / 1000
        power_used = int(power_used)
        power_limit = int(power_limit)
        power_rate = int(power_used / power_limit * 100)
        msg = pack_msg([power_used, power_limit], 'W')
        info['Power Util'] = getBar(power_rate, msg)

        # fan speed, temperature
        fan_speed = nvmlDeviceGetFanSpeed(handle)
        temp = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
        msg = f"{temp}C"
        info['Fan Speed'] = getBar(fan_speed, msg)

        message = [f"{k} \t{v}" for k, v in info.items()]
        print('\n'.join(message))

        # graphic processes
        graphic_processes = nvmlDeviceGetGraphicsRunningProcesses(handle)
        header = "\n=== Graphic Processes ==="
        show_process(header, graphic_processes)

        # compute processes
        compute_processes = nvmlDeviceGetComputeRunningProcesses(handle)
        header = "\n=== Compute Processes ==="
        show_process(header, compute_processes)

    nvmlShutdown()
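show_process, getBar and pack_msg are defined elsewhere in that script. A sketch of show_process under the assumption that it only prints pid, name and memory per entry:

from pynvml import nvmlSystemGetProcessName

def show_process(header, processes, mega=1024 * 1024):
    """Print a simple table for a list of NVML process entries."""
    print(header)
    if not processes:
        print('  (none)')
        return
    for p in processes:
        name = nvmlSystemGetProcessName(p.pid)
        if isinstance(name, bytes):
            name = name.decode()
        # usedGpuMemory can be None on WDDM drivers
        used = p.usedGpuMemory // mega if p.usedGpuMemory else 0
        print('  pid=%-7d %-20s %dMiB' % (p.pid, name, used))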
Exemplo n.º 24
-1
    def _summary(self):
        summary = []
        summary.append("GPU running Processes:")
        initGPU()
        try:
            gpusToUse = [int(n) for n in (self.gpusToUse.get()).split()]
            for i in gpusToUse:
                handle = nvmlDeviceGetHandleByIndex(i)
                cps = nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    # p_tags['pid'] = ps.pid
                    msg = " %d) " % i + psutil.Process(ps.pid).name()
                    msg += " (mem =%.2f MB)" % (float(ps.usedGpuMemory) /
                                                1048576.)
                    summary.append(msg)
        except NVMLError as err:
            summary.append(str(err))

        return summary