예제 #1
0
def _get_gpu_usage(gpu_count):
    """Collect a usage snapshot for GPUs ``0 .. gpu_count - 1`` via pynvml.

    Returns a list holding one tuple per device, in index order:
    ``(handle, gpu_util, memory_util, memory_used_percent, temperature,
    power_percent)``.  ``power_percent`` is the current power draw as a
    percentage of the enforced power limit, or ``-9999`` when the power
    queries raise an NVML error.  If any other per-device query raises an
    NVML error, the error is logged and ``None`` is returned instead of a
    partial list.
    """
    import pynvml

    snapshots = []
    for device_index in range(gpu_count):
        # The handle lookup sits outside the try block, so an NVML error
        # raised here propagates to the caller.
        device = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        try:
            rates = pynvml.nvmlDeviceGetUtilizationRates(device)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(device)
            temperature = pynvml.nvmlDeviceGetTemperature(
                device, pynvml.NVML_TEMPERATURE_GPU)
            try:
                draw = pynvml.nvmlDeviceGetPowerUsage(device) / 1000.0
                limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(device) / 1000.0
                power_percent = draw / limit * 100
            except pynvml.NVMLError as power_error:
                # A failed power query is reported with a sentinel value
                # instead of failing the whole device.
                message = (
                    "Coudln't extract power usage due to NVML exception: {}"
                    .format(str(power_error)))
                logger.error(message)
                power_percent = -9999
            memory_percent = (mem_info.used / float(mem_info.total)) * 100
            snapshots.append((device, rates.gpu, rates.memory,
                              memory_percent, temperature, power_percent))
        except pynvml.NVMLError as query_error:
            message = (
                "Coudln't extract gpu usage information due to NVML exception: {}"
                .format(str(query_error)))
            logger.error(message)
            return None
    return snapshots
예제 #2
0
파일: gpu.py 프로젝트: sanders41/codecarbon
def get_power_limit(handle):
    """Return the enforced power limit NVML reports for ``handle``.

    The value is whatever ``nvmlDeviceGetEnforcedPowerLimit`` returns
    (milliwatts, per the NVML reference linked below).  ``None`` is returned
    when the query raises any exception.
    https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g263b5bf552d5ec7fcd29a088264d10ad
    """
    try:
        limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(handle)
    except Exception:
        # Best effort: any failure is reported as "unknown".
        return None
    return limit
예제 #3
0
    def _crawl_in_system(self):
        '''
        Yield one ``(key, entry, 'gpu')`` tuple per GPU reported by NVML.

        ``entry`` is a dict with ``utilization`` (gpu, memory), ``memory``
        (total, free, used; NVML values divided by 1024 twice),
        ``temperature`` and ``power`` (draw, limit; NVML values divided by
        1000).  Nothing is yielded when ``self._init_nvml()`` returns -1.

        NVML is shut down in a ``finally`` clause, so it is released when the
        generator is exhausted, closed early, or interrupted by an exception,
        and also when no GPUs are present.

        nvidia-smi returns following: MEMORY, UTILIZATION, ECC, TEMPERATURE,
        POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS,
        PAGE_RETIREMENT, ACCOUNTING

        currently, following are requested based on dlaas requirements:
            utilization.gpu, utilization.memory,
            memory.total, memory.free, memory.used
        nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
            memory.total,memory.free,memory.used --format=csv,noheader,nounits
        '''

        if self._init_nvml() == -1:
            return

        try:
            self.inspect_arr = exec_dockerps()

            num_gpus = pynvml.nvmlDeviceGetCount()

            for gpuid in range(num_gpus):
                gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
                temperature = pynvml.nvmlDeviceGetTemperature(
                    gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
                memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
                power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000
                power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                    gpuhandle) / 1000
                util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
                entry = {
                    'utilization': {
                        'gpu': util.gpu,
                        'memory': util.memory
                    },
                    'memory': {
                        'total': memory.total / 1024 / 1024,
                        'free': memory.free / 1024 / 1024,
                        'used': memory.used / 1024 / 1024
                    },
                    'temperature': temperature,
                    'power': {
                        'draw': power_draw,
                        'limit': power_limit
                    }
                }
                key = self._get_feature_key(gpuhandle, gpuid)

                yield (key, entry, 'gpu')
        finally:
            # Always pair the successful _init_nvml() with a shutdown, even
            # when a query raises or the consumer stops iterating early.
            self._shutdown_nvml()
예제 #4
0
def get_gpu_status(gpu_index=0):
    """Query NVML for a single GPU and wrap the result in a ``GPUStat``.

    NVML is initialised on entry and is always shut down before the function
    returns or raises, so a failing query does not leave NVML initialised.

    Args:
        gpu_index: NVML device index to query (default 0).

    Returns:
        ``GPUStat`` built from a dict with the keys ``index``, ``uuid``,
        ``name``, ``temperature``, ``utilization``, ``power``,
        ``enforced.power``, ``memory.used`` and ``memory.total``.  A metric
        whose query raises ``N.NVMLError`` is reported as ``None``.

    Raises:
        N.NVMLError: if initialisation, the handle lookup, or the name, UUID
            or index query fails.
    """
    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # to unicode
        return b

    def _query(nvml_call, *args):
        """Return ``nvml_call(*args)``, or ``None`` when NVML raises."""
        try:
            return nvml_call(*args)
        except N.NVMLError:
            return None

    # init for getting
    N.nvmlInit()
    try:
        handle = N.nvmlDeviceGetHandleByIndex(gpu_index)

        name = _decode(N.nvmlDeviceGetName(handle))
        uuid = _decode(N.nvmlDeviceGetUUID(handle))

        # Optional metrics: not every device supports every query.
        temperature = _query(
            N.nvmlDeviceGetTemperature, handle, N.NVML_TEMPERATURE_GPU)
        memory = _query(N.nvmlDeviceGetMemoryInfo, handle)
        utilization = _query(N.nvmlDeviceGetUtilizationRates, handle)
        power = _query(N.nvmlDeviceGetPowerUsage, handle)
        power_limit = _query(N.nvmlDeviceGetEnforcedPowerLimit, handle)

        # real gpu index
        index = N.nvmlDeviceGetIndex(handle)
        gpu_info = {
            'index': index,
            'uuid': uuid,
            'name': name,
            'temperature': temperature,
            'utilization': utilization.gpu if utilization else None,
            # NVML power values are divided by 1000 (milliwatts -> watts)
            'power': int(power / 1000) if power is not None else None,
            'enforced.power':
                int(power_limit / 1000) if power_limit is not None else None,
            # Convert bytes into MBytes
            'memory.used': int(memory.used / 1024 / 1024) if memory else None,
            'memory.total':
                int(memory.total / 1024 / 1024) if memory else None,
        }
    finally:
        # release resource
        N.nvmlShutdown()
    return GPUStat(gpu_info)
예제 #5
0
파일: sys_stat.py 프로젝트: Kesci/mwutils
    def stats(self):
        """Return a flat dict of GPU metrics plus host CPU/memory usage.

        For each GPU index ``i`` below ``self.gpu_count`` the keys
        ``gpu.i.gpu``, ``gpu.i.memory``, ``gpu.i.memoryAllocated`` and
        ``gpu.i.temp`` are set, followed by ``gpu.i.powerWatts`` and
        ``gpu.i.powerPercent`` when the power queries succeed.  When
        ``gpu_in_use_by_this_process`` reports the GPU as in use, the same
        values are also stored under the ``gpu.process.i.*`` prefix.  A GPU
        whose queries raise an NVML error is skipped silently.  If ``psutil``
        is truthy, ``cpu`` and ``memory`` (system memory percent) are added.
        """
        collected = {}
        for gpu_index in range(self.gpu_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
            try:
                rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                temperature = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                ours = gpu_in_use_by_this_process(handle)

                # Device-wide keys always; per-process keys only when this
                # process is using the GPU.
                if ours:
                    key_templates = ("gpu.{}.{}", "gpu.process.{}.{}")
                else:
                    key_templates = ("gpu.{}.{}",)

                allocated = (mem_info.used / float(mem_info.total)) * 100
                readings = (
                    ("gpu", rates.gpu),
                    ("memory", rates.memory),
                    ("memoryAllocated", allocated),
                    ("temp", temperature),
                )
                for template in key_templates:
                    for metric, value in readings:
                        collected[template.format(gpu_index, metric)] = value

                # Some GPUs don't provide information about power usage
                try:
                    watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
                    capacity_watts = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                        handle) / 1000.0
                    percent = (watts / capacity_watts) * 100

                    power_readings = (
                        ("powerWatts", watts),
                        ("powerPercent", percent),
                    )
                    for template in key_templates:
                        for metric, value in power_readings:
                            collected[template.format(gpu_index,
                                                      metric)] = value
                except pynvml.NVMLError:
                    pass

            except pynvml.NVMLError:
                pass

        if psutil:
            system_memory = psutil.virtual_memory()
            collected["cpu"] = psutil.cpu_percent()
            collected["memory"] = system_memory.percent
        return collected
    def _crawl_in_system(self):
        '''
        Yield one ``(key, entry, 'gpu')`` tuple per GPU reported by NVML.

        ``entry`` is a dict with ``utilization`` (gpu, memory), ``memory``
        (total, free, used; NVML values divided by 1024 twice),
        ``temperature`` and ``power`` (draw, limit; NVML values divided by
        1000).  Nothing is yielded when ``self._init_nvml()`` returns -1.

        NVML is shut down in a ``finally`` clause, so it is released when the
        generator is exhausted, closed early, or interrupted by an exception,
        and also when no GPUs are present.

        nvidia-smi returns following: MEMORY, UTILIZATION, ECC, TEMPERATURE,
        POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS,
        PAGE_RETIREMENT, ACCOUNTING

        currently, following are requested based on dlaas requirements:
            utilization.gpu, utilization.memory,
            memory.total, memory.free, memory.used
        nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
            memory.total,memory.free,memory.used --format=csv,noheader,nounits
        '''

        if self._init_nvml() == -1:
            return

        try:
            self.inspect_arr = exec_dockerps()

            num_gpus = pynvml.nvmlDeviceGetCount()

            for gpuid in range(num_gpus):
                gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
                temperature = pynvml.nvmlDeviceGetTemperature(
                    gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
                memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
                power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000
                power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                    gpuhandle) / 1000
                util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
                entry = {
                    'utilization': {'gpu': util.gpu, 'memory': util.memory},
                    'memory': {'total': memory.total / 1024 / 1024,
                               'free': memory.free / 1024 / 1024,
                               'used': memory.used / 1024 / 1024},
                    'temperature': temperature,
                    'power': {'draw': power_draw, 'limit': power_limit}
                }
                key = self._get_feature_key(gpuhandle, gpuid)

                yield (key, entry, 'gpu')
        finally:
            # Always pair the successful _init_nvml() with a shutdown, even
            # when a query raises or the consumer stops iterating early.
            self._shutdown_nvml()
예제 #7
0
def get_power(handle):
    """Return ``(power_usage, power_max, power_percent)`` for ``handle``.

    The usage and limit values read from NVML are divided by 1000 (NVML
    reports them in milliwatts) and the percentage is usage relative to the
    limit.  Any exception is swallowed: values not yet computed when the
    failure occurs keep their ``-1`` default.
    """
    draw = limit = percent = -1
    try:
        # defaults to milliwatts
        draw = pynvml.nvmlDeviceGetPowerUsage(handle)
        draw = draw / 1000.
        # defaults to milliwatts
        limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(handle)
        limit = limit / 1000

        percent = (float(draw) / limit) * 100.
    except Exception:
        # Best effort: report whatever was gathered before the failure.
        pass
    return (draw, limit, percent)
예제 #8
0
def get_power(handle):
    """Return ``(power_usage, power_max, power_percent)`` for ``handle``.

    The usage and limit values read from NVML are divided by 1000 (NVML
    reports them in milliwatts) and the percentage is usage relative to the
    limit.  Any exception is swallowed: values not yet computed when the
    failure occurs keep their ``-1`` default.
    """
    draw = limit = percent = -1
    try:
        # defaults to milliwatts
        draw = pynvml.nvmlDeviceGetPowerUsage(handle)
        draw = draw / 1000.
        # defaults to milliwatts
        limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(handle)
        limit = limit / 1000

        percent = (float(draw) / limit) * 100.
    except Exception:
        # Best effort: report whatever was gathered before the failure.
        pass
    return (draw, limit, percent)
예제 #9
0
파일: wgpu.py 프로젝트: Spiess/wgpu
def get_gpu_pid_info():
    """Retrieves the process IDs of processes running on the GPU.

    Returns:
        A ``(gpus, device_count)`` tuple.  ``gpus`` has one dict per device
        with ``id``, ``name`` and ``processes`` (a list of
        ``{'pid', 'name'}`` dicts) plus, when the corresponding query
        succeeds, ``utilization``, ``mem_total``, ``mem_used``,
        ``fan_speed``, ``temp``, ``power_usage`` and ``power_limit``.
        ``device_count`` is ``-1`` and ``gpus`` is empty when NVML cannot be
        initialised or the device count cannot be read.

    Per-metric NVML errors are printed to stderr and the metric is omitted.
    NVML is shut down only if it was successfully initialised.
    """

    gpus = []
    device_count = -1

    try:
        nvmlInit()
    except NVMLError as err:
        # Nothing to shut down: calling nvmlShutdown() here would raise.
        print(f'Error while reading GPU information: {err}', file=sys.stderr)
        return gpus, device_count

    try:
        device_count = nvmlDeviceGetCount()

        # One distinct dict per slot; ``[{}] * n`` would alias a single dict.
        gpus = [{} for _ in range(device_count)]

        for i in range(device_count):
            gpus[i] = {'id': i}
            handle = nvmlDeviceGetHandleByIndex(i)
            device_name = nvmlDeviceGetName(handle)

            gpus[i]['name'] = device_name

            try:
                util = nvmlDeviceGetUtilizationRates(handle)
                gpus[i]['utilization'] = util.gpu
            except NVMLError as err:
                print(f'Error while reading GPU utilization for GPU {i}: {err}', file=sys.stderr)

            try:
                mem_info = nvmlDeviceGetMemoryInfo(handle)
                gpus[i]['mem_total'] = mem_info.total
                gpus[i]['mem_used'] = mem_info.used
            except NVMLError as err:
                print(f'Error while reading memory utilization for GPU {i}: {err}', file=sys.stderr)

            try:
                fan_speed = nvmlDeviceGetFanSpeed(handle)
                gpus[i]['fan_speed'] = fan_speed
            except NVMLError as err:
                print(f'Error while reading fan speed for GPU {i}: {err}', file=sys.stderr)

            try:
                # NOTE(review): sensor 0 is presumably NVML_TEMPERATURE_GPU;
                # confirm against the NVML enum.
                temp = nvmlDeviceGetTemperature(handle, 0)
                gpus[i]['temp'] = temp
            except NVMLError as err:
                print(f'Error while reading temperature for GPU {i}: {err}', file=sys.stderr)

            try:
                power_usage = nvmlDeviceGetPowerUsage(handle)
                gpus[i]['power_usage'] = round(power_usage / 1000.)
            except NVMLError as err:
                print(f'Error while reading power usage for GPU {i}: {err}', file=sys.stderr)

            try:
                power_limit = nvmlDeviceGetEnforcedPowerLimit(handle)
                gpus[i]['power_limit'] = round(power_limit / 1000.)
            except NVMLError as err:
                print(f'Error while reading power limit for GPU {i}: {err}', file=sys.stderr)

            gpus[i]['processes'] = []

            try:
                processes = nvmlDeviceGetComputeRunningProcesses(handle)

                for process in processes:
                    process_name = nvmlSystemGetProcessName(process.pid)
                    # Older pynvml releases return bytes, newer ones str.
                    if isinstance(process_name, bytes):
                        process_name = process_name.decode()
                    gpus[i]['processes'].append({'pid': process.pid, 'name': process_name})

            except NVMLError as err:
                print(f'Error while reading processes for GPU {i}: {err}', file=sys.stderr)

    except NVMLError as err:
        print(f'Error while reading GPU information: {err}', file=sys.stderr)
    finally:
        # Pair the successful nvmlInit() with a shutdown on every exit path.
        nvmlShutdown()

    return gpus, device_count
예제 #10
0
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle

            Every optional NVML metric is queried independently; a metric
            whose query raises ``N.NVMLError`` is reported as ``None``.

            Returns a dict with the keys ``index``, ``uuid``, ``name``,
            ``temperature.gpu``, ``fan.speed``, ``utilization.gpu``,
            ``power.draw``, ``enforced.power.limit``, ``memory.used``,
            ``memory.total`` and ``processes``.  ``processes`` is ``None``
            when neither process query is supported, otherwise a list of
            per-process dicts (compute processes first, then graphics).
            """
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes.  NVML may report no per-process memory
                # figure (usedGpuMemory is None); pass that through instead
                # of failing on ``None // MB``.
                used_bytes = nv_process.usedGpuMemory
                process['gpu_memory_usage'] = (
                    used_bytes // MB if used_bytes is not None else None)
                process['pid'] = nv_process.pid
                return process

            def _query(nvml_call, *args):
                """Return ``nvml_call(*args)``, or ``None`` if unsupported."""
                try:
                    return nvml_call(*args)
                except N.NVMLError:
                    return None  # Not supported

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            temperature = _query(
                N.nvmlDeviceGetTemperature, handle, N.NVML_TEMPERATURE_GPU)
            fan_speed = _query(N.nvmlDeviceGetFanSpeed, handle)
            memory = _query(N.nvmlDeviceGetMemoryInfo, handle)  # in Bytes
            utilization = _query(N.nvmlDeviceGetUtilizationRates, handle)
            power = _query(N.nvmlDeviceGetPowerUsage, handle)
            power_limit = _query(N.nvmlDeviceGetEnforcedPowerLimit, handle)
            nv_comp_processes = _query(
                N.nvmlDeviceGetComputeRunningProcesses, handle)
            nv_graphics_processes = _query(
                N.nvmlDeviceGetGraphicsRunningProcesses, handle)

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit':
                    power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info
예제 #11
0
파일: core.py 프로젝트: vinthony/gpustat
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle

            Every optional NVML metric is queried independently; a metric
            whose query raises ``N.NVMLError`` is reported as ``None``.
            Per-process CPU usage is sampled twice, 0.1 s apart, through the
            ``psutil.Process`` objects cached in
            ``GPUStatCollection.global_processes``.  A process that exits
            between the two samples is reported with ``cpu_percent`` 0.0
            instead of aborting the whole query.

            Returns a dict with the keys ``index``, ``uuid``, ``name``,
            ``temperature.gpu``, ``fan.speed``, ``utilization.gpu``,
            ``power.draw``, ``enforced.power.limit``, ``memory.used``,
            ``memory.total`` and ``processes``.
            """
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                # if drivers are not TTC this will be None.
                usedmem = nv_process.usedGpuMemory // MB if \
                          nv_process.usedGpuMemory else None
                process['gpu_memory_usage'] = usedmem
                process['cpu_percent'] = ps_process.cpu_percent()
                process['cpu_memory_usage'] = \
                    round((ps_process.memory_percent() / 100.0) *
                          psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process

            def _query(nvml_call, *args):
                """Return ``nvml_call(*args)``, or ``None`` if unsupported."""
                try:
                    return nvml_call(*args)
                except N.NVMLError:
                    return None  # Not supported

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            temperature = _query(
                N.nvmlDeviceGetTemperature, handle, N.NVML_TEMPERATURE_GPU)
            fan_speed = _query(N.nvmlDeviceGetFanSpeed, handle)
            memory = _query(N.nvmlDeviceGetMemoryInfo, handle)  # in Bytes
            utilization = _query(N.nvmlDeviceGetUtilizationRates, handle)
            power = _query(N.nvmlDeviceGetPowerUsage, handle)
            power_limit = _query(N.nvmlDeviceGetEnforcedPowerLimit, handle)
            nv_comp_processes = _query(
                N.nvmlDeviceGetComputeRunningProcesses, handle)
            nv_graphics_processes = _query(
                N.nvmlDeviceGetGraphicsRunningProcesses, handle)

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    try:
                        process['cpu_percent'] = cache_process.cpu_percent()
                    except psutil.NoSuchProcess:
                        # The process exited during the sleep above.
                        process['cpu_percent'] = 0.0

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit':
                    power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info
예제 #12
0
파일: stats.py 프로젝트: leopd/wandb_client
    def stats(self):
        """Return a dict of GPU, host and current-process metrics.

        For each GPU index ``i`` below ``self.gpu_count`` the keys
        ``gpu.i.gpu``, ``gpu.i.memory``, ``gpu.i.memoryAllocated`` and
        ``gpu.i.temp`` are set, followed by ``gpu.i.powerWatts`` and
        ``gpu.i.powerPercent`` when the power queries succeed.  When
        ``gpu_in_use_by_this_process`` reports the GPU as in use, the same
        values are also stored under the ``gpu.process.i.*`` prefix.  A GPU
        whose queries raise an NVML error is skipped silently.

        If ``psutil`` is truthy the result also carries ``cpu``, ``memory``,
        ``network`` (bytes sent/received relative to ``self.network_init``),
        ``disk`` (usage percent of ``/``), ``proc.memory.availableMB`` and,
        unless ``self.proc`` no longer exists, ``proc.memory.rssMB``,
        ``proc.memory.percent`` and ``proc.cpu.threads``.
        """
        collected = {}
        for gpu_index in range(self.gpu_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
            try:
                rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                temperature = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                ours = gpu_in_use_by_this_process(handle)

                # Device-wide keys always; per-process keys only when this
                # process is using the GPU.
                if ours:
                    key_templates = ("gpu.{}.{}", "gpu.process.{}.{}")
                else:
                    key_templates = ("gpu.{}.{}",)

                allocated = (mem_info.used / float(mem_info.total)) * 100
                readings = (
                    ("gpu", rates.gpu),
                    ("memory", rates.memory),
                    ("memoryAllocated", allocated),
                    ("temp", temperature),
                )
                for template in key_templates:
                    for metric, value in readings:
                        collected[template.format(gpu_index, metric)] = value

                # Some GPUs don't provide information about power usage
                try:
                    watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
                    capacity_watts = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                        handle) / 1000.0
                    percent = (watts / capacity_watts) * 100

                    power_readings = (
                        ("powerWatts", watts),
                        ("powerPercent", percent),
                    )
                    for template in key_templates:
                        for metric, value in power_readings:
                            collected[template.format(gpu_index,
                                                      metric)] = value
                except pynvml.NVMLError:
                    pass

            except pynvml.NVMLError:
                pass

        if psutil:
            net_counters = psutil.net_io_counters()
            system_memory = psutil.virtual_memory()
            collected["cpu"] = psutil.cpu_percent()
            collected["memory"] = system_memory.percent
            sent_delta = net_counters.bytes_sent - self.network_init["sent"]
            recv_delta = net_counters.bytes_recv - self.network_init["recv"]
            collected["network"] = {"sent": sent_delta, "recv": recv_delta}
            # TODO: maybe show other partitions, will likely need user to configure
            collected["disk"] = psutil.disk_usage('/').percent
            collected["proc.memory.availableMB"] = (
                system_memory.available / 1048576.0)
            try:
                rss_bytes = self.proc.memory_info().rss
                collected["proc.memory.rssMB"] = rss_bytes / 1048576.0
                collected["proc.memory.percent"] = self.proc.memory_percent()
                collected["proc.cpu.threads"] = self.proc.num_threads()
            except psutil.NoSuchProcess:
                pass
        return collected
예제 #13
0
        def get_gpu_info(handle):
            """Build the info dict for the GPU behind the given NVML handle.

            Each optional NVML metric is queried on its own; a query that
            raises ``N.NVMLError`` yields ``None``.  In the returned dict,
            ``utilization.gpu``, ``power.draw``, ``enforced.power.limit``,
            ``memory.used`` and ``memory.total`` fall back to 0 for such
            metrics, while the other optional fields fall back to ``None``.
            Per-process CPU usage is sampled twice, 0.1 s apart, through the
            ``psutil.Process`` objects cached in
            ``GPUStatCollection.global_processes``.
            """

            def get_process_info(nv_process):
                """Describe one NVML process entry as a plain dict."""
                cache = GPUStatCollection.global_processes
                pid = nv_process.pid
                if pid not in cache:
                    cache[pid] = psutil.Process(pid=pid)
                ps_process = cache[pid]

                # TODO: ps_process is being cached, but the dict below is not.
                info = {'username': ps_process.username()}
                # cmdline() gives the full argv; keep the basename of the
                # executable as the short name, as `ps -o comm` does.
                argv = ps_process.cmdline()
                if argv:
                    info['command'] = os.path.basename(argv[0])
                    info['full_command'] = argv
                else:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    info['command'] = '?'
                    info['full_command'] = ['?']
                # Bytes to MBytes; stays None when the driver reports no
                # per-process memory figure.
                used_bytes = nv_process.usedGpuMemory
                info['gpu_memory_usage'] = (
                    used_bytes // MB if used_bytes else None)
                info['cpu_percent'] = ps_process.cpu_percent()
                memory_fraction = ps_process.memory_percent() / 100.0
                resident_bytes = round(
                    memory_fraction * psutil.virtual_memory().total)
                info['cpu_memory_usage'] = resident_bytes // MB
                info['pid'] = pid
                return info

            def _query(nvml_call, *args):
                """Return ``nvml_call(*args)``, or ``None`` if unsupported."""
                try:
                    return nvml_call(*args)
                except N.NVMLError:
                    return None  # Not supported

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            temperature = _query(
                N.nvmlDeviceGetTemperature, handle, N.NVML_TEMPERATURE_GPU)
            fan_speed = _query(N.nvmlDeviceGetFanSpeed, handle)
            memory = _query(N.nvmlDeviceGetMemoryInfo, handle)  # in Bytes
            utilization = _query(N.nvmlDeviceGetUtilizationRates, handle)
            utilization_enc = _query(N.nvmlDeviceGetEncoderUtilization, handle)
            utilization_dec = _query(N.nvmlDeviceGetDecoderUtilization, handle)
            power = _query(N.nvmlDeviceGetPowerUsage, handle)
            power_limit = _query(N.nvmlDeviceGetEnforcedPowerLimit, handle)
            compute_procs = _query(
                N.nvmlDeviceGetComputeRunningProcesses, handle)
            graphics_procs = _query(
                N.nvmlDeviceGetGraphicsRunningProcesses, handle)

            if compute_procs is None and graphics_procs is None:
                processes = None
            else:
                processes = []
                candidates = (compute_procs or []) + (graphics_procs or [])
                # A single process might run in both of graphics and compute
                # mode, however it is listed only once.
                seen_pids = set()
                for nv_process in candidates:
                    if nv_process.pid in seen_pids:
                        continue
                    seen_pids.add(nv_process.pid)
                    try:
                        processes.append(get_process_info(nv_process))
                    except (psutil.NoSuchProcess, FileNotFoundError):
                        # NoSuchProcess: the PID is already gone.
                        # FileNotFoundError: probably raised by psutil for a
                        # non-existent PID without being translated (see #95).
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for entry in processes:
                    cached = GPUStatCollection.global_processes[entry['pid']]
                    try:
                        entry['cpu_percent'] = cached.cpu_percent()
                    except (psutil.NoSuchProcess, FileNotFoundError):
                        # Same two failure modes as above; report the
                        # vanished process as idle.
                        entry['cpu_percent'] = 0.0

            index = N.nvmlDeviceGetIndex(handle)

            gpu_info = {}
            gpu_info['index'] = index
            gpu_info['uuid'] = uuid
            gpu_info['name'] = name
            gpu_info['temperature.gpu'] = temperature
            gpu_info['fan.speed'] = fan_speed
            gpu_info['utilization.gpu'] = utilization.gpu if utilization else 0
            gpu_info['utilization.enc'] = (
                utilization_enc[0] if utilization_enc else None)
            gpu_info['utilization.dec'] = (
                utilization_dec[0] if utilization_dec else None)
            gpu_info['power.draw'] = power // 1000 if power is not None else 0
            gpu_info['enforced.power.limit'] = (
                power_limit // 1000 if power_limit is not None else 0)
            # Convert bytes into MBytes
            gpu_info['memory.used'] = memory.used // MB if memory else 0
            gpu_info['memory.total'] = memory.total // MB if memory else 0
            gpu_info['processes'] = processes

            GPUStatCollection.clean_processes()
            return gpu_info
예제 #14
0
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle.

            The name, UUID and index queries are issued unguarded.  Every
            other NVML query is attempted on its own, and a query that raises
            ``N.NVMLError`` leaves the corresponding field as ``None``.

            Args:
                handle: NVML device handle passed to the ``N.nvmlDevice*``
                    calls.

            Returns:
                dict: keys ``index``, ``uuid``, ``name``, ``temperature.gpu``,
                ``fan.speed``, ``utilization.gpu``, ``power.draw``,
                ``enforced.power.limit``, ``memory.used``, ``memory.total``,
                ``processes`` and ``last_used``.  ``processes`` is ``None``
                when neither running-process query succeeded, otherwise a
                list of per-process dicts (one entry per PID).
            """
            def get_last_used(index):
                """Return ``(user, hours_since_last_use)`` pairs for one GPU.

                Reads ``gpu_history.pkl`` from the current working directory,
                creating it with an empty dict first if it does not exist.
                Returns an empty list when the file has no entry for this
                host (``platform.node()``) or for this GPU index.
                """
                history_path = 'gpu_history.pkl'
                if not os.path.exists(history_path):
                    # Use a context manager so the handle is closed (and the
                    # data flushed) before the file is re-opened for reading.
                    with open(history_path, 'wb') as f:
                        pickle.dump({}, f)
                # NOTE(review): pickle.load can execute arbitrary code from
                # the file; this assumes gpu_history.pkl is trusted -- confirm.
                with open(history_path, 'rb') as f:
                    history = pickle.load(f)

                try:
                    users = history[platform.node()][index]
                except (KeyError, IndexError):
                    # Nothing recorded for this host or this GPU index yet.
                    return []

                # Sample the clock once so both terms use the same instant.
                now = datetime.now()
                last_useds = []
                for user, last_used in users.items():
                    # 1 day = 24 hours, 1 hour = 3600 seconds
                    elapsed = now - last_used['last_used']
                    used_before = elapsed.days * 24 + elapsed.seconds / 3600
                    last_useds.append((user, used_before))
                return last_useds

            def get_process_info(nv_process):
                """Get the process information of specific pid.

                The ``psutil.Process`` object is cached in
                ``GPUStatCollection.global_processes`` keyed by PID, so that
                the later ``cpu_percent()`` call in the enclosing function
                hits the same object.
                """
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['cpu_percent'] = ps_process.cpu_percent()
                # memory_percent() is a percentage; scale it by the total
                # virtual memory to get an absolute figure.
                process['cpu_memory_usage'] = \
                    round((ps_process.memory_percent() / 100.0) *
                          psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                # A single process might run in both graphics and compute
                # mode; report it only once.
                seen_pids = set()
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    if nv_process.pid in seen_pids:
                        continue
                    seen_pids.add(nv_process.pid)
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

                # TODO: Do not block if full process info is not requested
                # The pause gives the second cpu_percent() call an interval
                # to measure over.
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    try:
                        process['cpu_percent'] = cache_process.cpu_percent()
                    except psutil.NoSuchProcess:
                        # The process exited during the sleep above; do not
                        # let that abort the whole GPU query.
                        process['cpu_percent'] = 0.0

            index = N.nvmlDeviceGetIndex(handle)
            last_used = get_last_used(index)
            gpu_info = {
                'index':
                index,
                'uuid':
                uuid,
                'name':
                name,
                'temperature.gpu':
                temperature,
                'fan.speed':
                fan_speed,
                'utilization.gpu':
                utilization.gpu if utilization else None,
                # NVML reports power in milliwatts; store whole watts.
                'power.draw':
                power // 1000 if power is not None else None,
                'enforced.power.limit':
                power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used':
                memory.used // MB if memory else None,
                'memory.total':
                memory.total // MB if memory else None,
                'processes':
                processes,
                'last_used':
                last_used,
            }
            # NOTE(review): presumably prunes stale entries from the shared
            # psutil.Process cache -- defined outside this block, confirm.
            GPUStatCollection.clean_processes()
            return gpu_info
예제 #15
0
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle.

            The name and UUID queries are issued unguarded.  Every other NVML
            query is attempted on its own, and a query that raises
            ``N.NVMLError`` leaves the corresponding field as ``None``.

            Args:
                handle: NVML device handle passed to the ``N.nvmlDevice*``
                    calls.

            Returns:
                dict: keys ``index``, ``uuid``, ``name``, ``temperature.gpu``,
                ``utilization.gpu``, ``power.draw``, ``enforced.power.limit``,
                ``memory.used``, ``memory.total`` and ``processes``.
                ``processes`` is ``None`` when neither running-process query
                succeeded, otherwise a list of per-process dicts (one entry
                per PID).
            """
            def get_process_info(nv_process):
                """Get the process information of one NVML process record.

                Besides the psutil-derived fields, this shells out to read
                ``/proc/<pid>/cgroup`` and ``docker ps -a`` to attach
                ``container_id`` and, when a matching row is found,
                ``container_user_name``.
                """
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:  # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = int(nv_process.usedGpuMemory /
                                                  1024 / 1024)
                process['pid'] = nv_process.pid

                # For docker
                # The container id is taken as the first 12 characters of the
                # third '/'-separated field of the cgroup listing.
                # NOTE(review): this parses str() of the raw bytes output and
                # assumes a docker-style cgroup path -- confirm for hosts
                # where the process is not running inside a container.
                cmd = 'cat /proc/{}/cgroup'.format(nv_process.pid)
                ret = subprocess.check_output(cmd.split())
                container_id = str(ret).split('/')[2][:12]
                process['container_id'] = container_id

                cmd = 'docker ps -a'
                ret = subprocess.check_output(cmd.split())
                # str(bytes) keeps newlines as a literal backslash-n, hence
                # the '\\n' separator; the slice drops the header row and the
                # trailing fragment.
                docker_data = str(ret).split('\\n')[1:-1]
                for personal in docker_data:
                    personal_data = personal.split()
                    if container_id == personal_data[0]:
                        # Last whitespace-separated column of the matching row.
                        process['container_user_name'] = personal_data[-1]

                return process

            def _decode(b):
                """Return *b* decoded to text if it is ``bytes``, else as is."""
                if isinstance(b, bytes):
                    return b.decode()  # for python3, to unicode
                return b

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            # Catch only NVML failures here: a bare ``except`` would also
            # swallow KeyboardInterrupt / SystemExit.
            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            processes = []
            try:
                nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(
                    handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
                    handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None  # Not supported (in both cases)
            else:
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                # A single process might run in both graphics and compute
                # mode; report it (and pay for its subprocess calls) once.
                seen_pids = set()
                for nv_process in (nv_comp_processes + nv_graphics_processes):
                    if nv_process.pid in seen_pids:
                        continue
                    seen_pids.add(nv_process.pid)
                    # TODO: could be more information such as system memory usage,
                    # CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            # NOTE: ``index`` is not assigned in this function; it is resolved
            # from the enclosing scope at call time.
            gpu_info = {
                'index':
                index,
                'uuid':
                uuid,
                'name':
                name,
                'temperature.gpu':
                temperature,
                'utilization.gpu':
                utilization.gpu if utilization else None,
                # NVML reports power in milliwatts; store whole watts.
                'power.draw':
                int(power / 1000) if power is not None else None,
                # Guard on power_limit itself: the two power queries can fail
                # independently of each other.
                'enforced.power.limit':
                int(power_limit / 1000) if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used':
                int(memory.used / 1024 / 1024) if memory else None,
                'memory.total':
                int(memory.total / 1024 / 1024) if memory else None,
                'processes':
                processes,
            }
            return gpu_info
예제 #16
0
    def get_gpu_info(handle):
        """Get one GPU information specified by nvml handle.

        The name and UUID queries are issued unguarded.  Every other NVML
        query is attempted on its own, and a query that raises
        ``N.NVMLError`` leaves the corresponding field as ``None``.

        Args:
            handle: NVML device handle passed to the ``N.nvmlDevice*`` calls.

        Returns:
            dict: keys ``index``, ``uuid``, ``name``, ``minor``, ``bus_id``,
            ``serial``, ``temperature_gpu``, ``utilization_gpu``,
            ``power_draw``, ``power_limit``, ``memory_free``,
            ``memory_used``, ``memory_total``, ``memory_utilization`` and
            ``processes``.  ``processes`` is ``None`` when neither
            running-process query succeeded, otherwise a list of per-process
            dicts (one entry per PID).
        """
        def get_process_info(nv_process):
            """Get the process information of one NVML process record."""
            process = {}
            ps_process = psutil.Process(pid=nv_process.pid)
            process['username'] = ps_process.username()
            # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
            _cmdline = ps_process.cmdline()
            if not _cmdline:  # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                process['command'] = '?'
            else:
                process['command'] = os.path.basename(_cmdline[0])
            # Bytes to MBytes
            process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 /
                                              1024)
            process['pid'] = nv_process.pid
            return process

        def _decode(b):
            """Return *b* decoded to text if it is ``bytes``, else as is."""
            if isinstance(b, bytes):
                return b.decode()  # for python3, to unicode
            return b

        name = _decode(N.nvmlDeviceGetName(handle))
        uuid = _decode(N.nvmlDeviceGetUUID(handle))

        try:
            minor = int(N.nvmlDeviceGetMinorNumber(handle))
        except N.NVMLError:
            minor = None  # Not supported

        try:
            bus_id = _decode(N.nvmlDeviceGetPciInfo(handle).busId)
        except N.NVMLError:
            bus_id = None  # Not supported

        try:
            serial = _decode(N.nvmlDeviceGetSerial(handle))
        except N.NVMLError:
            serial = None  # Not supported

        try:
            temperature = N.nvmlDeviceGetTemperature(handle,
                                                     N.NVML_TEMPERATURE_GPU)
        except N.NVMLError:
            temperature = None  # Not supported

        try:
            memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
        except N.NVMLError:
            memory = None  # Not supported

        try:
            utilization = N.nvmlDeviceGetUtilizationRates(handle)
        except N.NVMLError:
            utilization = None  # Not supported

        # N.NVMLError is the base class of the specific NVML error classes,
        # so catching it alone also covers the "not supported" case.
        try:
            power = N.nvmlDeviceGetPowerUsage(handle)
        except N.NVMLError:
            power = None

        try:
            power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
        except N.NVMLError:
            power_limit = None

        processes = []
        try:
            nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
        except N.NVMLError:
            nv_comp_processes = None  # Not supported
        try:
            nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
                handle)
        except N.NVMLError:
            nv_graphics_processes = None  # Not supported

        if nv_comp_processes is None and nv_graphics_processes is None:
            processes = None  # Not supported (in both cases)
        else:
            nv_comp_processes = nv_comp_processes or []
            nv_graphics_processes = nv_graphics_processes or []
            # A single process might run in both graphics and compute mode;
            # report it only once.
            seen_pids = set()
            for nv_process in (nv_comp_processes + nv_graphics_processes):
                if nv_process.pid in seen_pids:
                    continue
                seen_pids.add(nv_process.pid)
                # TODO: could be more information such as system memory usage,
                # CPU percentage, create time etc.
                try:
                    process = get_process_info(nv_process)
                    processes.append(process)
                except psutil.NoSuchProcess:
                    # TODO: add some reminder for NVML broken context
                    # e.g. nvidia-smi reset  or  reboot the system
                    pass

        # NOTE: ``index`` is not assigned in this function; it is resolved
        # from the enclosing scope at call time.
        gpu_info = {
            'index': index,
            'uuid': uuid,
            'name': name,
            'minor': minor,
            'bus_id': bus_id,
            'serial': serial,
            'temperature_gpu': temperature,
            'utilization_gpu': utilization.gpu if utilization else None,
            # NVML reports power in milliwatts; store whole watts.
            'power_draw': int(power / 1000) if power is not None else None,
            # Guard on power_limit itself: the two power queries can fail
            # independently of each other.
            'power_limit':
            int(power_limit / 1000) if power_limit is not None else None,
            # Memory figures are kept in bytes here (no unit conversion).
            'memory_free': int(memory.free) if memory else None,
            'memory_used': int(memory.used) if memory else None,
            'memory_total': int(memory.total) if memory else None,
            'memory_utilization': utilization.memory if utilization else None,
            'processes': processes,
        }
        return gpu_info
예제 #17
0
파일: core.py 프로젝트: wookayin/gpustat
        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle.

            The name, UUID and index queries are issued unguarded.  Every
            other NVML query is attempted on its own, and a query that raises
            ``N.NVMLError`` leaves the corresponding field as ``None``.

            Args:
                handle: NVML device handle passed to the ``N.nvmlDevice*``
                    calls.

            Returns:
                dict: keys ``index``, ``uuid``, ``name``, ``temperature.gpu``,
                ``utilization.gpu``, ``power.draw``, ``enforced.power.limit``,
                ``memory.used``, ``memory.total`` and ``processes``.
                ``processes`` is ``None`` when neither running-process query
                succeeded, otherwise a list of per-process dicts (one entry
                per PID).
            """

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                # A single process might run in both graphics and compute
                # mode; report it only once.
                seen_pids = set()
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    if nv_process.pid in seen_pids:
                        continue
                    seen_pids.add(nv_process.pid)
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                # NVML reports power in milliwatts; store whole watts.
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info