Example #1
# Assumes the official pynvml bindings (nvidia-ml-py); py3nvml exposes the same names.
from pynvml import (nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetName, nvmlDeviceGetUtilizationRates)

def gpu_info():
    """Return a list of (GPU ID, GPU description, GPU % utilization) tuples."""
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    info = []
    for i in range(deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        util = nvmlDeviceGetUtilizationRates(handle)
        desc = nvmlDeviceGetName(handle)
        info.append((i, desc, util.gpu))
    return info
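A minimal driver for this helper might look like the sketch below (assuming the pynvml imports above; nvmlShutdown releases the library once queries are finished):

if __name__ == "__main__":
    from pynvml import nvmlShutdown

    for gpu_id, desc, util in gpu_info():
        print("GPU %d (%s): %d%% busy" % (gpu_id, desc, util))
    nvmlShutdown()  # release NVML when done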
Example #2
def test_nvidia_device(idx: int):
    # `inspect` and `logger` are assumed to be provided by the surrounding
    # test harness; nvml.nvmlInit() must have been called before this runs.
    from py3nvml import py3nvml as nvml

    handle = nvml.nvmlDeviceGetHandleByIndex(idx)

    pciInfo = nvml.nvmlDeviceGetPciInfo(handle)

    brands = {
        nvml.NVML_BRAND_UNKNOWN: "Unknown",
        nvml.NVML_BRAND_QUADRO: "Quadro",
        nvml.NVML_BRAND_TESLA: "Tesla",
        nvml.NVML_BRAND_NVS: "NVS",
        nvml.NVML_BRAND_GRID: "Grid",
        nvml.NVML_BRAND_GEFORCE: "GeForce"
    }

    inspect(
        idx=idx,
        # id=pciInfo.busId,
        # uuid=nvml.nvmlDeviceGetUUID(handle),
        name=nvml.nvmlDeviceGetName(handle),
        # brand=brands[nvml.nvmlDeviceGetBrand(handle)],
        # multi_gpu=nvml.nvmlDeviceGetMultiGpuBoard(handle),
        # pcie_link=nvml.nvmlDeviceGetCurrPcieLinkWidth(handle),
        fan=nvml.nvmlDeviceGetFanSpeed(handle),
        # power=nvml.nvmlDeviceGetPowerState(handle),
        mem_total=nvml.nvmlDeviceGetMemoryInfo(handle).total,
        mem_used=nvml.nvmlDeviceGetMemoryInfo(handle).used,
        util_gpu=nvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        # util_mem=nvml.nvmlDeviceGetUtilizationRates(handle).memory,
        temp=nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU),
        power=nvml.nvmlDeviceGetPowerUsage(handle),
        power_limit=nvml.nvmlDeviceGetPowerManagementLimit(handle),

        # display=nvml.nvmlDeviceGetDisplayMode(handle),
        display_active=nvml.nvmlDeviceGetDisplayActive(handle),
    )

    logger.log()

    procs = nvml.nvmlDeviceGetGraphicsRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid),
                pid=p.pid,
                mem=p.usedGpuMemory)

    procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid),
                pid=p.pid,
                mem=p.usedGpuMemory)

    logger.log()
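The snippet depends on an `inspect` pretty-printer and a `logger` object from its harness; a hedged, minimal stand-in for running it in isolation could be:

from py3nvml import py3nvml as nvml

def inspect(**fields):
    # Stand-in for the harness helper: print each field on its own line.
    for key, value in fields.items():
        print("%s = %r" % (key, value))

class logger:
    # Stand-in for the harness logger: log() just emits a blank separator.
    @staticmethod
    def log(*args):
        print(*args)

nvml.nvmlInit()
for idx in range(nvml.nvmlDeviceGetCount()):
    test_nvidia_device(idx)
nvml.nvmlShutdown()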
Example #3
    def get_gpu_stats(self):
        """
        Return some statistics for the gpu associated with handle.

        The statistics returned are:
        - used memory in MB
        - gpu utilization percentage
        - temperature in Celsius degrees
        """
        mem = nvmlDeviceGetMemoryInfo(self.handle)
        rates = nvmlDeviceGetUtilizationRates(self.handle)
        temp = nvmlDeviceGetTemperature(self.handle, NVML_TEMPERATURE_GPU)

        return (mem.used / 1024 / 1024, rates.gpu, temp)
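The method reads `self.handle`, so it presumably lives on a small wrapper object; a plausible (assumed, not from the snippet) owner class:

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex

class GPUStatCollector:
    # Hypothetical owner of get_gpu_stats: binds one NVML device handle.
    def __init__(self, device_index=0):
        nvmlInit()
        self.handle = nvmlDeviceGetHandleByIndex(device_index)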
Example #4
    def get_device_utilization(self, idx):
        """Get the utilization of a device.

        Args:
            idx (int): device index.

        Returns:
            int or None: the GPU utilization percentage, or None if the
            query failed.
        """
        try:
            util = nvml.nvmlDeviceGetUtilizationRates(
                self._device_handlers[idx])
        except Exception as err:
            logger.error('Get device utilization failed: {}'.format(str(err)))
            return None
        return util.gpu
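`self._device_handlers` is populated elsewhere in the class; an assumed initializer consistent with the `nvml` alias used here (whether that alias is pynvml or py3nvml is not visible in the snippet):

import py3nvml.py3nvml as nvml

def _init_device_handlers(self):
    # Assumed setup: cache one NVML handle per visible device at startup.
    nvml.nvmlInit()
    self._device_handlers = [
        nvml.nvmlDeviceGetHandleByIndex(i)
        for i in range(nvml.nvmlDeviceGetCount())
    ]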
Example #5
def gpustats():
    import py3nvml.py3nvml as pynvml

    # One-time init guard: stash a flag in module globals so repeated calls
    # to gpustats() do not re-initialize NVML.
    if '__gpuhandler__' not in globals():
        globals()['__gpuhandler__'] = True
        pynvml.nvmlInit()

    usage = []
    util = []
    deviceCount = pynvml.nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        usage.append(info.used / info.total)
        info = pynvml.nvmlDeviceGetUtilizationRates(handle)
        util.append(info.gpu / 100.)

    return {'maxmemusage': max(usage), 'maxutil': max(util)}
Example #6
    def get_card_usage_percent(self, card_index):
        # pylint: disable=no-member
        # pylint incorrectly infers that nvmlDeviceGetUtilizationRates returns str
        return self.__nvml_get_or_else(lambda: float(
            nvmlDeviceGetUtilizationRates(
                nvmlDeviceGetHandleByIndex(card_index)).gpu))
Example #7
    def update(self):
        utilization = py3nvml.nvmlDeviceGetUtilizationRates(self.handle)
        self.gpu = utilization.gpu
        self.memory = utilization.memory
Example #8
    def get_stats(self):
        """
        Get system statistics and assign to `self`
        """
        memory_usage = psutil.virtual_memory()
        disk_usage = psutil.disk_usage('/')
        # net = psutil.net_io_counters()
        system = {
            # CPU utilization percent (can be over 100%)
            'cpu':
            round10e5(self._process.cpu_percent(0.0)),

            # Whole system memory usage
            # 'memory_used': round10e5(memory_usage.used / 1024 / 1024),
            'memory_percent':
            round10e5(memory_usage.used * 100 / memory_usage.total),

            # Get the portion of memory occupied by a process
            # 'p_memory_rss': round10e5(self._process.memory_info().rss
            #                           / 1024 / 1024),
            'p_memory_percent':
            round10e5(self._process.memory_percent()),

            # Disk usage
            # 'disk_used': round10e5(disk_usage.used / 1024 / 1024),
            'disk_percent':
            round10e5(disk_usage.percent),
        }

        # Collect GPU statistics
        gpus = []
        try:
            gpu_device_count = nvml.nvmlDeviceGetCount()
            for i in range(gpu_device_count):
                handle = nvml.nvmlDeviceGetHandleByIndex(i)
                nvml_tmp = nvml.NVML_TEMPERATURE_GPU

                # Get device memory and temperature
                util = nvml.nvmlDeviceGetUtilizationRates(handle)
                memory = nvml.nvmlDeviceGetMemoryInfo(handle)
                temp = nvml.nvmlDeviceGetTemperature(handle, nvml_tmp)

                # Compute power usage in watts and percent
                power_watts = nvml.nvmlDeviceGetPowerUsage(handle) / 1000
                power_cap = nvml.nvmlDeviceGetEnforcedPowerLimit(handle)
                power_cap_watts = power_cap / 1000
                power_usage = power_watts / power_cap_watts * 100

                gpus.append({
                    # GPU utilization percent
                    'gpu':
                    round10e5(util.gpu),

                    # Device memory usage
                    # 'memory_used': round10e5(memory.used / 1024 / 1024),
                    'gpu_memory_percent':
                    round10e5(memory.used * 100 / memory.total),

                    # Power usage in watts and percent
                    'gpu_power_watts':
                    round10e5(power_watts),
                    # 'power_percent': round10e5(power_usage),

                    # Device temperature
                    'gpu_temp':
                    round10e5(temp),
                })
        except Exception:
            # NVML unavailable or a query failed; report no GPU stats
            pass

        return system, gpus
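`round10e5` is not shown in the snippet; from its name and its use on percentages, it plausibly rounds to a fixed precision, e.g.:

def round10e5(value):
    # Hypothetical helper matching the name: round in 10e5 (i.e. 1e6) steps,
    # keeping six decimal places of the reported metric.
    return round(value * 10e5) / 10e5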
Example #9
    def usage_ratio(self, hnd):
        # Note: despite the name, NVML reports utilization as a 0-100 percentage.
        return nv.nvmlDeviceGetUtilizationRates(hnd).gpu
Example #10
def get_proc(device_handle):
    """Get GPU device CPU consumption in percent."""
    try:
        return pynvml.nvmlDeviceGetUtilizationRates(device_handle).gpu
    except pynvml.NVMLError:
        return None
Example #11
    def get_gpu_info(handle):
        """Get one GPU information specified by nvml handle"""

        def get_process_info(nv_process):
            """Get the process information of specific pid"""
            process = {}
            ps_process = psutil.Process(pid=nv_process.pid)
            process['username'] = ps_process.username()
            # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
            _cmdline = ps_process.cmdline()
            if not _cmdline:  # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                process['command'] = '?'
            else:
                process['command'] = os.path.basename(_cmdline[0])
            # Bytes to MBytes
            process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
            process['pid'] = nv_process.pid
            return process

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()  # for python3, to unicode
            return b

        name = _decode(N.nvmlDeviceGetName(handle))
        uuid = _decode(N.nvmlDeviceGetUUID(handle))

        try:
            temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
        except N.NVMLError:
            temperature = None  # Not supported

        try:
            memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
        except N.NVMLError:
            memory = None  # Not supported

        try:
            utilization = N.nvmlDeviceGetUtilizationRates(handle)
        except N.NVMLError:
            utilization = None  # Not supported

        try:
            power = N.nvmlDeviceGetPowerUsage(handle)
        except N.NVMLError:
            power = None  # Not supported

        try:
            power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
        except N.NVMLError:
            power_limit = None  # Not supported

        processes = []
        try:
            nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
        except N.NVMLError:
            nv_comp_processes = None  # Not supported
        try:
            nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
        except N.NVMLError:
            nv_graphics_processes = None  # Not supported

        if nv_comp_processes is None and nv_graphics_processes is None:
            processes = None  # Not supported (in both cases)
        else:
            nv_comp_processes = nv_comp_processes or []
            nv_graphics_processes = nv_graphics_processes or []
            for nv_process in (nv_comp_processes + nv_graphics_processes):
                # TODO: could be more information such as system memory usage,
                # CPU percentage, create time etc.
                try:
                    process = get_process_info(nv_process)
                    processes.append(process)
                except psutil.NoSuchProcess:
                    # TODO: add some reminder for NVML broken context
                    # e.g. nvidia-smi reset  or  reboot the system
                    pass

        index = N.nvmlDeviceGetIndex(handle)
        gpu_info = {
            'index': index,
            'uuid': uuid,
            'name': name,
            'temperature.gpu': temperature,
            'utilization.gpu': utilization.gpu if utilization else None,
            'power.draw': int(power / 1000) if power is not None else None,
            'enforced.power.limit': int(power_limit / 1000) if power_limit is not None else None,
            # Convert bytes into MBytes
            'memory.used': int(memory.used / 1024 / 1024) if memory else None,
            'memory.total': int(memory.total / 1024 / 1024) if memory else None,
            'processes': processes,
        }
        return gpu_info
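`get_gpu_info` is written as an inner helper; a sketch of an enclosing query (assumed structure) that applies it to every device:

def query_all_gpus():
    # Assumed outer function: initialize NVML, collect per-device info,
    # and always shut the library back down.
    N.nvmlInit()
    try:
        return [
            get_gpu_info(N.nvmlDeviceGetHandleByIndex(i))
            for i in range(N.nvmlDeviceGetCount())
        ]
    finally:
        N.nvmlShutdown()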
Example #12
    def live(self, callback):
        try:
            with Live(
                    self.layout,
                    refresh_per_second=2,
                    screen=False,
                    redirect_stderr=False,
                    redirect_stdout=False,
            ) as live:
                while True:
                    if self.stop_flag:
                        break

                    if callback:
                        callback(self)

                    if not self.resources_by_endpoint:
                        self.layout["endpoints"].update(
                            Panel(
                                Align.center(
                                    Text(
                                        "Waiting for endpoints to come alive"),
                                    vertical="middle",
                                )))
                    else:
                        self.endpoints_layout["data"].update(
                            EndpointMonitor(self.resources_by_endpoint))

                        self.endpoints_values = []
                        updated_keys = set()
                        max_value = 0
                        for (
                                endpoint_name,
                                endpoint_data,
                        ) in self.resources_by_endpoint.items():
                            updated_keys.add(endpoint_name)  # todo: finish

                            total_used = 0
                            total_max = 0
                            for entry in endpoint_data.values():
                                total_used += entry.used
                                total_max += entry.available

                            max_value = max(max_value, total_max)

                            # self.endpoints_values.append(data)
                            past_entries = self.endpoints_past_values.setdefault(
                                endpoint_name, [])
                            past_entries.append(total_used)

                            self.endpoints_values.append(past_entries)

                        self.endpoints_graph = AsciiGraph(
                            self.endpoints_values, max_value, BACKEND_COLORS)
                        self.endpoints_layout["graph"].update(
                            self.endpoints_graph)

                        self.layout["endpoints"].update(
                            Panel(self.endpoints_layout, title="Endpoints"))

                    uptime = datetime.datetime.now() - self.start_time
                    self.layout["header"]["info"].update(
                        Align.right(
                            Text(f"""Node ID: {self.node_id}
                                Uptime: {humanize.naturaldelta(uptime)}
                                https://discord.gg/94KqBcE"""),
                            vertical="middle",
                        ))

                    titles = []

                    table = Table.grid()
                    table.add_column(style="green")
                    table.add_column(no_wrap=True)

                    self.cpu_usage[0].append(psutil.cpu_percent(interval=None))
                    self.ram_usage[0].append(
                        int(round(psutil.virtual_memory().used / 1024**2)))

                    total_gpus_actual = py3nvml.nvmlDeviceGetCount()
                    for i in range(total_gpus_actual):
                        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
                        meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                        utilization_info = py3nvml.nvmlDeviceGetUtilizationRates(
                            handle)

                        table.add_row(
                            py3nvml.nvmlDeviceGetName(handle),
                            str(round(meminfo.used / 1024**2)),
                        )
                        self.gpu_mem_usage[i].append(
                            round(meminfo.used / 1024**2))
                        self.gpu_usage[i].append(utilization_info.gpu)

                        color = RICH_COLORS[i]
                        titles.append(
                            f"[{color}]" + py3nvml.nvmlDeviceGetName(handle) +
                            f" {utilization_info.gpu}%, {humanize.naturalsize(meminfo.used)}/{humanize.naturalsize(meminfo.total)}"
                            + "[/]")

                    self.gpu_layout["utilization"].update(
                        self.gpu_usage_graph, )
                    self.gpu_layout["memory"].update(self.gpu_mem_usage_graph)

                    self.layout["gpu"].update(
                        Panel(self.gpu_layout, title=" ".join(titles)))

                    self.cpu_layout["utilization"].update(
                        Panel(self.cpu_usage_graph))
                    self.cpu_layout["memory"].update(
                        Panel(self.ram_usage_graph))

                    self.cpu_layout["utilization"].update(
                        self.cpu_usage_graph, )
                    self.cpu_layout["memory"].update(self.ram_usage_graph)

                    self.layout["cpu"].update(
                        Panel(self.cpu_layout, title=CPU_NAME))

                    self.layout["console"].update(self.tail)

                    sleep(1.0)
        except KeyboardInterrupt as e:
            py3nvml.nvmlShutdown()
            raise e
Example #13
def _get_utilisation_stats(gpu):
    util = pynvml.nvmlDeviceGetUtilizationRates(gpu)
    return {
        'utilisation_gpu_percent': util.gpu,
        'utilisation_memory_percent': util.memory
    }
Example #14
    def getGpuInfo(self):
        if (self._impulse % 2) != 0:
            return self._gpuInfoObj

        try:
            N.nvmlInit()
            gpuInfoObj = {}

            driverVersion = N.nvmlSystemGetDriverVersion()
            deviceCnt = N.nvmlDeviceGetCount()

            gpuInfoObj['DRIVER_VERSION'] = driverVersion
            gpuInfoObj['DEVICE_COUNT'] = deviceCnt

            for dCnt in range(deviceCnt):
                deviceInfoObj = {}
                handle = N.nvmlDeviceGetHandleByIndex(dCnt)
                name = N.nvmlDeviceGetName(handle)

                try:
                    fan = N.nvmlDeviceGetFanSpeed(handle)
                except N.NVMLError as err:
                    fan = 'N/A'

                try:
                    temp = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
                except N.NVMLError as err:
                    temp = 'N/A'

                try:
                    powerUsage = round(N.nvmlDeviceGetPowerUsage(handle) / 1000)
                except N.NVMLError as err:
                    powerUsage = 'N/A'

                try:
                    powerLimit = round(N.nvmlDeviceGetPowerManagementLimit(handle) / 1000)
                except N.NVMLError as err:
                    powerLimit = 'N/A'

                try:
                    memInfo = N.nvmlDeviceGetMemoryInfo(handle)
                    memUsage = round(memInfo.used/1024/1024)
                    memTotal = round(memInfo.total/1024/1024)
                except N.NVMLError as err:
                    memUsage = 'N/A'
                    memTotal = 'N/A'

                try:
                    util = N.nvmlDeviceGetUtilizationRates(handle).gpu
                except N.NVMLError as err:
                    util = 'N/A'

                deviceInfoObj['NAME'] = name
                deviceInfoObj['FAN'] = fan
                deviceInfoObj['TEMP'] = temp
                deviceInfoObj['POWER_USAGE'] = powerUsage
                deviceInfoObj['POWER_LIMIT'] = powerLimit
                deviceInfoObj['MEM_USAGE'] = memUsage
                deviceInfoObj['MEM_TOTAL'] = memTotal
                deviceInfoObj['UTIL'] = util

                gpuProcessObj = {}
                try:
                    processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
                except N.NVMLError as err:
                    processes = []
                for pCnt, process in enumerate(processes):
                    gpuMem = round(process.usedGpuMemory / 1024 / 1024)
                    pid = process.pid

                    try:
                        p = psutil.Process(pid)
                        attrs = p.as_dict(attrs=['name', 'username', 'status'])
                    except psutil.ZombieProcess:
                        attrs = {'name': 'unknown', 'username': '******', 'status': 'zombie'}
                    except psutil.Error:
                        # Process vanished or is inaccessible; use placeholders
                        # instead of leaving `attrs` undefined below.
                        attrs = {'name': 'unknown', 'username': 'unknown', 'status': 'unknown'}

                    gpuProcessObj[str(pCnt)] = {
                        'PID': pid,
                        'MEM': gpuMem,
                        'NAME': attrs['name'],
                        'USERNAME': self._getSubuidName(attrs['username']),
                        'STATUS': attrs['status']
                    }

                deviceInfoObj['PROCESS'] = gpuProcessObj
                gpuInfoObj[str(dCnt)] = deviceInfoObj

            N.nvmlShutdown()

        except N.NVMLError as err:
            # Guard the shutdown: if nvmlInit() itself failed, calling
            # nvmlShutdown() would raise again.
            try:
                N.nvmlShutdown()
            except N.NVMLError:
                pass
            print(err)
            gpuInfoObj = {}

        self._gpuInfoObj = gpuInfoObj
        return gpuInfoObj
Example #15
class NvidiaDeviceStatistics(Callback):
    reportable_values = dict(
        memory_total=lambda handle: _bytes_to_megabytes(
            nvml.nvmlDeviceGetMemoryInfo(handle).total),
        memory_used=lambda handle: _bytes_to_megabytes(
            nvml.nvmlDeviceGetMemoryInfo(handle).used),
        memory_free=lambda handle: _bytes_to_megabytes(
            nvml.nvmlDeviceGetMemoryInfo(handle).total
            - nvml.nvmlDeviceGetMemoryInfo(handle).used),
        temperature=lambda handle: nvml.nvmlDeviceGetTemperature(
            handle, nvml.NVML_TEMPERATURE_GPU),
        power_state=lambda handle: nvml.nvmlDeviceGetPowerState(handle),
        power_draw=lambda handle: nvml.nvmlDeviceGetPowerUsage(handle) /
        1000.0,
        utilization_gpu=lambda handle: nvml.nvmlDeviceGetUtilizationRates(
            handle).gpu,
        utilization_memory=lambda handle: nvml.nvmlDeviceGetUtilizationRates(
            handle).memory,
    )

    def __init__(self,
                 report=None,
                 devices=None,
                 quiet=False,
                 always_suffix=False,
                 output=print,
                 verbose_once=True):
        super(NvidiaDeviceStatistics, self).__init__()
        global nvml

        self.output = output

        if nvml is not None:
            try:
                nvml.nvmlInit()
            except (OSError, nvml.NVMLError_LibraryNotFound):
                # the python library might be installed, but not the drivers...
                nvml = None

        if nvml is None:
            if not quiet:
                self.output(
                    "Could not load py3nvml, cannot report any nvidia device statistics."
                )
            report = []
        else:
            device_count = nvml.nvmlDeviceGetCount()

            if devices is None:
                devices = list(range(device_count))
            else:
                devices = [
                    int(device) for device in devices
                    if 0 <= int(device) < device_count
                ]

            self.devices = devices
            self.deviceHandles = [
                nvml.nvmlDeviceGetHandleByIndex(device) for device in devices
            ]

            if not quiet:
                for n, handle in enumerate(self.deviceHandles):
                    self.output("Collecting statistics for device #% 2d: %s" %
                                (n, nvml.nvmlDeviceGetName(handle)))

        if report is None:
            report = ['temperature', 'utilization_gpu']
        elif report == 'all':
            report = list(self.reportable_values.keys())

        self.verbose_once = verbose_once
        self.report = report
        self.always_suffix = always_suffix

    def __del__(self):
        if nvml:
            try:
                nvml.nvmlShutdown()
            except Exception:
                pass

    def on_epoch_end(self, epoch, logs=None):
        for item in self.report:
            try:
                suffix = handle = None
                for n, handle in enumerate(self.deviceHandles):
                    if len(self.deviceHandles) == 1 and not self.always_suffix:
                        suffix = ''
                    else:
                        # TODO: zero-padded to two digits; this will not work
                        # nicely if more than 100 GPUs are in one system.
                        suffix = '_%02d' % (n,)

                    logs[item + suffix] = np.float32(
                        self.reportable_values[item](handle))
            except nvml.NVMLError as err:
                self.output("Error trying to read out value from NVML: %r" %
                            (err, ))
        if self.report and self.verbose_once:
            self.output("Current status for device #% 2d (%s): %r" %
                        (n, nvml.nvmlDeviceGetName(handle), {
                            what: float(call(handle))
                            for what, call in self.reportable_values.items()
                        }))
            self.verbose_once = False  # only print once
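Because this is a Keras-style Callback, hooking it into training is one line; a sketch (model and data here are placeholders, not from the snippet):

# Hypothetical usage: append GPU temperature and utilization to the Keras
# logs dict at the end of each epoch.
stats = NvidiaDeviceStatistics(report=['temperature', 'utilization_gpu'])
model.fit(x_train, y_train, epochs=10, callbacks=[stats])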