Example #1
# Assumed import for this snippet: `smi` is the pynvml.smi.nvidia_smi class.
from pynvml.smi import nvidia_smi as smi


def count_devices() -> int:
    """
    Returns the number of available GPU devices installed on the host.

    Returns:
        int: The number of available devices.
    """
    query = smi.getInstance().DeviceQuery('count')
    if query:
        return int(query['count'])
    return 0
Example #2
# Assumed imports for this snippet (`smi` aliased as in Example #1).
from typing import List

from pynvml.smi import nvidia_smi as smi


def devices_index() -> List[int]:
    """
    Returns an index list containing the device index for each available GPU.

    Returns:
        list: A list with the index of each available device.
    """
    query = smi.getInstance().DeviceQuery('index')
    if query:
        return list(range(len(query['gpu'])))
    return list()
Example #3
# Assumed imports for this snippet (`smi` aliased as in Example #1).
from typing import Optional, Tuple

from pynvml.smi import nvidia_smi as smi


def nvidia_driver_version() -> Tuple[Optional[int], Optional[int]]:
    """
    Returns the NVIDIA driver version.

    Returns:
        tuple: A tuple with the major and minor driver version.
    """
    query = smi.getInstance().DeviceQuery('driver_version')
    if query:
        _version = query['driver_version'].split('.')
        return int(_version[0]), int(_version[1])
    return None, None
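
A minimal usage sketch tying the three helpers above together; it assumes they live in one module with the imports added above:

if __name__ == "__main__":
    print("GPU count:", count_devices())
    print("Device indices:", devices_index())
    major, minor = nvidia_driver_version()
    print("Driver version: {}.{}".format(major, minor))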
Example #4
    def __init__(self) -> None:
        try:
            from pynvml.smi import nvidia_smi
        except ImportError:
            raise RuntimeError(
                "This contrib module requires pynvml to be installed. "
                "Please install it with command: \n pip install pynvml")
        # Let's check available devices
        if not torch.cuda.is_available():
            raise RuntimeError("This contrib module requires available GPU")

        # Let it fail if no libnvidia drivers or NVML library found
        self.nvsmi = nvidia_smi.getInstance()
        super(GpuInfo, self).__init__()
Example #5
# Assumed imports for this snippet: the raw NVML bindings plus the smi wrapper.
from pynvml import (nvmlInit, nvmlSystemGetDriverVersion, nvmlDeviceGetCount,
                    nvmlDeviceGetHandleByIndex, nvmlDeviceGetName)
from pynvml.smi import nvidia_smi


def getDeviceInfo(require):
    if require == 'overview':
        nvmlInit()
        print("Driver Version:", nvmlSystemGetDriverVersion())
        deviceCount = nvmlDeviceGetCount()
        for i in range(deviceCount):
            handle = nvmlDeviceGetHandleByIndex(i)
            print("Device", i, ":", nvmlDeviceGetName(handle))

    elif require == 'nvsmi':
        nvsmi = nvidia_smi.getInstance()
        nvsmi = nvsmi.DeviceQuery('memory.free, memory.total')
        return nvsmi
        
    else:
        raise ValueError(require)
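
A short usage sketch for the function above; indexing the result assumes at least one GPU is present:

getDeviceInfo('overview')                 # prints driver version and device names
mem = getDeviceInfo('nvsmi')              # dict with a 'gpu' list, one entry per device
print(mem['gpu'][0]['fb_memory_usage'])   # free/total memory block for the first GPU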
Example #6
def main():
    app = connexion.App(__name__, specification_dir='./swagger/')
    app.app.json_encoder = encoder.JSONEncoder
    app.add_api('swagger.yaml',
                arguments={'title': 'midgard API'},
                pythonic_params=True)

    try:
        import pynvml
        from pynvml.smi import nvidia_smi
        app.app.nvsmi = nvidia_smi.getInstance()
    except Exception as e:
        flask_logger.error(
            "Failed to load NVML.  This node cannot produce GPU information",
            exc_info=True)
        app.app.nvsmi = None
    app.run(port=args.port)
Example #7
# Assumed imports for this snippet.
import time

from pynvml.smi import nvidia_smi


def daemon_process(time_interval, json_path, gpu_index=0):
    gpu_memory_max = 0
    while True:
        nvsmi = nvidia_smi.getInstance()
        dictm = nvsmi.DeviceQuery('memory.free, memory.total')
        gpu_memory = dictm['gpu'][gpu_index]['fb_memory_usage'][
            'total'] - dictm['gpu'][gpu_index]['fb_memory_usage']['free']
        print("gpu_memory", gpu_memory)
        # if os.path.exists(json_path):
        #     with open(json_path)as f:
        #         js = json.load(f)
        # else:
        #     js = {
        #     'gpu_memory':[]
        #     }
        # with open(json_path, 'w')as f:
        #     #js['gpu_memory'] = gpu_memory_max
        #     js['gpu_memory'].append(gpu_memory)
        #     json.dump(js, f, indent=4)
        time.sleep(time_interval)
Example #8
    def __init__(self, *args, **kwargs):
        super(GPUKernels, self).__init__(*args, **kwargs)

        self.nvsmi = nvidia_smi.getInstance()
Example #9
from typing import Dict, List
from pynvml.smi import nvidia_smi as smi  # assumed alias, as in Example #1

def get_query_dict(filters: List[str]) -> Dict:
    """Run a DeviceQuery over the given filter fields, joined by commas."""
    return smi.getInstance().DeviceQuery(', '.join(filters))
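
A brief, hypothetical call for the helper above; the field names are the same ones used in the other examples on this page:

info = get_query_dict(["memory.free", "memory.total", "utilization.gpu"])
print(info["gpu"][0]["fb_memory_usage"])  # per-GPU memory block, assuming at least one GPU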
Example #10
from pynvml.smi import nvidia_smi  # assumed import for this snippet


def getMemoryUsage():
    nvsmi = nvidia_smi.getInstance()
    usage = nvsmi.DeviceQuery("memory.used")["gpu"][0]["fb_memory_usage"]
    return "%d %s" % (usage["used"], usage["unit"])
Example #11
from pynvml.smi import nvidia_smi
import psutil
import toml  # assumed import: config.toml is loaded below via toml.load

config = toml.load('config.toml')

# Requested RAM budget (GiB)
memory = config["MEMORY"]
# Requested GPU memory budget (GiB)
v_memory = config["V_MEMORY"]

while(True):

    used_memory = 0
    used_memory = psutil.virtual_memory().used
    if(used_memory > memory * 1024 * 1024 * 1024):
        print("内存消耗大于申请量")
    nvsmi = nvidia_smi.getInstance()
    results = nvsmi.DeviceQuery('memory.used')
    ##{'gpu': [{'fb_memory_usage': {'used': 0.0625, 'unit': 'MiB'}}, {'fb_memory_usage': {'used': 0.0625, 'unit': 'MiB'}}]}

    used_v_memory = 0
    for item in results['gpu']:
        used = item['fb_memory_usage']['used']
        used_v_memory += used

    if(used_v_memory > v_memory * 1024):
        print("GPU memory usage exceeds the requested amount")

    
    
Example #12
    def gpu_mem():
        from pynvml.smi import nvidia_smi

        nvsmi = nvidia_smi.getInstance()
        return nvsmi.DeviceQuery("memory.free, memory.total")
Example #13
 def __borrarMemoria(self):
     # "borrarMemoria" = clear memory: query nvidia-smi, then empty the CUDA cache.
     nvsmi = nvidia_smi.getInstance()
     nvsmi.DeviceQuery('memory.free, memory.total')
     torch.cuda.empty_cache()
     gc.collect()
Example #14
def smi(request):
    return nvidia_smi.getInstance()
Example #15
 def get_gpu_infos(self):
     nvsmi = nvidia_smi.getInstance()
     gpu_infos = nvsmi.DeviceQuery("index, uuid, name")
     self.logger.debug(f"Got device info from nvidia-smi: {gpu_infos}")
     return gpu_infos
Example #16
    def _main_func():
      try:
        # first get name
        import torch as th
        import os
      except ImportError:
        self.P("ERROR: PyTorch not installed! Please install PyTorch.")
        return None

      nvsmires = None
      try:
        from pynvml.smi import nvidia_smi
        import pynvml
        nvsmi = nvidia_smi.getInstance()
        nvsmires = nvsmi.DeviceQuery('memory.free, memory.total, memory.used, utilization.gpu, temperature.gpu')
        pynvml_avail = True
      except Exception:
        pynvml_avail = False

      lst_inf = []
      # now we iterate all devices
      n_gpus = th.cuda.device_count()
      if n_gpus > 0:
        th.cuda.empty_cache()
      current_pid_has_usage = False
      current_pid_gpus = []

      try:
        for device_id in range(n_gpus):
          dct_device = {}
          device_props = th.cuda.get_device_properties(device_id)
          dct_device['NAME'] = device_props.name
          dct_device['TOTAL_MEM'] = round(
            device_props.total_memory / 1024 ** (2 if mb else 3),
            2
          )
          mem_total = None
          mem_allocated = None
          gpu_used = None
          gpu_temp = None
          gpu_temp_max = None
          if pynvml_avail and nvsmires is not None and 'gpu' in nvsmires:
            dct_gpu = nvsmires['gpu'][device_id]
            mem_total = round(
              dct_gpu['fb_memory_usage']['total'] / (1 if mb else 1024),
              2
            )  # already from th
            mem_allocated = round(
              dct_gpu['fb_memory_usage']['used'] / (1 if mb else 1024),
              2
            )
            gpu_used = dct_gpu['utilization']['gpu_util']
            if isinstance(gpu_used, str):
              gpu_used = -1
            gpu_temp = dct_gpu['temperature']['gpu_temp']
            gpu_temp_max = dct_gpu['temperature']['gpu_temp_max_threshold']

            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            processes = []
            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
              dct_proc_info = {k.upper(): v for k, v in proc.__dict__.items()}
              used_mem = dct_proc_info.pop('USEDGPUMEMORY', None)
              dct_proc_info['ALLOCATED_MEM'] = round(
                used_mem / 1024 ** (2 if mb else 3) if used_mem is not None else 0.0,
                2
              )
              processes.append(dct_proc_info)
              if dct_proc_info['PID'] == os.getpid():
                current_pid_has_usage = True
                current_pid_gpus.append(device_id)
            #endfor
            dct_device['PROCESSES'] = processes
            dct_device['USED_BY_PROCESS'] = device_id in current_pid_gpus
          else:
            str_os = platform.platform()
            # Check if the platform is Tegra and record accordingly
            if 'tegra' in str_os.lower():
              # we just record the overall free memory
              mem_total = self.get_machine_memory()
              mem_allocated = mem_total - self.get_avail_memory()
              gpu_used = 1
              gpu_temp = 1
              gpu_temp_max = 100
              if not self._done_first_smi_error and nvsmires is not None:
                self.P("Running `gpu_info` on Tegra platform: {}".format(nvsmires), color='r')
                self._done_first_smi_error = True
            elif not self._done_first_smi_error:
              str_log = "ERROR: Please make sure you have both pytorch and pynvml in order to monitor the GPU"
              str_log += "\nError info: pynvml_avail={}, nvsmires={}".format(pynvml_avail, nvsmires)
              self.P(str_log)
              self._done_first_smi_error = True
          #endif
          dct_device['ALLOCATED_MEM'] = mem_allocated
          dct_device['FREE_MEM'] = -1
          if all(x is not None for x in [mem_total, mem_allocated]):
            dct_device['FREE_MEM'] = round(mem_total - mem_allocated, 2)
          dct_device['MEM_UNIT'] = 'MB' if mb else 'GB'
          dct_device['GPU_USED'] = gpu_used
          dct_device['GPU_TEMP'] = gpu_temp
          dct_device['GPU_TEMP_MAX'] = gpu_temp_max

          lst_inf.append(dct_device)
        #end for all devices
      except Exception as e:
        self.P("gpu_info exception for device_id {}:\n{}".format(device_id, e), color='r')

      if show:
        self.P("GPU information for {} device(s):".format(len(lst_inf)), color='y')
        for dct_gpu in lst_inf:
          for k, v in dct_gpu.items():
            self.P("  {:<14} {}".format(k + ':', v), color='y')

      if current_pid and current_pid_has_usage:
        return [lst_inf[x] for x in current_pid_gpus]
      else:
        return lst_inf
Example #17
 def query_gpu(*fields) -> List[Dict]:
     nvsmi = nvidia_smi.getInstance()
     gpu_infos = nvsmi.DeviceQuery(','.join(fields))
     return gpu_infos["gpu"]
Example #18
from pynvml.smi import nvidia_smi  # assumed import for this snippet


def gpus_snap_info():
    nvsmi = nvidia_smi.getInstance()
    return nvsmi.DeviceQuery(
        "memory.free,memory.total,memory.used,compute-apps,temperature.gpu,driver_version,timestamp,name"
    )
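
A short usage sketch for the snapshot helper above; the 'gpu' key holds one entry per device, as in the output comment of Example #11:

snapshot = gpus_snap_info()
for idx, gpu in enumerate(snapshot["gpu"]):
    print(idx, gpu["fb_memory_usage"], gpu["temperature"]["gpu_temp"])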