Example #1
def get_running_pid_gpuid():
    """

    Partly from 
    https://github.com/wookayin/gpustat/blob/master/gpustat/core.py

    """

    pid_gpuid = []
    N.nvmlInit()
    device_count = N.nvmlDeviceGetCount()
    for index in range(device_count):
        handle = N.nvmlDeviceGetHandleByIndex(index)
        # Get Running Processes from NVML
        procs = []
        try:
            procs += N.nvmlDeviceGetComputeRunningProcesses(handle)
        except N.NVMLError:
            pass  # Not supported
        try:
            procs += N.nvmlDeviceGetGraphicsRunningProcesses(handle)
        except N.NVMLError:
            pass  # Not supported
        for proc in procs:
            pid_gpuid += [(proc.pid, index)]

    return pid_gpuid
Example #2
def gpus_available() -> dict:
    try:
        nvmlInit()
        gpus = {}
        visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None)
        if visible_devices:
            visible_devices = {
                int(x.strip())
                for x in visible_devices.split(',')
            }
        else:
            visible_devices = list(range(nvmlDeviceGetCount()))
        for i, real_id in enumerate(visible_devices):
            h = nvmlDeviceGetHandleByIndex(real_id)
            info = nvmlDeviceGetMemoryInfo(h)
            total = info.total
            free = info.free
            ratio = free / total
            gpus[i] = ratio
            # print(f'total    : {info.total}')
            # print(f'free     : {info.free}')
            # print(f'used     : {info.used}')
            # t = torch.cuda.get_device_properties(0).total_memory
            # c = torch.cuda.memory_cached(0)
            # a = torch.cuda.memory_allocated(0)
            # print(t, c, a)
        nvmlShutdown()
        return gpus
    except Exception as e:
        logger.debug(f'Failed to get gpu info due to {e}')
        return {}
Example #3
def admin_system():
    factor = 1073741824
    vmem = psutil.virtual_memory()
    ram = {
        "percent": vmem.percent,
        "used": round(vmem.used / factor, 2),
        "total": round(vmem.total / factor, 2)
    }  # GB

    hdd = psutil.disk_usage(app.config['USERSPACE_FOLDER'])
    disk_usage = {
        "percent": round((hdd.used / hdd.total) * 100, 2),
        "used": round(hdd.used / factor, 2),
        "total": round(hdd.total / factor, 2)
    }  # GB

    gpus = []
    pynvml.nvmlInit()
    for i in range(0, pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        resources = pynvml.nvmlDeviceGetUtilizationRates(handle)
        gpus.append({
            "id": i,
            "memory": resources.memory,
            "proc": resources.gpu
        })

    return render_template('system.admin.html.jinja2',
                           page_name='admin_system',
                           page_title='System',
                           ram=ram,
                           cpu=round(psutil.cpu_percent(), 2),
                           gpus=gpus,
                           disk_usage=disk_usage)
Example #4
def get_statistics():
    """Get statistics for each GPU installed in the system."""
    nvmlInit()
    statistics = []

    try:
        count = nvmlDeviceGetCount()
        for i in range(count):
            handle = nvmlDeviceGetHandleByIndex(i)

            memory = nvmlDeviceGetMemoryInfo(handle)

            statistics.append({
                "gpu": i,
                "name": nvmlDeviceGetName(handle).decode("utf-8"),
                "memory": {
                    "total": _convert_kb_to_gb(int(memory.total)),
                    "used": _convert_kb_to_gb(int(memory.used)),
                    "utilisation": int(memory.used / memory.total * 100)
                },
            })
    except NVMLError as error:
        print(error)

    return statistics
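The helper `_convert_kb_to_gb` is not defined in this snippet. Note that `nvmlDeviceGetMemoryInfo` reports sizes in bytes, so a plausible implementation (a hypothetical sketch, not the original helper) simply divides by 1024**3:

def _convert_kb_to_gb(size):
    # NVML memory counters are in bytes; despite the helper's name,
    # dividing by 1024 ** 3 yields gibibytes for display.
    return round(size / 1024 ** 3, 2)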
Example #5
def auto_select_gpu():
  """Select gpu which has largest free memory"""
  if HAS_NVML:
    pynvml.nvmlInit()
    deviceCount = pynvml.nvmlDeviceGetCount()
    largest_free_mem = 0
    largest_free_idx = 0
    for i in range(deviceCount):
      handle = pynvml.nvmlDeviceGetHandleByIndex(i)
      info = pynvml.nvmlDeviceGetMemoryInfo(handle)
      if info.free > largest_free_mem:
        largest_free_mem = info.free
        largest_free_idx = i
    pynvml.nvmlShutdown()
    largest_free_mem = largest_free_mem / 1024. / 1024.  # Convert to MB

    idx_to_gpu_id = {}
    for i in range(deviceCount):
      idx_to_gpu_id[i] = '{}'.format(i)

    gpu_id = idx_to_gpu_id[largest_free_idx]
    logging.info('Using largest free memory GPU {} with free memory {}MB'.format(gpu_id, largest_free_mem))
    return gpu_id
  else:
    logging.info('nvidia-ml-py is not installed, automatic gpu selection is disabled!')
    return '0'
Example #6
def gpu_info() -> dict:
    info = dict()

    try:
        nvmlInit()
    except NVMLError:
        info['no-gpu'] = 'No Nvidia GPU detected'
        return info

    device_count = nvmlDeviceGetCount()

    info['driver_version'] = nvmlSystemGetDriverVersion().decode()
    info['device_count'] = device_count
    info['device'] = dict()
    for i in range(device_count):
        handle = nvmlDeviceGetHandleByIndex(i)
        memory = nvmlDeviceGetMemoryInfo(handle)

        info['device'][i] = dict()
        info['device'][i]['name'] = str(nvmlDeviceGetName(handle))

        info['device'][i]['memory'] = dict()

        info['device'][i]['memory']['total'] = str(size_in_gb(memory.total))

    nvmlShutdown()

    return info
Example #7
def avg_gpu_info(measure_duration, print_info=False):
    """
    Input:
        measure_duration: int
    Output:
        avg_free_memory: numpy.array[int], len=gpu_count
        avg_gpu_util: numpy.array[int], len=gpu_count
    """
    # Get average gpu status
    pynvml.nvmlInit()  # initialize NVML
    gpu_count = pynvml.nvmlDeviceGetCount()
    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(gpu_count)]
    avg_free_memory = [0.0] * gpu_count
    avg_gpu_util = [0.0] * gpu_count
    for _ in range(int(measure_duration)):
        for id, handle in enumerate(handles):
            avg_free_memory[id] = avg_free_memory[
                id] + pynvml.nvmlDeviceGetMemoryInfo(handle).free / 1e6
            avg_gpu_util[id] = avg_gpu_util[
                id] + pynvml.nvmlDeviceGetUtilizationRates(handle).gpu

        time.sleep(1)
    avg_free_memory = np.array(
        [int(memory / measure_duration) for memory in avg_free_memory])
    avg_gpu_util = np.array(
        [int(power / measure_duration) for power in avg_gpu_util])
    if print_info:
        present_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        logging.info(present_time)
        for gpu_id in range(gpu_count):
            gpu_info = 'GPU%d: gpu util:%d%% | free memory:%dMiB' % (
                gpu_id, avg_gpu_util[gpu_id], avg_free_memory[gpu_id])
            logging.info(gpu_info)
    return avg_free_memory, avg_gpu_util
Example #8
    def __check_gpu(self):
        """ Check if the process list contains GPU processes and determine if
        GPUs exists. Add GPU processes to the processes list if required."""
        if not self.exp.meta_data.plugin_list._contains_gpu_processes():
            return

        try:
            import pynvml as pv
        except:
            logging.debug("pyNVML module not found")
            raise Exception("pyNVML module not found")

        try:
            pv.nvmlInit()
            count = int(pv.nvmlDeviceGetCount())
            logging.debug("%s GPUs have been found.", count)

            if not self.exp.meta_data.get('test_state'):
                for i in range(count):
                    handle = pv.nvmlDeviceGetHandleByIndex(i)
                    if pv.nvmlDeviceGetComputeRunningProcesses(handle):
                        raise Exception("Unfortunately, GPU %i is busy. Try \
                            resubmitting the job to the queue." % i)
        except Exception as e:
            raise Exception("Unable to run GPU plugins: %s", e.message)
        self.__set_gpu_processes(count)
Example #9
def get_gpu_count() -> int:
    """
Special handling for detecting GPU availability: an approach
recommended by the NVIDIA RAPIDS engineering team, since `nvml`
bindings are difficult for Python libraries to keep updated.

This has the side-effect of importing the `cuDF` library, when
GPUs are available.

    returns:
count of available GPUs
    """
    try:
        import pynvml  # type: ignore  # pylint: disable=E0401
        pynvml.nvmlInit()

        gpu_count = pynvml.nvmlDeviceGetCount()

        if gpu_count > 0:
            import cudf  # type: ignore # pylint: disable=E0401,W0611,W0621
            # print(f"using {gpu_count} GPUs")

    except Exception:  # pylint: disable=W0703
        gpu_count = 0

    return gpu_count
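A minimal usage sketch (a hypothetical caller, not part of the original source): the zero count doubles as the CPU-fallback signal.

if get_gpu_count() > 0:
    import cudf as df_backend  # GPU path via RAPIDS
else:
    import pandas as df_backend  # CPU fallback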
Example #10
 def __init__(self, pid=None, api=None, process_q=None, notify_q=None):
     try:
         pynvml.nvmlInit()
         self.gpu_count = pynvml.nvmlDeviceGetCount()
     except pynvml.NVMLError as err:
         self.gpu_count = 0
     #self.run = run
     self._pid = pid
     self._api = api
     self._interface = interface.BackendSender(
         process_queue=process_q,
         notify_queue=notify_q,
     )
     self.sampler = {}
     self.samples = 0
     self._shutdown = False
     if psutil:
         net = psutil.net_io_counters()
         self.network_init = {
             "sent": net.bytes_sent,
             "recv": net.bytes_recv
         }
     else:
         wandb.termlog(
             "psutil not installed, only GPU stats will be reported.  Install with pip install psutil"
         )
     self._thread = threading.Thread(target=self._thread_body)
     self._thread.daemon = True
Example #11
def checkNVIDIA_GPU(xself=None):
    try:
        pynvml.nvmlInit()
        xself.gpuNumber = pynvml.nvmlDeviceGetCount()  # number of GPUs
        for i in range(xself.gpuNumber):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)  # i is the GPU id
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            #print(meminfo.total)  # total GPU memory
            #print(meminfo.used)   # values are in bytes; divide by 1024**2 for MiB
            #print(meminfo.free)   # free GPU memory
            m = int(meminfo.total * 100 / 1024**2) / 100

            xself.gpuInfo.append([i, m])
            if m >= 1:
                xself.gpuUsed = 1
                if m >= 2:
                    xself.face_recognition_mode = 'cnn'

            if m > xself.gpuMaxMemory:
                xself.gpuMaxMemory = m
    except:
        print(traceback.format_exc())

    print("\ngpu number=", xself.gpuNumber, '\nMaxMemory(GB)=',
          xself.gpuMaxMemory, '\nface recognition mode=',
          xself.face_recognition_mode, '\ngpu Use Batch=', xself.gpuUseBatch,
          "\n")
Example #12
def MemoryStatus():
    try:
        phymem = psutil.virtual_memory()
        line = "\t... RAM usage: %5s%% %6s/%s" % (
            phymem.percent, str(int(phymem.used / 1024 / 1024)) + "M",
            str(int(phymem.total / 1024 / 1024)) +
            "M") + ";  CPU usage: {:0>4.1f}".format(
                psutil.cpu_percent(1)) + "%"
        print(line)

        pynvml.nvmlInit()
        gpuNumber = pynvml.nvmlDeviceGetCount()  # number of GPUs
        for i in range(gpuNumber):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)  # i is the GPU id
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            #print(meminfo.total)  # total GPU memory
            #print(meminfo.used)   # values are in bytes; divide by 1024**2 for MiB
            #print(meminfo.free)   # free GPU memory

            print('\t... GPU#' + str(i) + ': total memory',
                  int(meminfo.total / 1024**2), 'M, used',
                  int(meminfo.used / 1024**2), 'M ',
                  int(meminfo.used / meminfo.total * 10000) / 100, '%, free',
                  int(meminfo.free / 1024**2), 'M\n')
    except:
        pass
Example #13
def get_freer_gpu(by="n_proc"):
    """Return the GPU index which has the largest avaiable memory

    Returns:
        int: the index of selected GPU.
    """

    if os.environ.get("CUDA_DEVICE_ORDER", None) != "PCI_BUS_ID":
        raise RuntimeError("Need CUDA_DEVICE_ORDER=PCI_BUS_ID to ensure "
                           "consistent ID")

    from pynvml import (
        nvmlInit,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetComputeRunningProcesses,
        nvmlDeviceGetMemoryInfo,
    )

    nvmlInit()
    n_devices = nvmlDeviceGetCount()
    gpu_id, gpu_state = None, None
    for i in range(0, n_devices):
        handle = nvmlDeviceGetHandleByIndex(i)
        if by == "n_proc":
            temp = -len(nvmlDeviceGetComputeRunningProcesses(handle))
        elif by == "free_mem":
            temp = nvmlDeviceGetMemoryInfo(handle).free
        else:
            raise ValueError("`by` can only be 'n_proc', 'free_mem'.")
        if gpu_id is None or gpu_state < temp:
            gpu_id, gpu_state = i, temp

    return gpu_id
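A usage sketch (assumed, not from the original module): both environment variables must be set before any CUDA context is created.

import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(get_freer_gpu(by="free_mem"))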
Example #14
def write_initial_gpu_info(redis_con: redis.Redis) -> None:
    """Write initial GPU information into database that are only required to be requested once on process startup.
    This is done to reduce the time requesting NVIDIAs interface by only requesting changing attributes. The following
    "static" attributes are requested in this method: name, total memory

    :param redis_con: an instance of Redis connection
    """
    redis_con.delete(RKEY_HWINFO_GPU_NAME)
    redis_con.delete(RKEY_HWINFO_GPU_MEM_AVAIL)
    if GPU_MODE:
        gpu_count = pynvml.nvmlDeviceGetCount()
        for i in range(16):
            if i >= gpu_count:
                redis_con.delete(RKEY_HWINFO_GPU_UTIL.format(i))
                redis_con.delete(RKEY_HWINFO_GPU_MEM_USED.format(i))
                redis_con.delete(RKEY_HWINFO_GPU_TEMP.format(i))
        for i in range(gpu_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            try:
                redis_con.lpush(RKEY_HWINFO_GPU_NAME, pynvml.nvmlDeviceGetName(handle).decode("utf-8"))
            except pynvml.NVMLError:
                redis_con.lpush(RKEY_HWINFO_GPU_NAME, "ERROR")
            try:
                redis_con.lpush(RKEY_HWINFO_GPU_MEM_AVAIL, pynvml.nvmlDeviceGetMemoryInfo(handle).total)
            except pynvml.NVMLError:
                redis_con.lpush(RKEY_HWINFO_GPU_MEM_AVAIL, -1)
Example #15
def _update_nvml_static_info():
    driver_version = pynvml.nvmlSystemGetDriverVersion().decode()
    nvml_version = pynvml.nvmlSystemGetNVMLVersion().decode()
    device_count = pynvml.nvmlDeviceGetCount()
    devices = []
    devices_handles = []
    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle).decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        devices.append({
            'index': i,
            'name': name,
            'memory': {
                'total': mem_info.total
            }
        })
        devices_handles.append(handle)
    _static_info['public'].update({
        'gpu': {
            'driver': driver_version,
            'nvml': nvml_version,
            'devices': devices
        }
    })
    _static_info['private'].update({'gpu': {'handles': devices_handles}})
Example #16
    def __check_gpu(self):
        """ Check if the process list contains GPU processes and determine if
        GPUs exists. Add GPU processes to the processes list if required."""
        if not self.exp.meta_data.plugin_list._contains_gpu_processes():
            return

        try:
            import pynvml as pv
        except:
            logging.debug("pyNVML module not found")
            raise Exception("pyNVML module not found")

        try:
            pv.nvmlInit()
            count = int(pv.nvmlDeviceGetCount())
            logging.debug("%s GPUs have been found.", count)

            if not self.exp.meta_data.get('test_state'):
                for i in range(count):
                    handle = pv.nvmlDeviceGetHandleByIndex(i)
                    if pv.nvmlDeviceGetComputeRunningProcesses(handle):
                        raise Exception("Unfortunately, GPU %i is busy. Try \
                            resubmitting the job to the queue." % i)
        except Exception as e:
            raise Exception("Unable to run GPU plugins: %s", e.message)
        self.__set_gpu_processes(count)
Example #17
    def __init__(self, *, set_cuda_visible=True):
        """ 获得各个gpu内存使用信息

        :param set_cuda_visible: 是否根据 环境变量 CUDA_VISIBLE_DEVICES 重新计算gpu的相对编号
        """
        import pynvml

        records = []
        columns = ['origin_id',  # original device id
                   'total',  # total memory
                   'used',  # memory in use
                   'free']  # free memory

        try:
            # 1. initialize NVML
            pynvml.nvmlInit()

            # 2. absolute and relative index of each GPU card
            if set_cuda_visible and 'CUDA_VISIBLE_DEVICES' in os.environ:
                idxs = re.findall(r'\d+', os.environ['CUDA_VISIBLE_DEVICES'])
                idxs = [int(v) for v in idxs]
            else:
                cuda_num = pynvml.nvmlDeviceGetCount()
                idxs = list(range(cuda_num))  # if not restricted, report every card

            # 3. query memory usage of each candidate GPU
            for i in idxs:
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                records.append([i, meminfo.total, meminfo.used, meminfo.free])
        except (FileNotFoundError, pynvml.nvml.NVMLError_LibraryNotFound) as e:
            # note: a missing nvml.dll does not necessarily mean there is no GPU
            pass

        self.stat = pd.DataFrame.from_records(records, columns=columns)
Example #18
def gpus_available() -> Dict[int, float]:
    if not torch.cuda.is_available():
        return dict()
    try:
        nvmlInit()
        gpus = {}
        visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None)
        if visible_devices is None:
            visible_devices = list(range(nvmlDeviceGetCount()))
        else:
            visible_devices = {int(x.strip()) for x in visible_devices.split(',')}
        for i, real_id in enumerate(visible_devices):
            h = nvmlDeviceGetHandleByIndex(real_id)
            info = nvmlDeviceGetMemoryInfo(h)
            total = info.total
            free = info.free
            ratio = free / total
            gpus[i] = ratio
            # print(f'total    : {info.total}')
            # print(f'free     : {info.free}')
            # print(f'used     : {info.used}')
            # t = torch.cuda.get_device_properties(0).total_memory
            # c = torch.cuda.memory_cached(0)
            # a = torch.cuda.memory_allocated(0)
            # print(t, c, a)
        nvmlShutdown()
        return dict(sorted(gpus.items(), key=lambda x: x[1], reverse=True))
    except Exception as e:
        logger.debug(f'Failed to get gpu info due to {e}')
        return dict((i, 1.0) for i in range(torch.cuda.device_count()))
Example #19
def get_device_util(device_id=None):
    if device_id is not None:
        return _get_device_util(device_id)
    else:
        return [
            _get_device_util(i) for i in range(pynvml.nvmlDeviceGetCount())
        ]
Example #20
def get_device_free(device_id: int = None):
    if device_id is not None:
        return _get_device_free(device_id)
    else:
        return [
            _get_device_free(i) for i in range(pynvml.nvmlDeviceGetCount())
        ]
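The private helpers `_get_device_util` and `_get_device_free` are not included in these examples; a minimal sketch of what they might look like (an assumption, not the original implementation):

def _get_device_util(device_id: int) -> float:
    # GPU utilization (percent) of a single device.
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    return pynvml.nvmlDeviceGetUtilizationRates(handle).gpu


def _get_device_free(device_id: int) -> float:
    # Fraction of device memory that is currently free.
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return info.free / info.total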
Example #21
def test_get_all_names():
    device_count = m.nvmlDeviceGetCount()
    for index in range(device_count):
        handle = m.nvmlDeviceGetHandleByIndex(index)
        name = m.nvmlDeviceGetName(handle)
        assert isinstance(name, string_types)
        assert len(name) > 0
Example #22
def auto_select_gpu():
    """Select gpu which has largest free memory"""
    if HAS_NVML:
        pynvml.nvmlInit()
        deviceCount = pynvml.nvmlDeviceGetCount()
        largest_free_mem = 0
        largest_free_idx = 0
        for i in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            if info.free > largest_free_mem:
                largest_free_mem = info.free
                largest_free_idx = i
        pynvml.nvmlShutdown()
        largest_free_mem = largest_free_mem / 1024. / 1024.  # Convert to MB

        idx_to_gpu_id = {}
        for i in range(deviceCount):
            idx_to_gpu_id[i] = '{}'.format(i)

        gpu_id = idx_to_gpu_id[largest_free_idx]
        logging.info(
            'Using largest free memory GPU {} with free memory {}MB'.format(
                gpu_id, largest_free_mem))
        return gpu_id
    else:
        logging.info(
            'nvidia-ml-py is not installed, automatic gpu selection is disabled!'
        )
        return '0'
Example #23
    def __check_gpu(self):
        """ Check if the process list contains GPU processes and determine if
        GPUs exists. Add GPU processes to the processes list if required."""
        if not self.exp.meta_data.plugin_list._contains_gpu_processes():
            return
        try:
            import pynvml as pv
        except:
            logging.debug("pyNVML module not found")
            raise Exception("pyNVML module not found")
        try:
            pv.nvmlInit()
            count = int(pv.nvmlDeviceGetCount())
            logging.debug("%s GPUs have been found.", count)
        except:
            logging.debug("No GPUs have been found.")
            raise Exception("The process list contains GPU plugins, but "
                            " no GPUs have been found.")

        processes = self.exp.meta_data.get_meta_data('processes')
        if not [i for i in processes if 'GPU' in i]:
            logging.debug("GPU processes missing. GPUs found so adding them.")
            cpus = ['CPU'+str(i) for i in range(count)]
            gpus = ['GPU'+str(i) for i in range(count)]
            for i in range(min(count, len(processes))):
                processes[processes.index(cpus[i])] = gpus[i]
            self.exp.meta_data.set_meta_data('processes', processes)
Example #24
def temperatures():
    ret = {}
    for i in range(nv.nvmlDeviceGetCount()):
        hdl = nv.nvmlDeviceGetHandleByIndex(i)
        temp = nv.nvmlDeviceGetTemperature(hdl, nv.NVML_TEMPERATURE_GPU)
        ret[i] = temp
    return ret
Example #25
def autoset_nvgpu(metric="memory", k=1):
    """autoset_nvgpu
    automatically set NVIDIA GPU device

    Args:
        metric (str): memory/utilization
            select the GPU with min(metric)
        k (int): num. of selected devices
    """
    pynvml.nvmlInit()
    gpunum = pynvml.nvmlDeviceGetCount()
    assert (k <= gpunum)
    metric_list = []
    for idx in range(gpunum):
        handle = pynvml.nvmlDeviceGetHandleByIndex(idx)

        if metric in ["util", "utilization"]:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            metric_list.append((util_rate, idx))
        else:
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            mem_use_rate = 1.0 - mem_info.free / mem_info.total
            metric_list.append((mem_use_rate, idx))
    # sort the devices with ascending metric
    metric_list = sorted(metric_list, key=lambda x: x[0])
    selected_idx = [str(x[1]) for x in metric_list[:k]]
    # set the visible devices
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(selected_idx)
Example #26
def getFreeId():
    import pynvml

    pynvml.nvmlInit()

    def getFreeRatio(id):
        handle = pynvml.nvmlDeviceGetHandleByIndex(id)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        # print("Memory Total: ",info.total/(1024*1024))
        # print("Memory Free: ",info.free/(1024*1024))
        # print("Memory Used: ",info.used/(1024*1024))
        use = pynvml.nvmlDeviceGetUtilizationRates(handle)
        ratio = 0.5 * (float(use.gpu + float(use.memory)))
        return ratio

    deviceCount = pynvml.nvmlDeviceGetCount()
    available = []
    for i in range(deviceCount):
        if getFreeRatio(i) < 70:
            available.append(i)
    gpus = ''
    for g in available:
        gpus = gpus + str(g) + ','
    gpus = gpus[:-1]
    return gpus
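A typical usage sketch (hypothetical; the original caller is not shown): export the result before any CUDA framework is initialized.

import os

os.environ['CUDA_VISIBLE_DEVICES'] = getFreeId()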
Example #27
def getGPUstate():
    """
    pip install nvidia-ml-py3
    :return: one status line per GPU, joined into a single string
    """
    meminfo = {}
    infoStr = ""
    try:
        pynvml.nvmlInit()
        devicecount = pynvml.nvmlDeviceGetCount()
        for num in range(devicecount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(num)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            meminfo[
                num] = "Device: {} , {} / {} {:.2f}%, free memory:{}".format(
                    num, info.used, info.total, info.used / info.total * 100,
                    info.free)
        for i in range(len(meminfo)):
            infoStr += meminfo[i] + "\n"
        #mainlog(infoStr,'info')
        return infoStr
    except Exception as e:
        mainlog(e, 'error')
        #print("error happen in getGPUstate:"+str(e))
        return "出现错误 Error:" + str(e)
Example #28
def get_num_gpus():
    import pynvml

    pynvml.nvmlInit()
    ngpus = pynvml.nvmlDeviceGetCount()
    pynvml.nvmlShutdown()
    return ngpus
Example #29
    def collect_metrics(self):
        """
    Collect NVIDIA GPU metrics (eg: Temperature, Power-Consumption, fan-speed, etc.)
    """
        data_list = []
        for gpu_num in range(nvmlDeviceGetCount()):
            handle = nvmlDeviceGetHandleByIndex(gpu_num)
            device_name = DEVICE_NAME_FORMAT % gpu_num
            power_usage = float(nvmlDeviceGetPowerUsage(handle)) / 1000.0
            fan_speed = nvmlDeviceGetFanSpeed(handle)
            temperature = nvmlDeviceGetTemperature(handle,
                                                   NVML_TEMPERATURE_GPU)
            data_list.append({
                'measurement': device_name,
                'tags': {
                    'host': 'minar',
                    'gpu': device_name
                },
                'fields': {
                    'power_usage': power_usage,
                    'fan_speed': fan_speed,
                    'temperature': temperature
                }
            })
            time.sleep(PERIOD_SECS)

        return data_list
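The module-level constants `DEVICE_NAME_FORMAT` and `PERIOD_SECS` are not shown in this snippet; plausible definitions (assumptions for illustration only) would be:

DEVICE_NAME_FORMAT = "nvidia_gpu_%d"  # measurement name per GPU index
PERIOD_SECS = 1                       # seconds to wait between per-GPU samples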
Example #30
 def nvml_reinit(self) -> None:
     self.__handle = []
     with contextlib.suppress(Exception):
         pynvml.nvmlInit()
         self.__ngpus = pynvml.nvmlDeviceGetCount()
         for i in range(self.__ngpus):
             self.__handle.append(pynvml.nvmlDeviceGetHandleByIndex(i))
Example #31
def _pynvml_handles():
    global handles
    if handles is None:
        pynvml.nvmlInit()
        count = pynvml.nvmlDeviceGetCount()
        handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(count)]
    return handles
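This function caches handles in a module-level global that the snippet does not show; the assumed declaration is simply:

handles = None  # lazily populated list of NVML device handles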
Example #32
def get_free_device_ids(threshold=0.5, device_ids=None):
    device_ids = set(device_ids) if device_ids else set(
        range(pynvml.nvmlDeviceGetCount()))
    return [
        _id for _id, _f in get_device_free()
        if _f >= threshold and _id in device_ids
    ]
Example #33
    def __check_gpu(self):
        """ Check if the process list contains GPU processes and determine if
        GPUs exists. Add GPU processes to the processes list if required."""
        if not self.exp.meta_data.plugin_list._contains_gpu_processes():
            return
        try:
            import pynvml as pv
        except:
            logging.debug("pyNVML module not found")
            raise Exception("pyNVML module not found")
        try:
            pv.nvmlInit()
            count = int(pv.nvmlDeviceGetCount())
            logging.debug("%s GPUs have been found.", count)
        except:
            logging.debug("No GPUs have been found.")
            raise Exception("The process list contains GPU plugins, but "
                            " no GPUs have been found.")

        processes = self.exp.meta_data.get_meta_data('processes')
        if not [i for i in processes if 'GPU' in i]:
            logging.debug("GPU processes missing. GPUs found so adding them.")
            cpus = ['CPU' + str(i) for i in range(count)]
            gpus = ['GPU' + str(i) for i in range(count)]
            for i in range(min(count, len(processes))):
                processes[processes.index(cpus[i])] = gpus[i]
            self.exp.meta_data.set_meta_data('processes', processes)
Example #34
def get_gpu_temperatures():
    nvmlInit()
    gpus = dict()
    for i in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(i)
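        # the second argument (0) is the sensor type, i.e. NVML_TEMPERATURE_GPU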
        gpus[i] = int(nvmlDeviceGetTemperature(handle, 0))

    nvmlShutdown()
    return gpus
Example #35
def get_available_gpus():
    try:
        import pynvml as pv
    except:
        logging.debug("pyNVML module not found")
        raise Exception("pyNVML module not found")
    pv.nvmlInit()
    count = int(pv.nvmlDeviceGetCount())
    return pv, count
Example #36
 def get_device_count(self):
     """ Return count of Nvidia devices """
     if IS_MACOS:
         self.device_count = pynvx.cudaDeviceGetCount(ignore=True)
     else:
         try:
             self.device_count = pynvml.nvmlDeviceGetCount()
         except pynvml.NVMLError:
             self.device_count = 0
     if self.logger:
         self.logger.debug("GPU Device count: %s", self.device_count)
Example #37
    def _crawl_in_system(self):
        '''
        nvidia-smi returns following: MEMORY, UTILIZATION, ECC, TEMPERATURE,
        POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS,
        PAGE_RETIREMENT, ACCOUNTING

        currently, following are requested based on dlaas requirements:
            utilization.gpu, utilization.memory,
            memory.total, memory.free, memory.used
        nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
            memory.total,memory.free,memory.used --format=csv,noheader,nounits
        '''

        if self._init_nvml() == -1:
            return

        self.inspect_arr = exec_dockerps()

        num_gpus = pynvml.nvmlDeviceGetCount()

        for gpuid in range(0, num_gpus):
            gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
            temperature = pynvml.nvmlDeviceGetTemperature(
                gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
            memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
            mem_total = memory.total / 1024 / 1024
            mem_used = memory.used / 1024 / 1024
            mem_free = memory.free / 1024 / 1024
            power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000
            power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(
                gpuhandle) / 1000
            util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
            util_gpu = util.gpu
            util_mem = util.memory
            entry = {
                'utilization': {'gpu': util_gpu, 'memory': util_mem},
                'memory': {'total': mem_total, 'free': mem_free,
                           'used': mem_used},
                'temperature': temperature,
                'power': {'draw': power_draw, 'limit': power_limit}
            }
            key = self._get_feature_key(gpuhandle, gpuid)
            if gpuid == num_gpus - 1:
                self._shutdown_nvml()

            yield (key, entry, 'gpu')

        return
Example #38
def getFreeId():
    import pynvml 

    pynvml.nvmlInit()
    def getFreeRatio(id):
        handle = pynvml.nvmlDeviceGetHandleByIndex(id)
        use = pynvml.nvmlDeviceGetUtilizationRates(handle)
        ratio = 0.5*(float(use.gpu+float(use.memory)))
        return ratio

    deviceCount = pynvml.nvmlDeviceGetCount()
    available = []
    for i in range(deviceCount):
        if getFreeRatio(i)<70:
            available.append(i)
    gpus = ''
    for g in available:
        gpus = gpus+str(g)+','
    gpus = gpus[:-1]
    return gpus
Example #39
    def collect_via_pynvml(self, stats_config):
        """
        Use pynvml python binding to collect metrics
        :param stats_config:
        :return:
        """
        try:
            NVML_TEMPERATURE_GPU = 0
            pynvml.nvmlInit()
            device_count = pynvml.nvmlDeviceGetCount()

            for device_index in xrange(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
                memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)

                metrics = {
                    'memory.total': memoryInfo.total / 1024 / 1024,
                    'memory.used': memoryInfo.used / 1024 / 1024,
                    'memory.free': memoryInfo.free / 1024 / 1024,
                    'utilization.gpu': utilizationRates.gpu,
                    'utilization.memory': utilizationRates.memory,
                    'temperature.gpu':
                        pynvml.nvmlDeviceGetTemperature(handle,
                                                        NVML_TEMPERATURE_GPU)
                }

                for stat_name in stats_config[1:]:
                    metric = metrics.get(stat_name)
                    if metric:
                        metric_name = 'gpu_{index}.{stat_name}'.format(
                            index=str(device_index),
                            stat_name=stat_name
                        )
                        self.publish(metric_name, metric)
        finally:
            pynvml.nvmlShutdown()
Example #40
 def init(self):
     
     self.util_history = []
     self.temp_history = []
     pynvml.nvmlInit()
     self.gpu_handles = []
     self.deviceCount = pynvml.nvmlDeviceGetCount()
     
     for i in range(self.deviceCount):
         self.gpu_handles.append(pynvml.nvmlDeviceGetHandleByIndex(i))
     
     self.cpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6)
     self.cpu_prog_bars = []
     self.gpu_boxes = []
     self.gpu_prog_bars = []
     
     self.prev_idle = []
     self.prev_total = []
     self.idle = []
     self.total = []
     
     #---cpu_box---
     try:
         stat = open("/proc/stat")
         
         statlines = stat.read().splitlines()
         stat.close()
         
         self.corecount = -1
         
         for line in statlines:
             if (line[0:2] == "cp"):
                 self.corecount+= 1
             else:
                 break
         
     except IOError:
         print("Problem opening /proc/stat, exiting..")
         pynvml.nvmlShutdown()
         quit()
     
     for i in range(self.corecount):
         self.cpu_prog_bars.append(Gtk.ProgressBar(text="CPU %d" % i, show_text=True))
         self.cpu_box.pack_start(self.cpu_prog_bars[i], True, True, 0)
         
         self.prev_idle.append(0)
         self.prev_total.append(0)
         self.idle.append(0)
         self.total.append(0)
     
     #---gpu_boxes---
     for i in range(self.deviceCount):
         product_name = pynvml.nvmlDeviceGetName(self.gpu_handles[i])
         product_name = product_name.decode('utf-8')
         
         gpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=8)
         
         label = Gtk.Label(product_name)
         
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="GPU", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Utilization", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Usage", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Temperature", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Encoder", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Decoder", show_text=True))
         
         gpu_box.pack_start(label, True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +1], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +2], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +3], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +4], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +5], True, True, 0)
         
         self.gpu_boxes.append(gpu_box)
     
     #---proc---
     proc_liststore = Gtk.ListStore(int, str, int)
     
     self.tree = Gtk.TreeView(model=proc_liststore)
     
     renderer_pid = Gtk.CellRendererText()
     column_pid = Gtk.TreeViewColumn("Proccess ID", renderer_pid, text=0)
     column_pid.set_resizable(True)
     self.tree.append_column(column_pid)
     
     renderer_path = Gtk.CellRendererText()
     column_path = Gtk.TreeViewColumn("Command Line", renderer_path, text=1)
     column_path.set_resizable(True)
     column_path.set_fixed_width(250)
     self.tree.append_column(column_path)
     
     renderer_mem = Gtk.CellRendererText()
     column_mem = Gtk.TreeViewColumn("Memory (MiB)", renderer_mem, text=2)
     column_mem.set_resizable(True)
     self.tree.append_column(column_mem)
Example #41
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()    # for python3, to unicode
            return b

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        # 2. additional info (driver version, etc).
        try:
            driver_version = _decode(N.nvmlSystemGetDriverVersion())
        except N.NVMLError:
            driver_version = None    # N/A

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list, driver_version=driver_version)
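The `MB` constant used for the byte conversions above is defined elsewhere in gpustat; the assumed value is:

MB = 1024 * 1024  # bytes per MiB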
Example #42
def identify_cards():
    devices = {}
    try:
        import pynvml
        from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex
        deviceCount = None
        try:
            nvmlInit()
            deviceCount = nvmlDeviceGetCount()
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                props = {}
                def meminfo(memory):
                    return {
                            "total"  : int(memory.total),
                            "free"   : int(memory.free),
                            "used"   : int(memory.used),
                            }
                def pciinfo(pci):
                    i = {}
                    for x in ("domain", "bus", "device", "pciDeviceId", "pciSubSystemId"):
                        try:
                            i[x] = int(getattr(pci, x))
                        except:
                            pass
                    try:
                        i["busId"] = str(pci.busId)
                    except:
                        pass
                    return i
                for prop, fn_name, args, conv in (
                       ("name",                     "nvmlDeviceGetName",                    (),     str),
                       ("serial",                   "nvmlDeviceGetSerial",                  (),     str),
                       ("uuid",                     "nvmlDeviceGetUUID",                    (),     str),
                       ("pci",                      "nvmlDeviceGetPciInfo",                 (),     pciinfo),
                       ("memory",                   "nvmlDeviceGetMemoryInfo",              (),     meminfo),
                       ("pcie-link-generation-max", "nvmlDeviceGetMaxPcieLinkGeneration",   (),     int),
                       ("pcie-link-width-max",      "nvmlDeviceGetMaxPcieLinkWidth",        (),     int),
                       ("pcie-link-generation",     "nvmlDeviceGetCurrPcieLinkGeneration",  (),     int),
                       ("pcie-link-width",          "nvmlDeviceGetCurrPcieLinkWidth",       (),     int),
                       ("clock-info-graphics",      "nvmlDeviceGetClockInfo",               (0,),   int),
                       ("clock-info-sm",            "nvmlDeviceGetClockInfo",               (1,),   int),
                       ("clock-info-mem",           "nvmlDeviceGetClockInfo",               (2,),   int),
                       ("clock-info-graphics-max",  "nvmlDeviceGetMaxClockInfo",            (0,),   int),
                       ("clock-info-sm-max",        "nvmlDeviceGetMaxClockInfo",            (1,),   int),
                       ("clock-info-mem-max",       "nvmlDeviceGetMaxClockInfo",            (2,),   int),
                       ("fan-speed",                "nvmlDeviceGetFanSpeed",                (),     int),
                       ("temperature",              "nvmlDeviceGetTemperature",             (0,),   int),
                       ("power-state",              "nvmlDeviceGetPowerState",              (),     int),
                       ("vbios-version",            "nvmlDeviceGetVbiosVersion",            (),     str),
                       ):
                    try:
                        fn = getattr(pynvml, fn_name)
                        v = fn(handle, *args)
                        if conv:
                            v = conv(v)
                        props[prop] = v
                    except Exception as e:
                        log("identify_cards() cannot query %s using %s on device %i with handle %s: %s", prop, fn, i, handle, e)
                        continue
                devices[i] = props
            #unitCount = nvmlUnitGetCount()
            #log.info("unitCount=%s", unitCount)
        except Exception as e:
            log("identify_cards() pynvml error", exc_info=True)
            log.warn("Warning: failed to query the NVidia cards via NVML:")
            log.warn(" %s", e)
        finally:
            if deviceCount is not None:
                nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return devices
Example #43
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.info('nvml.util.encoder %s' % long(util_encoder[0]))
                self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.info('nvml.util.decoder %s' % long(util_decoder[0]))
                self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example #44
 def __calculate_GPU_index(self, nNodes):
     pv.nvmlInit()
     nGPUs = int(pv.nvmlDeviceGetCount())
     rank = self.new_comm.Get_rank()
     return int(rank/nNodes) % nGPUs
Example #45
 def get_device_count(self):
     """ Return count of Nvidia devices """
     try:
         self.device_count = pynvml.nvmlDeviceGetCount()
     except pynvml.NVMLError:
         self.device_count = 0
Example #46
def get_device_handles():
    """Get a list of NVML device handles, one per device.

    Can throw NVMLError.
    """
    return [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(pynvml.nvmlDeviceGetCount())]
Example #47
def count_gpus():
    nvmlInit()
    count = nvmlDeviceGetCount()
    nvmlShutdown()
    return count
Example #48
	def do_GET(self):
		#checks if the server is alive
		if self.path == '/test':
			send_header(self)
			self.wfile.write(bytes('passed<br>', 'utf-8'))
			self.wfile.write(bytes('server is responding', 'utf-8'))
		#returns the running processes
		if self.path == '/runningProcesses':
			send_header(self)
			#send response:
			if modules['psutil']:
				for proc in psutil.process_iter():
					try:
						pinfo = proc.as_dict(attrs=['pid', 'name'])
					except psutil.NoSuchProcess:
						pass
					print(pinfo)
					self.wfile.write(bytes(str(pinfo), 'utf-8'))
			else:
				self.wfile.write(bytes('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8'))
		#returns the CPU utilization and number of cores
		elif self.path == '/cpuInfo':
			send_header(self)
			#get CPU info
			cpuInfo = {}
			if modules['psutil']:
				cpuInfo['CPU Utilization'] = int(psutil.cpu_percent())
				cpuInfo['CPU Cores'] = int(psutil.cpu_count())
			else:
				cpuInfo['Missing Python module'] = 'I am sorry but the Python module psutil is not installed. Therefore the number of CPU cores cannot be shown.'
			json_dump = json.dumps(cpuInfo)
			self.wfile.write(bytes(json_dump, 'utf-8'))
			#get GPU info
			if modules['pynvml']:
				try:
					pynvml.nvmlInit()
					gpus = pynvml.nvmlDeviceGetCount()
				except:
					gpus = 0
					self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8'))
			else:
				gpus = 0
				self.wfile.write(bytes('I am sorry but the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8'))
			for i in range(gpus):
				handle = pynvml.nvmlDeviceGetHandleByIndex(i)
				self.wfile.write(bytes("<br>GPU " + str(i + 1) + ": " + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8'))
				try:
					self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, 0)) + '&deg;C', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8'))
				try:
					gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
					self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8'))
					self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free/gpu_mem.total*100)) + '%', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve memory information', 'utf-8'))
			if gpus > 0:
				try:
					pynvml.nvmlShutdown()
				except:
					pass

		elif self.path == '/availableComputers':
			send_header(self)
			s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
			s.connect(('google.com', 0))
			global myownsocket
			myownsocket = s.getsockname()[0]
			port = 8003
			available_computers = []
			for i in range(1, 256):
				host = '192.168.178.' + str(i) 
				sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
				sock.settimeout(0.2)
				try:
					alive = sock.connect_ex((host, port))
				except:
					alive = -1
				if alive == 0:
					print('available')
					
					available_computers.append(host)
				else:
					print('not available')
				print(host)
			self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8'))
			cmd_txt = """@echo off

call &quot;C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat&quot;

echo ##### start_rendering

xsibatch -render &quot;Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn&quot; -frames #1#-#2# -pass &quot;BEAUTY&quot; -skip on -verbose on

echo ##### rendering_done """
			self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8'))
			self.wfile.write(bytes('<table border="1">\n', 'utf-8'))
			self.wfile.write(bytes('<tr>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8'))

			available_cpus = {}
			for host in available_computers:
				available_cpus[host] = abs(get_cpu_cores(host))

			total_cpus = sum(available_cpus.values())

			frame_list = {}
			start_frame = 0
			for host in available_computers:
				start_frame += 1
				frame_list[host] = [start_frame]
				start_frame =  start_frame + int(100 * (available_cpus[host] / total_cpus))
				if start_frame > 100:
					start_frame = 100
				frame_list[host].append(start_frame)
			index = 0
			for host in available_computers:
				index += 1
				self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
				self.wfile.write(bytes(host, 'utf-8'))
				self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('</tr>', 'utf-8'))
			index = 2
			self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
			self.wfile.write(bytes(host, 'utf-8'))
			self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
			self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
			self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
			self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
			self.wfile.write(bytes('</tr>', 'utf-8'))
				
			self.wfile.write(bytes('</table>\n', 'utf-8'))
			self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8'))
			self.wfile.write(bytes('</form>\n', 'utf-8'))
			self.wfile.write(bytes('</body>\n', 'utf-8'))
			self.wfile.write(bytes('</html>\n', 'utf-8'))
		elif self.path == '/execute_job':
			send_header(self)
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)

		elif '/submit_job' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)
			#print(parsed)
			print(parameters)
			self.wfile.write(bytes('<body>', 'utf-8'))
			for index in range(1, 100):
				if not parameters.get('host' + str(index)).strip():
					pass
				elif not parameters.get('start' + str(index)).strip():
					pass
				elif not parameters.get('end' + str(index)).strip():
					pass
				elif parameters.get('command'):
					cmd_txt = parameters['command'][0].replace('#1#', parameters['start' + str(index)][0].strip())
					cmd_txt = cmd_txt.replace('#2#', parameters['end' + str(index)][0].strip())
					self.wfile.write(bytes(escape(cmd_txt), 'utf-8'))
					self.wfile.write(bytes('<br>', 'utf-8'))
					print(cmd_txt)
			self.wfile.write(bytes('</body></html>', 'utf-8'))
		elif '/shutdown' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("Server will be shut down now......", 'utf-8'))
			server.shutdown()
			sys.exit()

		else:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("<br>", 'utf-8'))
			self.wfile.write(bytes(self.path, 'utf-8'))
			print(self.path)