Example #1
 def measure_gpu_usage(self):
     from py3nvml.py3nvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, \
                          nvmlDeviceGetMemoryInfo, nvmlDeviceGetName, nvmlShutdown, NVMLError
     max_gpu_usage = []
     gpu_name = []
     try:
         nvmlInit()
         deviceCount = nvmlDeviceGetCount()
         max_gpu_usage = [0 for i in range(deviceCount)]
         gpu_name = [
             nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
             for i in range(deviceCount)
         ]
         while True:
             for i in range(deviceCount):
                 info = nvmlDeviceGetMemoryInfo(
                     nvmlDeviceGetHandleByIndex(i))
                 max_gpu_usage[i] = max(max_gpu_usage[i],
                                        info.used / 1024**2)
             sleep(0.005)  # 5ms
             if not self.keep_measuring:
                 break
         nvmlShutdown()
         return [{
             "device_id": i,
             "name": gpu_name[i],
             "max_used_MB": max_gpu_usage[i]
         } for i in range(deviceCount)]
     except NVMLError as error:
         if not self.silent:
             self.logger.error(
                 "Error fetching GPU information using nvml: %s", error)
         return None
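The sampler above never returns on its own: it loops until `self.keep_measuring` is cleared, so it is meant to run concurrently with the workload being profiled. A minimal usage sketch, assuming a hypothetical `MemoryMonitor` class that owns `measure_gpu_usage`, `keep_measuring`, `silent`, and `logger`:

from concurrent.futures import ThreadPoolExecutor

def profile_gpu_memory(workload):
    monitor = MemoryMonitor()        # hypothetical owner of measure_gpu_usage
    monitor.keep_measuring = True
    with ThreadPoolExecutor() as pool:
        future = pool.submit(monitor.measure_gpu_usage)
        try:
            workload()               # the code whose peak GPU memory we want
        finally:
            monitor.keep_measuring = False
    return future.result()           # per-device peak-usage dicts, or None on NVML errors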
Example #2
 def _get_nvml_cuda_mapping():
     vis = os.environ.get('CUDA_VISIBLE_DEVICES')
     if vis is None:
         return [
             py3nvml.nvmlDeviceGetHandleByIndex(i)
             for i in range(torch.cuda.device_count())
         ]
     return [
         py3nvml.nvmlDeviceGetHandleByIndex(i)
         for i in map(int, vis.split(','))
     ]
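Note that NVML enumerates physical GPUs while CUDA honours `CUDA_VISIBLE_DEVICES`, so this mapping assumes the variable holds integer indices (not UUIDs) and that `CUDA_DEVICE_ORDER=PCI_BUS_ID` is set so both libraries agree on device order (the same caveat noted in Example #8 below). A hedged usage sketch:

import torch
from py3nvml import py3nvml

# Sketch: report NVML memory info for the GPU backing the current torch device.
py3nvml.nvmlInit()
handles = _get_nvml_cuda_mapping()                 # CUDA index -> NVML handle
info = py3nvml.nvmlDeviceGetMemoryInfo(handles[torch.cuda.current_device()])
print(f"used {info.used / 1024**2:.1f} MiB of {info.total / 1024**2:.1f} MiB")
py3nvml.nvmlShutdown()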
Example #3
        def measure_gpu_usage(self) -> Optional[List[Dict[str, Any]]]:
            from py3nvml.py3nvml import (
                NVMLError,
                nvmlDeviceGetCount,
                nvmlDeviceGetHandleByIndex,
                nvmlDeviceGetMemoryInfo,
                nvmlDeviceGetName,
                nvmlInit,
                nvmlShutdown,
            )

            max_gpu_usage = []
            gpu_name = []
            try:
                nvmlInit()
                device_count = nvmlDeviceGetCount()
                if not isinstance(device_count, int):
                    logger.error(
                        f"nvmlDeviceGetCount result is not integer: {device_count}"
                    )
                    return None

                max_gpu_usage = [0 for i in range(device_count)]
                gpu_name = [
                    nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
                    for i in range(device_count)
                ]
                while True:
                    for i in range(device_count):
                        info = nvmlDeviceGetMemoryInfo(
                            nvmlDeviceGetHandleByIndex(i))
                        if isinstance(info, str):
                            logger.error(
                                f"nvmlDeviceGetMemoryInfo returns str: {info}")
                            return None
                        max_gpu_usage[i] = max(max_gpu_usage[i],
                                               info.used / 1024**2)
                    sleep(0.005)  # 5ms
                    if not self.keep_measuring:
                        break
                nvmlShutdown()
                return [{
                    "device_id": i,
                    "name": gpu_name[i],
                    "max_used_MB": max_gpu_usage[i],
                } for i in range(device_count)]
            except NVMLError as error:
                logger.error("Error fetching GPU information using nvml: %s",
                             error)
                return None
Example #4
 def __init__(self):
     """Constructor."""
     nvml.nvmlInit()
     self._device_count = self.get_device_count()
     self._device_handlers = list()
     for i in range(self._device_count):
         self._device_handlers.append(nvml.nvmlDeviceGetHandleByIndex(i))
Example #5
def query_gpu(index: int) -> Tuple[int, int, int]:

    h = nvml.nvmlDeviceGetHandleByIndex(index)

    #
    # Get memory info.
    #
    mem_info = try_get_info(nvml.nvmlDeviceGetMemoryInfo, h)
    if mem_info != 'N/A':
        mem_used = mem_info.used >> 20
        mem_total = mem_info.total >> 20
    else:
        mem_used = 0
        mem_total = 0

    #
    # Get utilization info
    #
    util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, h)
    if util != 'N/A':
        gpu_util = util.gpu
    else:
        gpu_util = 0

    return mem_used, mem_total, gpu_util
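Several snippets here (this one, plus Examples #10, #22, #27, and #29) rely on a `try_get_info` helper that is never shown. A plausible minimal version, offered as an assumption rather than the original helper, simply swallows `NVMLError` and returns a caller-supplied default:

from py3nvml import py3nvml as nvml

def try_get_info(fn, handle, default='N/A'):
    # Run an NVML query; fall back to `default` when the driver or device
    # does not support it (assumed behaviour of the missing helper).
    try:
        return fn(handle)
    except nvml.NVMLError:
        return default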
Example #6
 def inference_speed_memory(self, batch_size, seq_length):
     # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
     key = jax.random.PRNGKey(0)
     input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
     @jax.jit
     def ref_step():
         out = self.model(input_ids=input_ids)
         return out[0]
     if jax.local_devices()[0].platform == 'gpu':
         nvml.nvmlInit()
         ref_step().block_until_ready()
         handle = nvml.nvmlDeviceGetHandleByIndex(0)
         meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
         max_bytes_in_use = meminfo.used
         memory = Memory(max_bytes_in_use)
         # shutdown nvml
         nvml.nvmlShutdown()
     else:
         memory = None
     timeit.repeat("ref_step().block_until_ready()", repeat=1, number=2,globals=locals())
     if self.jit:
         runtimes = timeit.repeat("ref_step().block_until_ready()", repeat=self.repeat,number=3,globals=locals())
     else:
         with jax.disable_jit():
             runtimes = timeit.repeat("ref_step().block_until_ready()",repeat=self.repeat,number=3,globals=locals())
     return float(np.min(runtimes)/3.0), memory
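`Memory` (here and in Examples #24 and #28) and `bytes_to_mega_bytes` (Examples #13 and #23) are helpers from the surrounding benchmark code that the snippets do not show. Minimal stand-ins, stated as assumptions rather than the libraries' own definitions:

from collections import namedtuple

# Hypothetical stand-in: just records a peak byte count.
Memory = namedtuple("Memory", ["bytes"])

def bytes_to_mega_bytes(num_bytes):
    # Integer megabytes, assuming 1 MB = 2**20 bytes.
    return num_bytes >> 20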
Example #7
def getCUDAEnvironment():
    """ Get the CUDA runtime environment parameters (number of cards etc.). """

    rdict = dict()
    rdict['first_available_device_index'] = None
    rdict['device_count'] = 0

    try:
        nvml.nvmlInit()
        rdict['device_count'] = nvml.nvmlDeviceGetCount()

    except Exception:
        print(
            'WARNING: At least one of (py3nvml.nvml, CUDA) is not available. Will continue without GPU.'
        )
        return rdict

    for i in range(rdict['device_count']):
        memory_info = nvml.nvmlDeviceGetMemoryInfo(
            nvml.nvmlDeviceGetHandleByIndex(i))
        memory_usage_percentage = memory_info.used / memory_info.total

        if memory_usage_percentage <= 0.1:
            rdict['first_available_device_index'] = i
            break

    nvml.nvmlShutdown()

    return rdict
Example #8
    def collect(self) -> Generator:
        hashrate = GaugeMetricFamily('miner_hashrate',
                                     'Hashrate',
                                     labels=['gpu_id', 'type'])
        efficiency = GaugeMetricFamily('miner_efficiency',
                                       'Efficiency',
                                       labels=['gpu_id', 'type'])
        pool_shares = GaugeMetricFamily('miner_pool_shares',
                                        'Pool Shares',
                                        labels=['type'])
        uptime = GaugeMetricFamily('miner_uptime', 'Uptime', labels=['type'])

        data = BMinerCollector.query_miner(self.host, self.port)

        uptime.add_metric(['miner'], int(time.time()) - data['start_time'])
        uptime.add_metric(['connection'], 0)
        pool_shares.add_metric(['accepted'],
                               data['stratum']['accepted_shares'])
        pool_shares.add_metric(['rejected'],
                               data['stratum']['rejected_shares'])
        for key, gpu in data['miners'].items():
            # setting CUDA_DEVICE_ORDER=PCI_BUS_ID env var is a must otherwise cuda id and bminer id are different!
            gpu_id = nvml.nvmlDeviceGetUUID(
                nvml.nvmlDeviceGetHandleByIndex(int(key)))
            hashrate.add_metric([gpu_id, 'current'],
                                gpu['solver']['solution_rate'])
            efficiency.add_metric(
                [gpu_id, 'current'],
                round(gpu['solver']['solution_rate'] / gpu['device']['power'],
                      2))

        yield uptime
        yield hashrate
        yield efficiency
        yield pool_shares
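A hedged wiring sketch for the collector above: register it with Prometheus and serve metrics over HTTP. The `BMinerCollector` constructor arguments are assumptions based on the `self.host` / `self.port` attributes it reads, and `nvmlInit()` is assumed to be the caller's responsibility since `collect()` never initializes NVML itself:

import time
from prometheus_client import start_http_server
from prometheus_client.core import REGISTRY
from py3nvml import py3nvml as nvml

if __name__ == "__main__":
    nvml.nvmlInit()
    REGISTRY.register(BMinerCollector(host="127.0.0.1", port=4444))  # assumed signature
    start_http_server(9090)      # metrics exposed at :9090/metrics
    while True:
        time.sleep(60)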
Example #9
    def get_gpu_info_by_nvml(self) -> Optional[Dict]:
        """Get GPU info using nvml"""
        gpu_info_list = []
        driver_version = None
        try:
            nvmlInit()
            driver_version = nvmlSystemGetDriverVersion()
            deviceCount = nvmlDeviceGetCount()
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                info = nvmlDeviceGetMemoryInfo(handle)
                gpu_info = {}
                gpu_info["memory_total"] = info.total
                gpu_info["memory_available"] = info.free
                gpu_info["name"] = nvmlDeviceGetName(handle)
                gpu_info_list.append(gpu_info)
            nvmlShutdown()
        except NVMLError as error:
            if not self.silent:
                self.logger.error(
                    "Error fetching GPU information using nvml: %s", error)
            return None

        result = {"driver_version": driver_version, "devices": gpu_info_list}

        if 'CUDA_VISIBLE_DEVICES' in environ:
            result["cuda_visible"] = environ['CUDA_VISIBLE_DEVICES']
        return result
Example #10
def get_free_gpus():
    """ For an N gpu system, returns a list of N boolean values. The nth value
    will be True if no process was running on the nth gpu."""
    # Try connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except:
            continue

        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        if len(procs) == 0:
            gpu_free[i] = True

    return gpu_free
Example #11
def gpu_profile(frame, event, arg):
    # it is _about to_ execute (!)
    global last_tensor_sizes
    global lineno, func_name, filename, module_name

    if event == 'line':
        try:
            # about _previous_ line (!)
            if lineno is not None:
                py3nvml.nvmlInit()
                handle = py3nvml.nvmlDeviceGetHandleByIndex(
                    int(os.environ['GPU_DEBUG']))
                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name + ' ' + func_name + ':' + str(lineno)

                with open(gpu_profile_fn, 'a+') as f:
                    f.write(f"{where_str:<50}"
                            f":{meminfo.used/1024**2:<7.1f}Mb "
                            f"{line.rstrip()}\n")

                    if print_tensor_sizes is True:
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()),
                                             x.dbg_alloc_where)
                                            for x in get_tensors()}
                        for t, s, loc in new_tensor_sizes - last_tensor_sizes:
                            f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n')
                        for t, s, loc in last_tensor_sizes - new_tensor_sizes:
                            f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n')
                        last_tensor_sizes = new_tensor_sizes
                py3nvml.nvmlShutdown()

            # save details about line _to be_ executed
            lineno = None

            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            if (filename.endswith(".pyc") or filename.endswith(".pyo")):
                filename = filename[:-1]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            if 'gmwda-pytorch' not in os.path.dirname(
                    os.path.abspath(filename)):
                lineno = None  # skip current line evaluation

            if ('car_datasets' in filename or '_exec_config' in func_name
                    or 'gpu_profile' in module_name
                    or 'tee_stdout' in module_name):
                lineno = None  # skip current

            return gpu_profile

        except (KeyError, AttributeError) as e:
            print(e)

    return gpu_profile
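The function above is a Python trace callback, so it only does anything once it is installed with `sys.settrace`. A minimal sketch of the wiring it assumes, with the module-level state it reads and writes initialized up front (`get_tensors` is only needed when `print_tensor_sizes` is enabled):

import os
import sys

# Assumed module-level state used by gpu_profile.
lineno, func_name, filename, module_name = None, None, None, None
last_tensor_sizes = set()
gpu_profile_fn = "gpu_memory_profile.txt"
print_tensor_sizes = False

os.environ.setdefault("GPU_DEBUG", "0")   # NVML index of the GPU to watch
sys.settrace(gpu_profile)                 # profile every line executed from here on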
Example #12
 def read_top_card_memory_in_bytes():
     # pylint: disable=no-member
     # pylint incorrectly detects that function nvmlDeviceGetMemoryInfo returns str
     return self.__nvml_get_or_else(
         lambda: [
             nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(card_index)).total
             for card_index in range(nvmlDeviceGetCount())
         ],
         default=0)
Example #13
    def environment_info(self):
        if self._environment_info is None:
            info = {}
            info["transformers_version"] = version
            info["framework"] = self.framework
            if self.framework == "PyTorch":
                info["use_torchscript"] = self.args.torchscript
            if self.framework == "TensorFlow":
                info["eager_mode"] = self.args.eager_mode
                info["use_xla"] = self.args.use_xla
            info["framework_version"] = self.framework_version
            info["python_version"] = platform.python_version()
            info["system"] = platform.system()
            info["cpu"] = platform.processor()
            info["architecture"] = platform.architecture()[0]
            info["date"] = datetime.date(datetime.now())
            info["time"] = datetime.time(datetime.now())
            info["fp16"] = self.args.fp16
            info["use_multiprocessing"] = self.args.do_multi_processing
            info["only_pretrain_model"] = self.args.only_pretrain_model

            if is_psutil_available():
                info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
            else:
                logger.warning(
                    "Psutil not installed, we won't log available CPU memory. "
                    "Install psutil (pip install psutil) to log available CPU memory."
                )
                info["cpu_ram_mb"] = "N/A"

            info["use_gpu"] = self.args.is_gpu
            if self.args.is_gpu:
                info["num_gpus"] = 1  # TODO(PVP) Currently only single GPU is supported
                if is_py3nvml_available():
                    nvml.nvmlInit()
                    handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                    info["gpu"] = nvml.nvmlDeviceGetName(handle)
                    info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
                    info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                    info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                    nvml.nvmlShutdown()
                else:
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"

            info["use_tpu"] = self.args.is_tpu
            # TODO(PVP): See if we can add more information about TPU
            # see: https://github.com/pytorch/xla/issues/2180

            self._environment_info = info
        return self._environment_info
Example #14
def get_device_handles():
    """Get a list of NVML device handles, one per device.

    Can throw NVMLError.
    """
    return [
        pynvml.nvmlDeviceGetHandleByIndex(i)
        for i in range(pynvml.nvmlDeviceGetCount())
    ]
Example #15
def gpu_profile(frame, event, arg):
    global last_meminfo_used, last_tensor_sizes
    global lineno, func_name, filename, module_name

    if event == 'line':
        try:
            if lineno:
                py3nvml.nvmlInit()
                handle = py3nvml.nvmlDeviceGetHandleByIndex(
                    int(os.environ["GPU_DEBUG"]))
                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name + ' ' + func_name + ' ' + str(lineno)

                new_meminfo_used = meminfo.used
                mem_display = new_meminfo_used - last_meminfo_used if use_incremental else new_meminfo_used
                with open(gpu_profile_fn, "a+") as f:
                    f.write(f"{where_str:<50}"
                            f":{(mem_display) / 1024 ** 2:<7.1f}Mb "
                            f"{line.rstrip()}\n")

                    last_meminfo_used = new_meminfo_used
                    if print_tensor_sizes:
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()),
                                             x.dbg_alloc_where)
                                            for x in get_tensors()}

                        for t, s, loc in new_tensor_sizes - last_tensor_sizes:
                            f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n')

                        for t, s, loc in last_tensor_sizes - new_tensor_sizes:
                            f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n')

                        last_tensor_sizes = new_tensor_sizes
                py3nvml.nvmlShutdown()

            lineno = None

            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            if 'Beta' not in os.path.dirname(os.path.abspath(filename)):
                lineno = None

            return gpu_profile

        except (KeyError, AttributeError):
            pass

    return gpu_profile
Example #16
    def __init__(self, index: int):
        self.index = index
        self.handle = py3nvml.nvmlDeviceGetHandleByIndex(index)

        self.name = py3nvml.nvmlDeviceGetName(self.handle)

        self.memory = Memory(self.handle)
        self.utilization = Utilization(self.handle)
        self.processes = Processes(self.handle)

        self.update()
Example #17
    def __init__(self,
                 report=None,
                 devices=None,
                 quiet=False,
                 always_suffix=False,
                 output=print,
                 verbose_once=True):
        super().__init__()
        global nvml

        self.output = output

        if nvml is not None:
            try:
                nvml.nvmlInit()
            except (OSError, nvml.NVMLError_LibraryNotFound):
                # the python library might be installed, but not the drivers...
                nvml = None

        if nvml is None:
            if not quiet:
                self.output(
                    "Could not load py3nvml, cannot report any nvidia device statistics."
                )
            report = []
        else:
            device_count = nvml.nvmlDeviceGetCount()

            if devices is None:
                devices = list(range(device_count))
            else:
                devices = [
                    int(device) for device in devices
                    if 0 <= int(device) < device_count
                ]

            self.devices = devices
            self.deviceHandles = [
                nvml.nvmlDeviceGetHandleByIndex(device) for device in devices
            ]

            if not quiet:
                for n, handle in enumerate(self.deviceHandles):
                    self.output("Collecting statistics for device #% 2d: %s" %
                                (n, nvml.nvmlDeviceGetName(handle)))

        if report is None:
            report = ['temperature', 'utilization_gpu']
        elif report == 'all':
            report = list(self.reportable_values.keys())

        self.verbose_once = verbose_once
        self.report = report
        self.always_suffix = always_suffix
Example #18
 def gpus(self):
     out = []
     with nvml_manager():
         if not pynvml:
             return out
         cpu_to_node = self.cpu_nodes()
         n_devices = pynvml.nvmlDeviceGetCount()
         for i in range(n_devices):
             handle = pynvml.nvmlDeviceGetHandleByIndex(i)
             out.append(GPU(handle, cpu_to_node))
     return out
Example #19
def gpu_info():
    "Returns a list of (GPU ID, GPU Description, GPU % Utilization) tuples"
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    info = []
    for i in range(deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        util = nvmlDeviceGetUtilizationRates(handle)
        desc = nvmlDeviceGetName(handle)
        info.append((i, desc, util.gpu))
    return info
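A usage sketch (an assumption, not part of the original snippet): pick the least-utilized device and release NVML afterwards, since `gpu_info` initializes the library but never shuts it down:

from py3nvml.py3nvml import nvmlShutdown

devices = gpu_info()
best_id, best_name, best_util = min(devices, key=lambda d: d[2])
print(f"least busy GPU: #{best_id} {best_name} at {best_util}% utilization")
nvmlShutdown()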
Example #20
def test_nvidia_device(idx: int):
    from py3nvml import py3nvml as nvml

    handle = nvml.nvmlDeviceGetHandleByIndex(idx)

    pciInfo = nvml.nvmlDeviceGetPciInfo(handle)

    brands = {
        nvml.NVML_BRAND_UNKNOWN: "Unknown",
        nvml.NVML_BRAND_QUADRO: "Quadro",
        nvml.NVML_BRAND_TESLA: "Tesla",
        nvml.NVML_BRAND_NVS: "NVS",
        nvml.NVML_BRAND_GRID: "Grid",
        nvml.NVML_BRAND_GEFORCE: "GeForce"
    }

    inspect(
        idx=idx,
        # id=pciInfo.busId,
        # uuid=nvml.nvmlDeviceGetUUID(handle),
        name=nvml.nvmlDeviceGetName(handle),
        # brand=brands[nvml.nvmlDeviceGetBrand(handle)],
        # multi_gpu=nvml.nvmlDeviceGetMultiGpuBoard(handle),
        # pcie_link=nvml.nvmlDeviceGetCurrPcieLinkWidth(handle),
        fan=nvml.nvmlDeviceGetFanSpeed(handle),
        # power=nvml.nvmlDeviceGetPowerState(handle),
        mem_total=nvml.nvmlDeviceGetMemoryInfo(handle).total,
        mem_used=nvml.nvmlDeviceGetMemoryInfo(handle).used,
        util_gpu=nvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        # util_mem=nvml.nvmlDeviceGetUtilizationRates(handle).memory,
        temp=nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU),
        power=nvml.nvmlDeviceGetPowerUsage(handle),
        power_limit=nvml.nvmlDeviceGetPowerManagementLimit(handle),

        # display=nvml.nvmlDeviceGetDisplayMode(handle),
        display_active=nvml.nvmlDeviceGetDisplayActive(handle),
    )

    logger.log()

    procs = nvml.nvmlDeviceGetGraphicsRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid),
                pid=p.pid,
                mem=p.usedGpuMemory)

    procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid),
                pid=p.pid,
                mem=p.usedGpuMemory)

    logger.log()
Example #21
def get_gpu_info() -> Optional[List[Dict[str, Any]]]:
    from py3nvml.py3nvml import (
        NVMLError,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    try:
        nvmlInit()
        result = []
        device_count = nvmlDeviceGetCount()
        if not isinstance(device_count, int):
            return None

        for i in range(device_count):
            handle = nvmlDeviceGetHandleByIndex(i)
            info = nvmlDeviceGetMemoryInfo(handle)
            if isinstance(info, str):
                return None
            result.append({
                "id": i,
                "name": nvmlDeviceGetName(handle),
                "total": info.total,
                "free": info.free,
                "used": info.used,
            })
        nvmlShutdown()
        return result
    except NVMLError as error:
        print("Error fetching GPU information using nvml: %s", error)
        return None
Example #22
def get_gpu_info() -> Tuple[Optional[str], Optional[List[GpuInfo]]]:
    """
    Get driver version and list of ``GpuInfo``, if available.
    """
    try:
        nvml.nvmlInit()
    except nvml.NVMLError:
        # Not available.
        return None, None

    driver_version: str = nvml.nvmlSystemGetDriverVersion()
    gpus: List[GpuInfo] = []

    device_count: int = nvml.nvmlDeviceGetCount()
    for i in range(device_count):
        handle = nvml.nvmlDeviceGetHandleByIndex(i)
        name = try_get_info(nvml.nvmlDeviceGetName, handle)
        fan_speed = try_get_info(nvml.nvmlDeviceGetFanSpeed, handle, default=0)
        temp = try_get_info(
            lambda h: nvml.nvmlDeviceGetTemperature(h, nvml.NVML_TEMPERATURE_GPU),
            handle,
            default=0,
        )
        mem_info = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle)
        if mem_info:
            mem_used = mem_info.used >> 20
            mem_total = mem_info.total >> 20
        else:
            mem_used = 0
            mem_total = 0
        util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
        if util:
            gpu_util = util.gpu
        else:
            gpu_util = 0
        gpus.append(
            GpuInfo(
                id=i,
                name=name,
                mem_usage=mem_used,
                mem_capacity=mem_total,
                utilization=gpu_util,
                temp=temp,
                fan=fan_speed,
            ))

    nvml.nvmlShutdown()

    return driver_version, gpus
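`GpuInfo` is not shown in this snippet; a plausible definition that matches the keyword arguments used above (an assumption, not the original class):

from typing import NamedTuple

class GpuInfo(NamedTuple):
    id: int
    name: str
    mem_usage: int     # MiB currently in use
    mem_capacity: int  # MiB total
    utilization: int   # percent
    temp: int          # degrees Celsius
    fan: int           # fan speed, percent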
Example #23
    def environment_info(self):
        if self._environment_info is None:
            info = {}
            info["gluonnlp_version"] = gluonnlp.__version__
            info["framework_version"] = mxnet.__version__
            info["python_version"] = platform.python_version()
            info["system"] = platform.system()
            info["cpu"] = platform.processor()
            info["architecture"] = platform.architecture()[0]
            info["date"] = datetime.date(datetime.now())
            info["time"] = datetime.time(datetime.now())
            info["fp16"] = self._use_fp16

            if is_psutil_available():
                info["cpu_ram_mb"] = bytes_to_mega_bytes(
                    psutil.virtual_memory().total)
            else:
                logger.warning(
                    "Psutil not installed, we won't log available CPU memory."
                    "Install psutil (pip install psutil) to log available CPU memory."
                )
                info["cpu_ram_mb"] = "N/A"

            info["use_gpu"] = self._use_gpu
            if self._use_gpu:
                info["num_gpus"] = 1
                if is_py3nvml_available():
                    nvml.nvmlInit()
                    handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
                    info["gpu"] = nvml.nvmlDeviceGetName(handle)
                    info["gpu_ram_mb"] = bytes_to_mega_bytes(
                        nvml.nvmlDeviceGetMemoryInfo(handle).total)
                    info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                    info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                    nvml.nvmlShutdown()
                else:
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"
            self._environment_info = info
        return self._environment_info
Example #24
    def train_speed_memory(self, batch_size, seq_length):
        key = jax.random.PRNGKey(0)
        input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
        targets = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
        labels = jax.random.randint(key, (batch_size, seq_length), 0, 2)
        # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
        # targets = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
        # labels = np.random.randint(0,2, (batch_size, seq_length))
        @jax.jit
        def train_step():

            def loss_fn(params):
                token_mask = jnp.where(labels > 0, 1.0, 0.0).astype(self.dtype)
                logits = self.model(input_ids=input_ids, train=True, params=params, dropout_rng=jax.random.PRNGKey(0))[0]
                loss, normalizing_factor = cross_entropy(logits,targets, token_mask)
                jax.profiler.save_device_memory_profile(f"memory/{workload[0]}_{workload[1]}_memory.prof", "gpu")
                return loss / normalizing_factor
            if self.fp16 and jax.local_devices()[0].platform == 'gpu':
                grad_fn = self.dynamic_scale.value_and_grad(loss_fn)
                dyn_scale, is_fin, loss, grad = grad_fn(self.model.params)
            else:
                grad_fn = jax.value_and_grad(loss_fn)
                loss, grad = grad_fn(self.model.params)
            return tree_flatten(grad)[0]


        if jax.local_devices()[0].platform == 'gpu':
            nvml.nvmlInit()
            train_step()
            handle = nvml.nvmlDeviceGetHandleByIndex(0)
            meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
            max_bytes_in_use = meminfo.used
            memory = Memory(max_bytes_in_use)
            # shutdown nvml
            nvml.nvmlShutdown()
        else:
            memory = None
        # timeit.repeat(train_step,repeat=1,number=2)
        timeit.repeat("for i in train_step():i.block_until_ready()", repeat=1, number=2,globals=locals())
        if self.jit:
            # runtimes = timeit.repeat(train_step,repeat=self.repeat,number=3)
            runtimes = timeit.repeat("for i in train_step():i.block_until_ready()", repeat=self.repeat, number=3,globals=locals())
        else:
            with jax.disable_jit():
                # runtimes = timeit.repeat(train_step, repeat=self.repeat, number=3)
                runtimes = timeit.repeat("for i in train_step():i.block_until_ready()", repeat=self.repeat, number=3,globals=locals())


        return float(np.min(runtimes)/3.0), memory
Example #25
def run_gpu_mem_counter(do_shutdown=False):
    # Sum used memory for all GPUs
    if not torch.cuda.is_available(): return 0
    if do_shutdown:
        py3nvml.nvmlInit()
    devices = list(range(py3nvml.nvmlDeviceGetCount()))  # if gpus_to_trace is None else gpus_to_trace
    gpu_mem = 0
    for i in devices:
        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
        meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
        gpu_mem += meminfo.used
    if do_shutdown:
        py3nvml.nvmlShutdown()
    return gpu_mem
Example #26
    def __init__(self):
        self.labels = ['gpu', 'name', 'driver']
        self.driver = nv.nvmlSystemGetDriverVersion()

        self.n_gpu = nv.nvmlDeviceGetCount()
        self.hnds = [
            nv.nvmlDeviceGetHandleByIndex(i) for i in range(self.n_gpu)
        ]
        self.args = []
        for i, hnd in enumerate(self.hnds):
            args = OrderedDict()
            args['gpu'] = 'gpu%d' % i
            args['name'] = nv.nvmlDeviceGetName(hnd)
            args['driver'] = self.driver
            self.args.append(args)
Example #27
def get_free_gpus(max_procs=0):
    """
    Checks the number of processes running on your GPUs.

    Parameters
    ----------
    max_procs : int
        Maximum number of procs allowed to run on a gpu for it to be considered
        'available'

    Returns
    -------
    availabilities : list(bool)
        List of length N for an N-gpu system. The nth value will be true, if the
        nth gpu had at most max_procs processes running on it. Set to 0 to look
        for gpus with no procs on it.

    Note
    ----
    If the function can't query the driver, it returns an empty list rather than
    raising an exception.
    """
    # Try connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except:
            continue

        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        if len(procs) <= max_procs:
            gpu_free[i] = True

    py3nvml.nvmlShutdown()
    return gpu_free
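A short usage sketch, stated as an assumption: pin the process to the first idle device before a CUDA framework is imported. (py3nvml also ships a `grab_gpus` helper that automates this.)

import os

free = get_free_gpus(max_procs=0)
if True not in free:
    raise RuntimeError("no idle GPU found")
os.environ["CUDA_VISIBLE_DEVICES"] = str(free.index(True))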
Example #28
    def _measure_memory(self, func: Callable[[], None]) -> Tuple[Memory, MemorySummary]:
        try:
            if self.args.trace_memory_line_by_line:
                trace = start_memory_tracing("transformers")

            if self.args.is_tpu:
                # tpu
                raise NotImplementedError(
                    "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `--no-memory` or `args.memory=False`"
                )
            elif self.args.is_gpu:
                if not is_py3nvml_available():
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    memory = "N/A"
                else:
                    logger.info(
                        "Measuring total GPU usage on GPU device. Make sure to not have additional processes running on the same GPU."
                    )
                    # init nvml
                    nvml.nvmlInit()
                    func()
                    handle = nvml.nvmlDeviceGetHandleByIndex(
                        self.args.device_idx)
                    meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
                    max_bytes_in_use = meminfo.used
                    memory = Memory(max_bytes_in_use)
                    # shutdown nvml
                    nvml.nvmlShutdown()
            else:
                # cpu
                memory_bytes = measure_peak_memory_cpu(func)
                memory = Memory(memory_bytes) if isinstance(
                    memory_bytes, int) else memory_bytes

            if self.args.trace_memory_line_by_line:
                summary = stop_memory_tracing(trace)
            else:
                summary = None

            return memory, summary
        except RuntimeError as e:
            self.print_fn(f"Doesn't fit on GPU. {e}")
            return "N/A", None
Example #29
 def _get_current_power(self, arrange_next=True):
     # nvmlDeviceGetPowerUsage reports milliwatts; dividing by 1000 gives watts
     if self.gpu:
         num_gpus = nvmlDeviceGetCount()
         current_power = 0
         for i in range(num_gpus):
             h = nvmlDeviceGetHandleByIndex(i)
             power = try_get_info(nvmlDeviceGetPowerUsage, h, -1)
             current_power += power / 1000
         if arrange_next:
             self.schedule.enter(self.interval, 1, self._get_current_power)
         else:
             pass
     else:
         current_power = 0
     self.powers.append(current_power)
     return current_power
Example #30
def memory_status(msg="", reset_max=True, sync=True):

    rank = smp.rank()
    tp_rank = smp.tp_rank()
    pp_rank = smp.pp_rank()
    rdp_rank = smp.rdp_rank()
    local_rank = smp.local_rank()

    if sync:
        torch.cuda.synchronize()

    if rdp_rank != 0:
        return

    if py3nvml is not None:
        py3nvml.nvmlInit()
        handle = py3nvml.nvmlDeviceGetHandleByIndex(local_rank)
        info = py3nvml.nvmlDeviceGetMemoryInfo(handle)
        total_used = info.used / 1024**3
        total_used_str = f"Total GPU memory in use: {total_used:.4f} GB"
    else:
        total_used_str = ""

    alloced = torch.cuda.memory_allocated(device=local_rank)
    max_alloced = torch.cuda.max_memory_allocated(device=local_rank)
    cached = torch.cuda.memory_reserved(device=local_rank)
    max_cached = torch.cuda.max_memory_reserved(device=local_rank)

    # convert to GB for printing
    alloced /= 1024**3
    cached /= 1024**3
    max_alloced /= 1024**3
    max_cached /= 1024**3

    print(
        f'[{msg}] rank {rank} tp_rank {tp_rank} pp_rank {pp_rank} TORCH {torch.__version__}',
        f'device={local_rank} '
        f'alloc {alloced:0.4f} max_alloced {max_alloced:0.4f} '
        f'cache {cached:0.4f} max_cached {max_cached:0.4f} '
        f'{total_used_str}')
    if reset_max:
        torch.cuda.reset_max_memory_cached()
        torch.cuda.reset_max_memory_allocated()
    if py3nvml is not None:
        py3nvml.nvmlShutdown()