Example #1
def require_nvml(*args, **kwargs):
    try:
        from py3nvml import py3nvml
        py3nvml.nvmlInit()
        return True
    except Exception:
        return False
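A minimal usage sketch (the calling code below is hypothetical): on success require_nvml() leaves NVML initialised, so the caller can query devices and shut down afterwards.

if require_nvml():
    from py3nvml import py3nvml
    print("GPUs visible to NVML:", py3nvml.nvmlDeviceGetCount())
    py3nvml.nvmlShutdown()
else:
    print("NVML unavailable, falling back to CPU")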
Example #2
def get_free_gpus():
    """ For an N gpu system, returns a list of N boolean values. The nth value
    will be True if no process was running on the nth gpu."""
    # Try connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except Exception:
        str_ = "Couldn't connect to nvml drivers. Check they are installed correctly."
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except Exception:
            continue

        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        if len(procs) == 0:
            gpu_free[i] = True

    py3nvml.nvmlShutdown()
    return gpu_free
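A hedged usage sketch (the CUDA_VISIBLE_DEVICES handling is illustrative, not part of the example): pin the process to the first idle GPU, if any.

import os

free = get_free_gpus()
if any(free):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(free.index(True))  # first idle GPU
else:
    print("No idle GPU found")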
Example #3
def gpu_profile(frame, event, arg):
    # it is _about to_ execute (!)
    global last_tensor_sizes
    global lineno, func_name, filename, module_name

    if event == 'line':
        try:
            # about _previous_ line (!)
            if lineno is not None:
                py3nvml.nvmlInit()
                handle = py3nvml.nvmlDeviceGetHandleByIndex(
                    int(os.environ['GPU_DEBUG']))
                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name + ' ' + func_name + ':' + str(lineno)

                with open(gpu_profile_fn, 'a+') as f:
                    f.write(f"{where_str:<50}"
                            f":{meminfo.used/1024**2:<7.1f}Mb "
                            f"{line.rstrip()}\n")

                    if print_tensor_sizes is True:
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()),
                                             x.dbg_alloc_where)
                                            for x in get_tensors()}
                        for t, s, loc in new_tensor_sizes - last_tensor_sizes:
                            f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n')
                        for t, s, loc in last_tensor_sizes - new_tensor_sizes:
                            f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n')
                        last_tensor_sizes = new_tensor_sizes
                py3nvml.nvmlShutdown()

            # save details about line _to be_ executed
            lineno = None

            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            if (filename.endswith(".pyc") or filename.endswith(".pyo")):
                filename = filename[:-1]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            if 'gmwda-pytorch' not in os.path.dirname(
                    os.path.abspath(filename)):
                lineno = None  # skip current line evaluation

            if ('car_datasets' in filename or '_exec_config' in func_name
                    or 'gpu_profile' in module_name
                    or 'tee_stdout' in module_name):
                lineno = None  # skip current

            return gpu_profile

        except (KeyError, AttributeError) as e:
            print(e)

    return gpu_profile
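gpu_profile is shaped like a sys.settrace callback; a minimal sketch of wiring it up (assuming the module-level globals it relies on, such as lineno, gpu_profile_fn and print_tensor_sizes, are already defined):

import os
import sys

os.environ.setdefault('GPU_DEBUG', '0')  # index of the GPU to watch
sys.settrace(gpu_profile)                # log memory use for every traced line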
Example #4
    def measure_gpu_usage(self):
        from py3nvml.py3nvml import (NVMLError, nvmlDeviceGetCount,
                                     nvmlDeviceGetHandleByIndex,
                                     nvmlDeviceGetMemoryInfo,
                                     nvmlDeviceGetName, nvmlInit, nvmlShutdown)
        max_gpu_usage = []
        gpu_name = []
        try:
            nvmlInit()
            deviceCount = nvmlDeviceGetCount()
            max_gpu_usage = [0 for i in range(deviceCount)]
            gpu_name = [
                nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
                for i in range(deviceCount)
            ]
            while True:
                for i in range(deviceCount):
                    info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
                    max_gpu_usage[i] = max(max_gpu_usage[i], info.used / 1024**2)
                sleep(0.005)  # 5ms
                if not self.keep_measuring:
                    break
            nvmlShutdown()
            return [{
                "device_id": i,
                "name": gpu_name[i],
                "max_used_MB": max_gpu_usage[i]
            } for i in range(deviceCount)]
        except NVMLError as error:
            if not self.silent:
                self.logger.error(
                    "Error fetching GPU information using nvml: %s", error)
            return None
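Because measure_gpu_usage loops until self.keep_measuring is cleared, it is meant to run on a worker thread while the workload executes; a sketch of that pattern (monitor and run_workload are hypothetical names):

from concurrent.futures import ThreadPoolExecutor

monitor.keep_measuring = True        # monitor: an object exposing keep_measuring / measure_gpu_usage
with ThreadPoolExecutor() as executor:
    future = executor.submit(monitor.measure_gpu_usage)
    run_workload()                   # the code whose peak GPU memory we want to track
    monitor.keep_measuring = False
    print(future.result())           # [{'device_id': 0, 'name': ..., 'max_used_MB': ...}, ...]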
Example #5
    def inference_speed_memory(self, batch_size, seq_length):
        # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
        key = jax.random.PRNGKey(0)
        input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)

        @jax.jit
        def ref_step():
            out = self.model(input_ids=input_ids)
            return out[0]

        if jax.local_devices()[0].platform == 'gpu':
            nvml.nvmlInit()
            ref_step().block_until_ready()
            handle = nvml.nvmlDeviceGetHandleByIndex(0)
            meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
            max_bytes_in_use = meminfo.used
            memory = Memory(max_bytes_in_use)
            # shutdown nvml
            nvml.nvmlShutdown()
        else:
            memory = None

        timeit.repeat("ref_step().block_until_ready()", repeat=1, number=2, globals=locals())
        if self.jit:
            runtimes = timeit.repeat("ref_step().block_until_ready()", repeat=self.repeat, number=3, globals=locals())
        else:
            with jax.disable_jit():
                runtimes = timeit.repeat("ref_step().block_until_ready()", repeat=self.repeat, number=3, globals=locals())
        return float(np.min(runtimes) / 3.0), memory
Example #6
    def __init__(self):
        """Constructor."""
        nvml.nvmlInit()
        self._device_count = self.get_device_count()
        self._device_handlers = list()
        for i in range(self._device_count):
            self._device_handlers.append(nvml.nvmlDeviceGetHandleByIndex(i))
Example #7
    def get_gpu_info_by_nvml(self) -> Dict:
        """Get GPU info using nvml"""
        gpu_info_list = []
        driver_version = None
        try:
            nvmlInit()
            driver_version = nvmlSystemGetDriverVersion()
            deviceCount = nvmlDeviceGetCount()
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                info = nvmlDeviceGetMemoryInfo(handle)
                gpu_info = {}
                gpu_info["memory_total"] = info.total
                gpu_info["memory_available"] = info.free
                gpu_info["name"] = nvmlDeviceGetName(handle)
                gpu_info_list.append(gpu_info)
            nvmlShutdown()
        except NVMLError as error:
            if not self.silent:
                self.logger.error(
                    "Error fetching GPU information using nvml: %s", error)
            return None

        result = {"driver_version": driver_version, "devices": gpu_info_list}

        if 'CUDA_VISIBLE_DEVICES' in environ:
            result["cuda_visible"] = environ['CUDA_VISIBLE_DEVICES']
        return result
Example #8
def main():
    args = parse_args()
    pool_collector = None

    urllib3.disable_warnings()
    nvml.nvmlInit()
    atexit.register(nvml.nvmlShutdown)
    REGISTRY.register(NvidiaCollector())
    if args['pool'] is not None:
        pool_collector = pool_collectors()[args['pool'].lower()](
            args['pool_api_host'], args['pool_api_miner'])
        pool_collector.query_pool()
        REGISTRY.register(pool_collector)
    if args['miner'] is not None:
        REGISTRY.register(miner_collectors()[args['miner'].lower()](
            args['miner_api_host'], args['miner_api_port']))

    print('Starting exporter...')
    try:
        start_http_server(args['port'])
        while True:
            time.sleep(
                60)  # 1 query per minute so we don't reach API request limits
            if pool_collector is not None:
                pool_collector.query_pool()
    except KeyboardInterrupt:
        print('Exiting...')
        exit(0)
Example #9
def getCUDAEnvironment():
    """ Get the CUDA runtime environment parameters (number of cards etc.). """

    rdict = dict()
    rdict['first_available_device_index'] = None
    rdict['device_count'] = 0

    try:
        nvml.nvmlInit()
        rdict['device_count'] = nvml.nvmlDeviceGetCount()

    except Exception:
        print(
            'WARNING: At least one of (py3nvml.nvml, CUDA) is not available. Will continue without GPU.'
        )
        return rdict

    for i in range(rdict['device_count']):
        memory_info = nvml.nvmlDeviceGetMemoryInfo(
            nvml.nvmlDeviceGetHandleByIndex(i))
        memory_usage_percentage = memory_info.used / memory_info.total

        if memory_usage_percentage <= 0.1:
            rdict['first_available_device_index'] = i
            break

    nvml.nvmlShutdown()

    return rdict
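A short usage sketch (the CUDA_VISIBLE_DEVICES step is illustrative):

import os

env = getCUDAEnvironment()
if env['first_available_device_index'] is not None:
    os.environ['CUDA_VISIBLE_DEVICES'] = str(env['first_available_device_index'])
else:
    print("No mostly-idle GPU found; running on CPU")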
Example #10
    def environment_info(self):
        if self._environment_info is None:
            info = {}
            info["transformers_version"] = version
            info["framework"] = self.framework
            if self.framework == "PyTorch":
                info["use_torchscript"] = self.args.torchscript
            if self.framework == "TensorFlow":
                info["eager_mode"] = self.args.eager_mode
                info["use_xla"] = self.args.use_xla
            info["framework_version"] = self.framework_version
            info["python_version"] = platform.python_version()
            info["system"] = platform.system()
            info["cpu"] = platform.processor()
            info["architecture"] = platform.architecture()[0]
            info["date"] = datetime.date(datetime.now())
            info["time"] = datetime.time(datetime.now())
            info["fp16"] = self.args.fp16
            info["use_multiprocessing"] = self.args.do_multi_processing
            info["only_pretrain_model"] = self.args.only_pretrain_model

            if is_psutil_available():
                info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
            else:
                logger.warning(
                    "Psutil not installed, we won't log available CPU memory. "
                    "Install psutil (pip install psutil) to log available CPU memory."
                )
                info["cpu_ram_mb"] = "N/A"

            info["use_gpu"] = self.args.is_gpu
            if self.args.is_gpu:
                info["num_gpus"] = 1  # TODO(PVP) Currently only single GPU is supported
                if is_py3nvml_available():
                    nvml.nvmlInit()
                    handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                    info["gpu"] = nvml.nvmlDeviceGetName(handle)
                    info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
                    info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                    info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                    nvml.nvmlShutdown()
                else:
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"

            info["use_tpu"] = self.args.is_tpu
            # TODO(PVP): See if we can add more information about TPU
            # see: https://github.com/pytorch/xla/issues/2180

            self._environment_info = info
        return self._environment_info
Example #11
def gpu_profile(frame, event, arg):
    global last_meminfo_used, last_tensor_sizes
    global lineno, func_name, filename, module_name

    if event == 'line':
        try:
            if lineno:
                py3nvml.nvmlInit()
                handle = py3nvml.nvmlDeviceGetHandleByIndex(
                    int(os.environ["GPU_DEBUG"]))
                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name + ' ' + func_name + ' ' + str(lineno)

                new_meminfo_used = meminfo.used
                mem_display = new_meminfo_used - last_meminfo_used if use_incremental else new_meminfo_used
                with open(gpu_profile_fn, "a+") as f:
                    f.write(f"{where_str:<50}"
                            f":{(mem_display) / 1024 ** 2:<7.1f}Mb "
                            f"{line.rstrip()}\n")

                    last_meminfo_used = new_meminfo_used
                    if print_tensor_sizes:
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()),
                                             x.dbg_alloc_where)
                                            for x in get_tensors()}

                        for t, s, loc in new_tensor_sizes - last_tensor_sizes:
                            f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n')

                        for t, s, loc in last_tensor_sizes - new_tensor_sizes:
                            f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n')

                        last_tensor_sizes = new_tensor_sizes
                py3nvml.nvmlShutdown()

            lineno = None

            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            if 'Beta' not in os.path.dirname(os.path.abspath(filename)):
                lineno = None

            return gpu_profile

        except (KeyError, AttributeError):
            pass

    return gpu_profile
Example #12
    def __init__(self,
                 report=None,
                 devices=None,
                 quiet=False,
                 always_suffix=False,
                 output=print,
                 verbose_once=True):
        super().__init__()
        global nvml

        self.output = output

        if nvml is not None:
            try:
                nvml.nvmlInit()
            except (OSError, nvml.NVMLError_LibraryNotFound):
                # the python library might be installed, but not the drivers...
                nvml = None

        if nvml is None:
            if not quiet:
                self.output(
                    "Could not load py3nvml, cannot report any nvidia device statistics."
                )
            report = []
        else:
            device_count = nvml.nvmlDeviceGetCount()

            if devices is None:
                devices = list(range(device_count))
            else:
                devices = [
                    int(device) for device in devices
                    if 0 <= int(device) < device_count
                ]

            self.devices = devices
            self.deviceHandles = [
                nvml.nvmlDeviceGetHandleByIndex(device) for device in devices
            ]

            if not quiet:
                for n, handle in enumerate(self.deviceHandles):
                    self.output("Collecting statistics for device #% 2d: %s" %
                                (n, nvml.nvmlDeviceGetName(handle)))

        if report is None:
            report = ['temperature', 'utilization_gpu']
        elif report == 'all':
            report = list(self.reportable_values.keys())

        self.verbose_once = verbose_once
        self.report = report
        self.always_suffix = always_suffix
Example #13
    def __init__(self, index=None):
        self.no_gpu = False

        if not torch.cuda.is_available():
            print("GPU not found. GPU stats not available")
            self.no_gpu = True
            return

        nvmlInit()
        self._finalizer = weakref.finalize(self, nvmlShutdown)

        self.handle = _torch_gpu_index_to_nvml_handle(index)
Example #14
@contextlib.contextmanager  # needs: import contextlib
def nvml_manager():
    """Context manager to initialise and shut down NVML."""
    global pynvml
    if pynvml:
        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError as error:
            print('Warning:', error, file=sys.stderr)
            pynvml = None
    yield pynvml
    if pynvml:
        pynvml.nvmlShutdown()
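With the @contextlib.contextmanager decorator above, nvml_manager is used like any other context manager; a brief sketch:

with nvml_manager() as nv:
    if nv is not None:  # None when NVML could not be initialised
        print("GPUs:", nv.nvmlDeviceGetCount())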
Example #15
def check_has_nvml() -> bool:
    """
    Determines if libnvml is available on the system.
    Success is treated as an indicator that CUDA is available.

    :rtype: bool
    """
    try:
        nvmlInit()
        return True
    except NVMLError:
        return False
Example #16
    def __nvml_get_or_else(self, getter, default=None):
        try:
            nvmlInit()
            return getter()
        except NVMLError as e:
            timestamp = time.time()
            if timestamp - GPUMonitor.nvml_error_time > GPUMonitor.nvml_error_period:
                _logger.warning(
                    "NVMLError: %s - GPU usage metrics may not be reported.",
                    e)
                GPUMonitor.nvml_error_time = timestamp
            return default
Example #17
def gpu_info():
    "Returns a tuple of (GPU ID, GPU Description, GPU % Utilization)"
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    info = []
    for i in range(0, deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        util = nvmlDeviceGetUtilizationRates(handle)
        desc = nvmlDeviceGetName(handle)
        info.append((i, desc, util.gpu))
    return info
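A short usage sketch:

for gpu_id, desc, util in gpu_info():
    print(f"GPU {gpu_id} ({desc}): {util}% utilisation")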
Example #18
        def measure_gpu_usage(self) -> Optional[List[Dict[str, Any]]]:
            from py3nvml.py3nvml import (
                NVMLError,
                nvmlDeviceGetCount,
                nvmlDeviceGetHandleByIndex,
                nvmlDeviceGetMemoryInfo,
                nvmlDeviceGetName,
                nvmlInit,
                nvmlShutdown,
            )

            max_gpu_usage = []
            gpu_name = []
            try:
                nvmlInit()
                device_count = nvmlDeviceGetCount()
                if not isinstance(device_count, int):
                    logger.error(
                        f"nvmlDeviceGetCount result is not integer: {device_count}"
                    )
                    return None

                max_gpu_usage = [0 for i in range(device_count)]
                gpu_name = [
                    nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
                    for i in range(device_count)
                ]
                while True:
                    for i in range(device_count):
                        info = nvmlDeviceGetMemoryInfo(
                            nvmlDeviceGetHandleByIndex(i))
                        if isinstance(info, str):
                            logger.error(
                                f"nvmlDeviceGetMemoryInfo returns str: {info}")
                            return None
                        max_gpu_usage[i] = max(max_gpu_usage[i],
                                               info.used / 1024**2)
                    sleep(0.005)  # 5ms
                    if not self.keep_measuring:
                        break
                nvmlShutdown()
                return [{
                    "device_id": i,
                    "name": gpu_name[i],
                    "max_used_MB": max_gpu_usage[i],
                } for i in range(device_count)]
            except NVMLError as error:
                logger.error("Error fetching GPU information using nvml: %s",
                             error)
                return None
Example #19
    def environment_info(self):
        if self._environment_info is None:
            info = {}
            info["gluonnlp_version"] = gluonnlp.__version__
            info["framework_version"] = mxnet.__version__
            info["python_version"] = platform.python_version()
            info["system"] = platform.system()
            info["cpu"] = platform.processor()
            info["architecture"] = platform.architecture()[0]
            info["date"] = datetime.date(datetime.now())
            info["time"] = datetime.time(datetime.now())
            info["fp16"] = self._use_fp16

            if is_psutil_available():
                info["cpu_ram_mb"] = bytes_to_mega_bytes(
                    psutil.virtual_memory().total)
            else:
                logger.warning(
                    "Psutil not installed, we won't log available CPU memory."
                    "Install psutil (pip install psutil) to log available CPU memory."
                )
                info["cpu_ram_mb"] = "N/A"

            info["use_gpu"] = self._use_gpu
            if self._use_gpu:
                info["num_gpus"] = 1
                if is_py3nvml_available():
                    nvml.nvmlInit()
                    handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
                    info["gpu"] = nvml.nvmlDeviceGetName(handle)
                    info["gpu_ram_mb"] = bytes_to_mega_bytes(
                        nvml.nvmlDeviceGetMemoryInfo(handle).total)
                    info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
                    info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
                    nvml.nvmlShutdown()
                else:
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    info["gpu"] = "N/A"
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"
            self._environment_info = info
        return self._environment_info
Example #20
    def init_nvidia(self):
        """Init the NVIDIA API."""
        if import_error_tag:
            self.nvml_ready = False

        try:
            pynvml.nvmlInit()
            self.device_handles = get_device_handles()
            self.nvml_ready = True
        except Exception:
            logger.debug("pynvml could not be initialized.")
            self.nvml_ready = False

        return self.nvml_ready
Example #21
    def __init__(self, interval=20):
        try:
            nvmlInit()
            self.gpu = True
        except Exception:
            self.gpu = False
        self.interval = interval
        self.schedule = sched.scheduler(time.time, time.sleep)
        self.powers = []
        self.thread = Thread(target=self._get_power_period,
                             name="powermeter_thread")
        self.thread.daemon = True
        self.thread.start()
        self.sum = 0
Example #22
def get_gpu_info() -> Tuple[Optional[str], Optional[List[GpuInfo]]]:
    """
    Get driver version and list of ``GpuInfo``, if available.
    """
    try:
        nvml.nvmlInit()
    except nvml.NVMLError:
        # Not available.
        return None, None

    driver_version: str = nvml.nvmlSystemGetDriverVersion()
    gpus: List[GpuInfo] = []

    device_count: int = nvml.nvmlDeviceGetCount()
    for i in range(device_count):
        handle = nvml.nvmlDeviceGetHandleByIndex(i)
        name = try_get_info(nvml.nvmlDeviceGetName, handle)
        fan_speed = try_get_info(nvml.nvmlDeviceGetFanSpeed, handle, default=0)
        temp = try_get_info(
            lambda h: nvml.nvmlDeviceGetTemperature(h, nvml.NVML_TEMPERATURE_GPU),
            handle,
            default=0,
        )
        mem_info = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle)
        if mem_info:
            mem_used = mem_info.used >> 20
            mem_total = mem_info.total >> 20
        else:
            mem_used = 0
            mem_total = 0
        util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
        if util:
            gpu_util = util.gpu
        else:
            gpu_util = 0
        gpus.append(
            GpuInfo(
                id=i,
                name=name,
                mem_usage=mem_used,
                mem_capacity=mem_total,
                utilization=gpu_util,
                temp=temp,
                fan=fan_speed,
            ))

    nvml.nvmlShutdown()

    return driver_version, gpus
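A brief usage sketch, assuming GpuInfo exposes the fields it is constructed with above:

driver_version, gpus = get_gpu_info()
if gpus is None:
    print("NVML not available")
else:
    print("Driver:", driver_version)
    for gpu in gpus:
        print(f"  GPU {gpu.id} {gpu.name}: {gpu.mem_usage}/{gpu.mem_capacity} MiB, "
              f"{gpu.utilization}% util")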
Example #23
    def train_speed_memory(self, batch_size, seq_length):
        key = jax.random.PRNGKey(0)
        input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
        targets = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
        labels = jax.random.randint(key, (batch_size, seq_length), 0, 2)
        # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
        # targets = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
        # labels = np.random.randint(0,2, (batch_size, seq_length))
        @jax.jit
        def train_step():

            def loss_fn(params):
                token_mask = jnp.where(labels > 0, 1.0, 0.0).astype(self.dtype)
                logits = self.model(input_ids=input_ids, train=True, params=params, dropout_rng=jax.random.PRNGKey(0))[0]
                loss, normalizing_factor = cross_entropy(logits,targets, token_mask)
                jax.profiler.save_device_memory_profile(f"memory/{workload[0]}_{workload[1]}_memory.prof", "gpu")
                return loss / normalizing_factor
            if self.fp16 and jax.local_devices()[0].platform == 'gpu':
                grad_fn = self.dynamic_scale.value_and_grad(loss_fn)
                dyn_scale, is_fin, loss, grad = grad_fn(self.model.params)
            else:
                grad_fn = jax.value_and_grad(loss_fn)
                loss, grad = grad_fn(self.model.params)
            return tree_flatten(grad)[0]


        if jax.local_devices()[0].platform == 'gpu':
            nvml.nvmlInit()
            train_step()
            handle = nvml.nvmlDeviceGetHandleByIndex(0)
            meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
            max_bytes_in_use = meminfo.used
            memory = Memory(max_bytes_in_use)
            # shutdown nvml
            nvml.nvmlShutdown()
        else:
            memory = None
        # timeit.repeat(train_step,repeat=1,number=2)
        timeit.repeat("for i in train_step():i.block_until_ready()", repeat=1, number=2,globals=locals())
        if self.jit:
            # runtimes = timeit.repeat(train_step,repeat=self.repeat,number=3)
            runtimes = timeit.repeat("for i in train_step():i.block_until_ready()", repeat=self.repeat, number=3,globals=locals())
        else:
            with jax.disable_jit():
                # runtimes = timeit.repeat(train_step, repeat=self.repeat, number=3)
                runtimes = timeit.repeat("for i in train_step():i.block_until_ready()", repeat=self.repeat, number=3,globals=locals())


        return float(np.min(runtimes)/3.0), memory
Example #24
def run_gpu_mem_counter(do_shutdown=False):
    # Sum used memory for all GPUs
    if not torch.cuda.is_available(): return 0
    if do_shutdown:
        py3nvml.nvmlInit()
    devices = list(range(py3nvml.nvmlDeviceGetCount()))  # if gpus_to_trace is None else gpus_to_trace
    gpu_mem = 0
    for i in devices:
        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
        meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
        gpu_mem += meminfo.used
    if do_shutdown:
        py3nvml.nvmlShutdown()
    return gpu_mem
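A sketch of measuring the GPU-memory delta around an allocation (the tensor below is illustrative; note that caching allocators may hold more memory than the live tensors):

before = run_gpu_mem_counter(do_shutdown=True)
x = torch.empty(1024, 1024, device='cuda')    # some GPU work
after = run_gpu_mem_counter(do_shutdown=True)
print(f"GPU memory delta: {(after - before) / 1024**2:.1f} MiB")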
Example #25
def test_nvidia():
    # pip install py3nvml
    import py3nvml
    from py3nvml import py3nvml as nvml

    inspect(py3nvml.get_free_gpus())

    nvml.nvmlInit()
    inspect(version=nvml.nvmlSystemGetDriverVersion())
    inspect(count=nvml.nvmlDeviceGetCount())

    for i in range(nvml.nvmlDeviceGetCount()):
        test_nvidia_device(i)

    nvml.nvmlShutdown()
Example #26
    def run(self):

        #
        # Initialize nvml on the thread.
        #
        nvml.nvmlInit()

        t0 = time.time()
        while not self.shutdown:
            dt = int(time.time() - t0)

            mem_used, mem_total, gpu_util = query_gpu(self.gpu_index)
            self._monitor_callback(dt, mem_used, mem_total, gpu_util)

            time.sleep(self.sampling_period)
Example #27
    def _measure_memory(self, func: Callable[[],
                                             None]) -> [Memory, MemorySummary]:
        try:
            if self.args.trace_memory_line_by_line:
                trace = start_memory_tracing("transformers")

            if self.args.is_tpu:
                # tpu
                raise NotImplementedError(
                    "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `--no-memory` or `args.memory=False`"
                )
            elif self.args.is_gpu:
                if not is_py3nvml_available():
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
                    )
                    memory = "N/A"
                else:
                    logger.info(
                        "Measuring total GPU usage on GPU device. Make sure to not have additional processes running on the same GPU."
                    )
                    # init nvml
                    nvml.nvmlInit()
                    func()
                    handle = nvml.nvmlDeviceGetHandleByIndex(
                        self.args.device_idx)
                    meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
                    max_bytes_in_use = meminfo.used
                    memory = Memory(max_bytes_in_use)
                    # shutdown nvml
                    nvml.nvmlShutdown()
            else:
                # cpu
                memory_bytes = measure_peak_memory_cpu(func)
                memory = Memory(memory_bytes) if isinstance(
                    memory_bytes, int) else memory_bytes

            if self.args.trace_memory_line_by_line:
                summary = stop_memory_tracing(trace)
            else:
                summary = None

            return memory, summary
        except RuntimeError as e:
            self.print_fn(f"Doesn't fit on GPU. {e}")
            return "N/A", None
Example #28
def get_free_gpus(max_procs=0):
    """
    Checks the number of processes running on your GPUs.

    Parameters
    ----------
    max_procs : int
        Maximum number of procs allowed to run on a gpu for it to be considered
        'available'

    Returns
    -------
    availabilities : list(bool)
        List of length N for an N-gpu system. The nth value will be True if the
        nth gpu had at most max_procs processes running on it. Set max_procs to
        0 to look for gpus with no processes on them.

    Note
    ----
    If the function can't query the driver, it will return an empty list rather
    than raise an exception.
    """
    # Try connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except Exception:
        str_ = "Couldn't connect to nvml drivers. Check they are installed correctly."
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except Exception:
            continue

        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        if len(procs) <= max_procs:
            gpu_free[i] = True

    py3nvml.nvmlShutdown()
    return gpu_free
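A usage sketch highlighting the max_procs parameter: treat GPUs with at most one running compute process as available.

free = get_free_gpus(max_procs=1)
available = [i for i, ok in enumerate(free) if ok]
print("GPUs with at most one compute process:", available)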
Example #29
def set_affinity(rank, log_values=False):
    """
    Sets the CPU affinity for the current process to be local to its respective GPU.

    NOTE: `rank` is considered to be equivalent to the GPU index.

    Certain systems have complex hardware topologies (such as our systems in Cosmos and AVDC),
    and as such, there are certain CPU cores that have faster interconnects with certain
    GPU cards. This function will force linux to only schedule the current process to run on
    CPU cores that are fast for the associated GPU.

    Args:
        rank (int): The rank of the current process (and GPU index).
        log_values (bool): Optionally log the before and after values.
    """
    try:
        num_cpus = os.cpu_count()

        nvmlInit()

        rank_cpus = []
        # nvmlSystemGetTopologyGpuSet prints the number of GPUs each time it's
        # called, so this will suppress those prints
        with IgnorePrint():
            for i in range(num_cpus):
                for d in nvmlSystemGetTopologyGpuSet(i):
                    d_index = nvmlDeviceGetIndex(d)
                    if d_index == rank:
                        rank_cpus.append(i)
                        break

        process = psutil.Process()
        old_affinity = _dense_list_to_spans(process.cpu_affinity())

        process.cpu_affinity(rank_cpus)

        if log_values:
            new_affinity = _dense_list_to_spans(process.cpu_affinity())

            logger.info('Old CPU affinity: {}'.format(old_affinity))
            logger.info('New CPU affinity: {}'.format(new_affinity))

        nvmlShutdown()
    except Exception as e:
        logger.warning(
            "Failed to set the process affinity due to error: {}".format(e))
Example #30
def main():
    args = parse_args()

    urllib3.disable_warnings()
    nvml.nvmlInit()
    atexit.register(nvml.nvmlShutdown)
    REGISTRY.register(NvidiaCollector())

    print('Starting exporter...')
    try:
        start_http_server(args['port'])
        while True:
            time.sleep(
                60)  # 1 query per minute so we don't reach API request limits
    except KeyboardInterrupt:
        print('Exiting...')
        exit(0)