def init_all_devices():
    global DEVICES, DEVICE_INFO
    if DEVICES is not None:
        return DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    driver.init()
    DEVICES = []
    DEVICE_INFO = {}
    log("CUDA driver version=%s", driver.get_driver_version())
    ngpus = driver.Device.count()
    if ngpus==0:
        log.info("CUDA %s / PyCUDA %s, no devices found",
                 ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT)
        return DEVICES
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        devinfo = "gpu %i" % i
        try:
            device = driver.Device(i)
            devinfo = device_info(device)
            log(" + testing device %s: %s", i, devinfo)
            DEVICE_INFO[i] = devinfo
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)", devinfo)
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            try:
                log(" created context=%s", context)
                log(" api version=%s", context.get_api_version())
                free, total = driver.mem_get_info()
                log(" memory: free=%sMB, total=%sMB", int(free/1024/1024), int(total/1024/1024))
                log(" multi-processors: %s, clock rate: %s",
                    device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE))
                log(" max block sizes: (%s, %s, %s)",
                    device.get_attribute(da.MAX_BLOCK_DIM_X),
                    device.get_attribute(da.MAX_BLOCK_DIM_Y),
                    device.get_attribute(da.MAX_BLOCK_DIM_Z))
                log(" max grid sizes: (%s, %s, %s)",
                    device.get_attribute(da.MAX_GRID_DIM_X),
                    device.get_attribute(da.MAX_GRID_DIM_Y),
                    device.get_attribute(da.MAX_GRID_DIM_Z))
                max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
                max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
                log(" maximum texture size: %sx%s", max_width, max_height)
                log(" max pitch: %s", device.get_attribute(da.MAX_PITCH))
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor<<4) + SMminor
                log(" compute capability: %#x (%s.%s)", compute, SMmajor, SMminor)
                if i==0:
                    #we print the list info "header" from inside the loop
                    #so that the log output is bunched up together
                    log.info("CUDA %s / PyCUDA %s, found %s device%s:",
                             ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT,
                             ngpus, engs(ngpus))
                DEVICES.append(i)
                log.info(" + %s (memory: %s%% free, compute: %s.%s)",
                         device_info(device), 100*free/total, SMmajor, SMminor)
            finally:
                context.pop()
        except Exception as e:
            log.error("error on device %s: %s", devinfo, e)
    return DEVICES

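# Hedged usage sketch (not part of the original module): call init_all_devices()
# once, then read back the device ids it returns and the cached DEVICE_INFO
# entries. Assumes the module-level globals used by the function above.
devices = init_all_devices()
if not devices:
    print("no usable CUDA devices found")
else:
    for dev_id in devices:
        print("device %i: %s" % (dev_id, DEVICE_INFO.get(dev_id)))
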
def test_multi_context(self):
    if drv.get_version() < (2,0,0):
        return
    if drv.get_version() >= (2,2,0):
        if drv.Context.get_device().compute_mode == drv.compute_mode.EXCLUSIVE:
            return
    mem_a = drv.mem_alloc(50)
    ctx2 = drv.Context.get_device().make_context()
    mem_b = drv.mem_alloc(60)
    del mem_a
    del mem_b
    ctx2.detach()

def get_pycuda_info():
    init_all_devices()
    return {
        "version"               : pycuda.VERSION,
        "version.text"          : pycuda.VERSION_TEXT,
        "version.status"        : pycuda.VERSION_STATUS,
        "driver.version"        : driver.get_version(),
        "driver.driver_version" : driver.get_driver_version(),
    }

def print_gpu_info():
    print('CUDA version : %d.%d.%d' % cuda.get_version())
    ngpu = cuda.Device.count()
    gpu_list = [cuda.Device(i) for i in range(ngpu)]
    gpu_groups = {}
    for gpu in gpu_list:
        name = gpu.name()
        if name not in gpu_groups:
            gpu_groups[name] = {'count': 1}
            gpu_groups[name]['compute capability'] = gpu.compute_capability()
            gpu_groups[name]['global mem size'] = gpu.total_memory()
            gpu_groups[name]['multiprocessor'] = \
                gpu.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT)
        else:
            gpu_groups[name]['count'] += 1

    for name, props in gpu_groups.items():
        print('Device : %d GPU' % props['count'])
        print('  name: %s' % name)
        print('  compute capability: %d.%d' % props['compute capability'])
        print('  multiprocessor: %d' % props['multiprocessor'])
        print('  global mem size: %1.2f %s' %
              common.binary_prefix_nbytes(props['global mem size']))
        print('')

def get_info():
    return {
        "version": pycuda.VERSION,
        "version.text": pycuda.VERSION_TEXT,
        "version.status": pycuda.VERSION_STATUS,
        "driver.version": driver.get_version(),
        "driver.driver_version": driver.get_driver_version(),
    }

def get_cuda_info():
    init_all_devices()
    return {
        "driver" : {
            "version"        : driver.get_version(),
            "driver_version" : driver.get_driver_version(),
        }
    }

def test_register_host_memory(self):
    if drv.get_version() < (4,):
        from py.test import skip
        skip("register_host_memory only exists on CUDA 4.0 and later")
    import sys
    if sys.platform == "darwin":
        from py.test import skip
        skip("register_host_memory is not supported on OS X")
    a = drv.aligned_empty((2**20,), np.float64, alignment=4096)
    drv.register_host_memory(a)

def __init__(self, device=0, iterations=7, compiler_options=None):
    """instantiate CudaFunctions object used for interacting with the CUDA device

    Instantiating this object will inspect and store certain device properties at
    runtime, which are used during compilation and/or execution of kernels by the
    kernel tuner. It also maintains a reference to the most recently compiled
    source module for copying data to constant memory before kernel launch.

    :param device: Number of CUDA device to use for this context
    :type device: int

    :param iterations: Number of iterations used while benchmarking a kernel, 7 by default.
    :type iterations: int
    """
    self.allocations = []
    self.texrefs = []
    if not drv:
        raise ImportError("Error: pycuda not installed, please install e.g. using 'pip install pycuda'.")
    drv.init()
    self.context = drv.Device(device).make_context()

    #inspect device properties
    devprops = {str(k): v for (k, v) in self.context.get_device().get_attributes().items()}
    self.max_threads = devprops['MAX_THREADS_PER_BLOCK']
    cc = str(devprops.get('COMPUTE_CAPABILITY_MAJOR', '0')) + str(devprops.get('COMPUTE_CAPABILITY_MINOR', '0'))
    if cc == "00":
        cc = self.context.get_device().compute_capability()
    self.cc = str(cc[0]) + str(cc[1])
    self.iterations = iterations
    self.current_module = None
    self.compiler_options = compiler_options or []

    #select PyCUDA source module
    if int(self.cc) >= 35:
        self.source_mod = DynamicSourceModule
    else:
        self.source_mod = SourceModule
    if not self.source_mod:
        raise ImportError("Error: pycuda not correctly installed, please ensure pycuda is installed on the same CUDA installation as you're using right now")

    #collect environment information
    env = dict()
    env["device_name"] = self.context.get_device().name()
    env["cuda_version"] = ".".join([str(i) for i in drv.get_version()])
    env["compute_capability"] = self.cc
    env["iterations"] = self.iterations
    env["compiler_options"] = compiler_options
    env["device_properties"] = devprops
    self.env = env
    self.name = env["device_name"]

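# Hedged usage sketch (not from the original source): assumes this constructor
# belongs to the CudaFunctions class named in its docstring, that CUDA device 0
# is present, and that the compiler option shown is purely illustrative. Since
# __init__ pushes a context onto the stack, the sketch pops it when done.
dev = CudaFunctions(device=0, iterations=7, compiler_options=["-O3"])
print(dev.name)                    # device name collected into env
print(dev.env["cuda_version"])     # CUDA toolkit version from drv.get_version()
dev.context.pop()                  # release the context pushed by __init__
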
def test_register_host_memory(self):
    if drv.get_version() < (4,):
        from py.test import skip
        skip("register_host_memory only exists on CUDA 4.0 and later")
    import sys
    if sys.platform == "darwin":
        from py.test import skip
        skip("register_host_memory is not supported on OS X")
    a = drv.aligned_empty((2**20,), np.float64)
    a_pin = drv.register_host_memory(a)
    gpu_ary = drv.mem_alloc_like(a)
    stream = drv.Stream()
    drv.memcpy_htod_async(gpu_ary, a_pin, stream)
    drv.Context.synchronize()

def init_all_devices():
    global DEVICES
    if DEVICES is not None:
        return DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    driver.init()
    DEVICES = []
    log("CUDA driver version=%s", driver.get_driver_version())
    log.info("PyCUDA version=%s", pycuda.VERSION_TEXT)
    ngpus = driver.Device.count()
    log.info("CUDA version=%s found %s device(s):",
             ".".join([str(x) for x in driver.get_version()]), ngpus)
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        try:
            device = driver.Device(i)
            log(" + testing device %s: %s", i, device_info(device))
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)", device_info(device))
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            log(" created context=%s", context)
            log(" api version=%s", context.get_api_version())
            free, total = driver.mem_get_info()
            log(" memory: free=%sMB, total=%sMB", int(free/1024/1024), int(total/1024/1024))
            log(" multi-processors: %s, clock rate: %s",
                device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE))
            log(" max block sizes: (%s, %s, %s)",
                device.get_attribute(da.MAX_BLOCK_DIM_X),
                device.get_attribute(da.MAX_BLOCK_DIM_Y),
                device.get_attribute(da.MAX_BLOCK_DIM_Z))
            log(" max grid sizes: (%s, %s, %s)",
                device.get_attribute(da.MAX_GRID_DIM_X),
                device.get_attribute(da.MAX_GRID_DIM_Y),
                device.get_attribute(da.MAX_GRID_DIM_Z))
            max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
            max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
            log(" maximum texture size: %sx%s", max_width, max_height)
            log(" max pitch: %s", device.get_attribute(da.MAX_PITCH))
            SMmajor, SMminor = device.compute_capability()
            compute = (SMmajor<<4) + SMminor
            log(" compute capability: %#x (%s.%s)", compute, SMmajor, SMminor)
            try:
                DEVICES.append(i)
                log.info(" + %s (memory %s%% free, compute %#x)",
                         device_info(device), 100*free/total, compute)
            finally:
                context.pop()
        except Exception as e:
            log.error("error on device %s: %s", (device or i), e)

def gpu_stat():
    if torch.cuda.is_available():
        def pretty_bytes(bytes, precision=1):
            abbrevs = ((1<<50, 'PB'), (1<<40, 'TB'), (1<<30, 'GB'), (1<<20, 'MB'), (1<<10, 'kB'), (1, 'bytes'))
            if bytes == 1:
                return '1 byte'
            for factor, suffix in abbrevs:
                if bytes >= factor:
                    break
            return '%.*f%s' % (precision, bytes / factor, suffix)

        device = autoinit.device
        print()
        print('GPU Name: %s' % device.name())
        print('GPU Memory: %s' % pretty_bytes(device.total_memory()))
        print('CUDA Version: %s' % str(driver.get_version()))
        print('GPU Free/Total Memory: %d%%' % ((driver.mem_get_info()[0] / driver.mem_get_info()[1]) * 100))

def init_gl(self, width, height):
    super(DenseDemo, self).init_gl(width, height)
    import pycuda.gl.autoinit
    print("CUDA version: %s" % str(drv.get_version()))
    print("CUDA driver version: %s" % drv.get_driver_version())
    print("CUDA device: %s" % pycuda.gl.autoinit.device.name())
    print("\tCompute capability: %s" % str(pycuda.gl.autoinit.device.compute_capability()))
    print("\tTotal memory: %s" % pycuda.gl.autoinit.device.total_memory())

    self.ffusion = FreenectFusion(kc.K_ir, kc.K_rgb, kc.T, side=128)
    self.bbox = self.ffusion.get_bounding_box()
    #freenect.sync_set_led(2)

    # Create a texture.
    self.gl_rgb_texture = gl.glGenTextures(1)
    gl.glBindTexture(gl.GL_TEXTURE_2D, self.gl_rgb_texture)
    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)

def __init__(self, blocking=False, use_cache=True):
    self.blocking = blocking
    self.use_cache = use_cache
    self.logger = logging.getLogger(__name__)
    self.kernels = {}
    self.module_path = os.path.dirname(os.path.realpath(__file__))

    #Initialize cuda (must be first call to PyCUDA)
    cuda.init(flags=0)
    self.logger.info("PyCUDA version %s", str(pycuda.VERSION_TEXT))

    #Print some info about CUDA
    self.logger.info("CUDA version %s", str(cuda.get_version()))
    self.logger.info("Driver version %s", str(cuda.get_driver_version()))

    self.cuda_device = cuda.Device(0)
    self.logger.info("Using '%s' GPU", self.cuda_device.name())
    self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
    self.logger.debug(" => memory: %d MB", self.cuda_device.total_memory() / (1024*1024))

    # Create the CUDA context
    if (self.blocking):
        self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC)
        self.logger.warning("Using blocking context")
    else:
        self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO)
    self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))

    #Create cache dir for cubin files
    if (self.use_cache):
        self.cache_path = os.path.join(self.module_path, "cuda_cache")
        if not os.path.isdir(self.cache_path):
            os.mkdir(self.cache_path)
        self.logger.debug("Using CUDA cache dir %s", self.cache_path)

} } """) func = mod.get_function("double_array") func(struct_arr, block=(32, 1, 1), grid=(2, 1)) print("doubled arrays") print(array1) print(array2) func(numpy.uintp(do2_ptr), block=(32, 1, 1), grid=(1, 1)) print("doubled second only") print(array1) print(array2) if cuda.get_version() < (4, ): func.prepare("P", block=(32, 1, 1)) func.prepared_call((2, 1), struct_arr) else: func.prepare("P") block = (32, 1, 1) func.prepared_call((2, 1), block, struct_arr) print("doubled again") print(array1) print(array2) if cuda.get_version() < (4, ): func.prepared_call((1, 1), do2_ptr) else:
#
# Example based on dnorm from RCUDA
# Timing code from http://wiki.tiker.net/PyCuda/Examples/SimpleSpeedTest
#
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
import scipy as sp
from scipy.stats import norm
from pycuda.compiler import SourceModule

# Versions:
drv.get_version()
drv.get_driver_version()

m = SourceModule("""
#include <stdio.h>
__global__ void dnorm_kernel(float *vals, float *x, int N, float mu, float sigma, int dbg)
{
    int myblock = blockIdx.x;    // 1D-grid
    int blocksize = blockDim.x;  // 1D-block
    int subthread = threadIdx.x;
    int idx = myblock * blocksize + subthread;
    if (idx < N) {
        if (dbg){
            printf("thread idx: %04d\\t x[%d] = %f\\t (n=%d,mu=%f,sigma=%f)\\n",idx,idx,x[idx],N,mu,sigma);
        }
        float std = (x[idx] - mu)/sigma;
        float e = exp( - 0.5 * std * std);
        vals[idx] = e / ( sigma * sqrt(2 * 3.141592653589793));

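# The snippet above is truncated before the launch code. The following is a
# hedged sketch (not from the original script) of how dnorm_kernel could be
# invoked and checked against scipy.stats.norm, assuming the SourceModule
# string is eventually closed and compiles into `m`; sizes and launch
# parameters are illustrative only.
N = 1024
x = np.random.randn(N).astype(np.float32)
vals = np.zeros_like(x)
dnorm = m.get_function("dnorm_kernel")
dnorm(drv.Out(vals), drv.In(x), np.int32(N),
      np.float32(0.0), np.float32(1.0), np.int32(0),
      block=(256, 1, 1), grid=((N + 255) // 256, 1))
# Compare against scipy's reference density.
assert np.allclose(vals, norm.pdf(x), atol=1e-5)
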
#!/usr/bin/env python
from __future__ import division, print_function, absolute_import

import functools

import numpy as np

import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv

from cuda_cffi.misc import init

toolkit_version = drv.get_version()

if toolkit_version < (7, 0, 0):
    raise ImportError("cuSOLVER not present prior to v7.0 of the CUDA toolkit")

"""
Python interface to cuSOLVER functions.

Note: You may need to set the environment variable CUDA_ROOT to the base of
your CUDA installation.
"""

# import low level cuSOLVER python wrappers and constants
try:
    from cuda_cffi._cusolver_cffi import *
except Exception as e:
    print(repr(e))
    estr = "autogeneration and import of cuSOLVER wrappers failed\n"
    estr += ("Try setting the CUDA_ROOT environment variable to the base of "
             "your CUDA installation. The autogeneration script tries to "
             "find the CUSOLVER headers in CUDA_ROOT/include/\n")

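# Hedged sketch of the fix the note above suggests (the path is illustrative
# only): point CUDA_ROOT at the CUDA installation before this module is
# imported, so the wrapper autogeneration can find the cuSOLVER headers.
import os
os.environ.setdefault("CUDA_ROOT", "/usr/local/cuda")
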
#from brian.experimental.ccodegen import AutoCompiledNonlinearStateUpdater
set_global_preferences(usecodegen=False)

#duration = 10*second
#N = 1000
#domonitor = False

duration = 1000 * ms
N = 100
domonitor = False
showfinal = False
forcesync = True
method = 'gpu' # methods are 'c', 'python' and 'gpu'

if drv.get_version() == (2, 0, 0): # cuda version
    precision = 'float'
elif drv.get_version() > (2, 0, 0):
    precision = 'double'
else:
    raise Exception("CUDA 2.0 required")
#precision = 'float'

import buffering
buffering.DEBUG_BUFFER_CACHE = False

# eqs = Equations('''
# #dV/dt = -V*V/(10*ms) : 1
# dV/dt = cos(2*pi*t/(100*ms))/(10*ms) : 1
# #dV/dt = -V*V*V*V*V/(100*ms) : 1
# #dW/dt = -W*W*W*W*W/(100*ms) : 1
# #dV/dt = cos(2*pi*t/(100*ms))/(10*ms) : 1

def log_sys_info():
    log.info("PyCUDA version=%s", ".".join([str(x) for x in driver.get_version()]))
    log.info("PyCUDA driver version=%s", driver.get_driver_version())