def calcV(I_shape, I_cu, V_cu):
    # Ifull = I
    Ci = I_shape[0]
    iH = I_shape[1]
    iW = I_shape[2]
    N = I_shape[3]
    tiles = iW // 4
    oH = iH
    oW = iW
    padH = 1
    padW = 1

    # adapted from winograd_conv.py
    # if N == 1:
    #     shlN = 0
    # elif N < 32:
    #     shlN = len(bin(N-1))-2
    # else:
    #     shlN = 5
    shlN = 5

    shlY, shlX, maskY, shrY, maskX, shrX, maskN, supY, supX = {
        0: (4, 5, 0x18, 3, 0x07, 0, 0x00, 0x203, 0x300),  # 4x8 yyxxx
        1: (4, 4, 0x18, 3, 0x06, 1, 0x01, 0x203, 0x201),  # 4x4 yyxxn
        2: (3, 4, 0x10, 4, 0x0c, 2, 0x03, 0x104, 0x202),  # 2x4 yxxnn
        3: (2, 4, 0x00, 0, 0x18, 3, 0x07, 0x000, 0x203),  # 1x4 xxnnn
        4: (2, 3, 0x00, 0, 0x10, 4, 0x0f, 0x000, 0x104),  # 1x2 xnnnn
        5: (2, 2, 0x00, 0, 0x00, 0, 0x1f, 0x000, 0x000),  # 1x1 nnnnn
    }.get(shlN)

    GYS = ceil_div(oH, 1 << shlY)
    GXS = ceil_div(oW, 1 << shlX)
    GN = ceil_div(N, 1 << shlN)
    # GK = ceil_div(Co, 32)
    GYS2 = GYS // 2
    GXS2 = GXS * 2
    div_GXS2 = get_div_mul_shift_32(GXS * GYS, GXS2)
    div_GXS = get_div_mul_shift_32(GXS * GYS, GXS)
    image_size = 1152 * Ci * GXS * GYS * GN

    print('div_GXS', div_GXS)
    print('GYS', GYS, 'GXS', GXS, 'GN', GN, 'Ci', Ci, 'GY_GX', GXS * GYS)

    grid = (GN, GYS * GXS, Ci)
    block = (32, 1, 1)
    call_cu_kernel(
        k_calcV, grid, block,
        V_cu, I_cu,
        iH, iW, N, padH, padW,
        GXS, GYS2, GXS2,
        div_GXS2[0], div_GXS2[1],
        div_GXS[0], div_GXS[1],
        shlY, shlX, maskY, shrY, maskX, shrX, shlN, maskN,
        iH * iW * N, iW * N,
        GYS * GXS * Ci * 1152, GXS * Ci * 1152, Ci * 1152,
        GXS, GXS * GYS, GN, Ci)
    Context.synchronize()
    timecheck('calced V_cu')
def gpu_filter(pixels, width, height):
    size = np.array([width, height])
    filtered = np.zeros_like(pixels)
    grid_dim = (width // BLOCK_SIZE, height // BLOCK_SIZE)
    median_filter(In(pixels), Out(filtered), In(size),
                  block=BLOCK, grid=grid_dim)
    Context.synchronize()
    return filtered
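# Hedged usage sketch (not part of the original source): assuming the module
# above also defines BLOCK_SIZE, BLOCK and the compiled median_filter kernel,
# and that image dimensions are multiples of BLOCK_SIZE, a caller might filter
# a random greyscale image like this. Names and dtypes are illustrative only.
def _demo_gpu_filter():
    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- create a context if none exists
    width = height = 512
    pixels = np.random.rand(height, width).astype(np.float32)
    filtered = gpu_filter(pixels, width, height)
    return filtered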
def init_the_device_if_needed(do_it_anyway=False):
    if do_it_anyway:
        print('import pycuda.autoinit')
        import pycuda.autoinit
        return
    try:
        Context.get_device()
    except:
        # Presumably, the line above failed because of something like that:
        #   "LogicError: cuCtxGetDevice failed: not initialized"
        # -- initialize the device
        print('import pycuda.autoinit')
        import pycuda.autoinit
def mem_alloc(nbytes):
    """Allocates device memory of given size from memory pool.

    This function chooses memory pool corresponding to the current device.

    Args:
        nbytes (int): The size of memory in bytes.

    Returns:
        pycuda.tools.PooledDeviceAllocation: Allocated memory with
            additional ``device`` attribute. This attribute is used to
            determine on which GPU the memory resides.

    """
    global _pools

    device = Context.get_device()
    pool = _pools.get(device, None)
    if pool is None:
        pool = drv.DeviceMemoryPool()
        _pools[device] = pool

    allocation = pool.allocate(nbytes)
    setattr(allocation, 'device', device)
    return allocation
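# Hedged usage sketch (not part of the original source): assuming init()
# defined further below has been called so that the per-device pool
# dictionary exists, a pooled allocation can be used like a regular
# device pointer.
def _demo_mem_alloc():
    import numpy as np
    import pycuda.driver as drv
    init()                         # sets up the context and memory pools
    x = np.arange(16, dtype=np.float32)
    buf = mem_alloc(x.nbytes)      # pooled allocation with a .device attribute
    drv.memcpy_htod(buf, x)        # copy the host array onto that device
    return buf.device              # the Device the memory resides on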
def get_device(arg=None):
    """Gets the device from ID ``arg`` or from chainer's
    :class:`~pycuda.gpuarray.GPUArray`.

    Args:
        arg: Value to specify a GPU device.

    Returns:
        Device object specified by given ``arg``.

        The rule of device selection is as follows.

        ==================================== =====================================
        Type of ``arg``                      Return value
        ==================================== =====================================
        ``None``                             Current device
        ``int``                              Device of ID ``arg``
        :class:`~pycuda.driver.Device`       ``arg``
        :class:`~pycuda.gpuarray.GPUArray`   Device given array was allocated on
        :class:`~numpy.ndarray`              ``None``
        ==================================== =====================================

    """
    if arg is None:
        return Context.get_device()
    elif isinstance(arg, Device):
        return arg
    elif isinstance(arg, numpy.ndarray):
        return None
    elif isinstance(arg, GPUArray):
        while not hasattr(arg.gpudata, 'device'):
            arg = arg.base
        return arg.gpudata.device
    return drv.Device(arg)
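# Hedged usage sketch (not part of the original source): illustrating the
# dispatch table in the docstring above, assuming init() has already been
# called so a current context exists.
def _demo_get_device():
    import numpy
    print(get_device())                  # None     -> current device
    print(get_device(0))                 # int      -> Device of ID 0
    print(get_device(numpy.zeros(3)))    # ndarray  -> None (lives on the CPU)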
def compile(source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], target="cubin"): assert target in ["cubin", "ptx", "fatbin"] if not no_extern_c: source = 'extern "C" {\n%s\n}\n' % source if options is None: options = DEFAULT_NVCC_FLAGS options = options[:] if arch is None: from pycuda.driver import Error try: from pycuda.driver import Context arch = "sm_%d%d" % Context.get_device().compute_capability() except Error: pass from pycuda.driver import CUDA_DEBUGGING if CUDA_DEBUGGING: cache_dir = False keep = True options.extend(["-g", "-G"]) if cache_dir is None: from os.path import join import appdirs cache_dir = os.path.join(appdirs.user_cache_dir("pycuda", "pycuda"), "compiler-cache-v1") from os import makedirs try: makedirs(cache_dir) except OSError as e: from errno import EEXIST if e.errno != EEXIST: raise if arch is not None: options.extend(["-arch", arch]) if code is not None: options.extend(["-code", code]) if 'darwin' in sys.platform and sys.maxint == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 2147483647: options.append('-m32') include_dirs = include_dirs + [_find_pycuda_include_path()] for i in include_dirs: options.append("-I"+i) return compile_plain(source, options, keep, nvcc, cache_dir, target)
def compile(source, nvcc="nvcc", options=[], keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]): if not no_extern_c: source = 'extern "C" {\n%s\n}\n' % source options = options[:] if arch is None: try: from pycuda.driver import Context arch = "sm_%d%d" % Context.get_device().compute_capability() except RuntimeError: pass from pycuda.driver import CUDA_DEBUGGING if CUDA_DEBUGGING: cache_dir = False keep = True options.extend(["-g", "-G"]) if cache_dir is None: from os.path import join from tempfile import gettempdir cache_dir = join(gettempdir(), "pycuda-compiler-cache-v1-%s" % _get_per_user_string()) from os import mkdir try: mkdir(cache_dir) except OSError, e: from errno import EEXIST if e.errno != EEXIST: raise
def __init__(self, nvcc='nvcc', link_options=None, keep=False,
             no_extern_c=False, arch=None, code=None, cache_dir=None,
             include_dirs=[], message_handler=None, log_verbose=False,
             cuda_libdir=None):
    from pycuda.driver import Context
    compute_capability = Context.get_device().compute_capability()
    if compute_capability < (3, 5):
        raise Exception(
            'Minimum compute capability for dynamic parallelism is 3.5 '
            '(found: %u.%u)!' %
            (compute_capability[0], compute_capability[1]))
    else:
        from pycuda.driver import Linker
        self.linker = Linker(message_handler, link_options, log_verbose)
    self._check_arch(arch)
    self.nvcc = nvcc
    self.keep = keep
    self.no_extern_c = no_extern_c
    self.arch = arch
    self.code = code
    self.cache_dir = cache_dir
    self.include_dirs = include_dirs
    self.cuda_libdir = cuda_libdir
    self.libdir, self.libptn = None, None
    self.module = None
def calcO(O_cu, M_shape, M_cu):
    GK = M_shape[2]
    GN = M_shape[0]
    tiles = M_shape[4]
    num_xinu_tiles = GK * 32 * GN * 32 * tiles * tiles

    grid = (ceil_div(num_xinu_tiles, 32), 1, 1)
    block = (32, 1, 1)
    call_cu_kernel(
        k_calcO, grid, block,
        O_cu, M_cu,
        num_xinu_tiles)
    Context.synchronize()
    timecheck('calced O_cu')
def init(device=None):
    """Initializes CUDA global state.

    Chainer maintains CUDA context, CUBLAS context, random number generator
    and device memory pool for each GPU device and for each process (the main
    process or a process forked by :mod:`multiprocessing`) as global states.
    When called for the first time on the process, this function initializes
    these global states.

    .. warning::

       This function also initializes PyCUDA and scikits.cuda. Since these
       packages do not support forking after initialization, do not call this
       function before forking the process.

    This function also registers :func:`shutdown` to the :mod:`atexit` slot.

    It also initializes the random number generator. A user can set a fixed
    seed with the ``CHAINER_SEED`` environment variable.

    Args:
        device (``int`` or :class:`~pycuda.driver.Device` or ``None``):
            Device ID to initialize on.

    """
    global _contexts, _cublas_handles, _generators, _pid, _pools

    if not available:
        global _import_error
        raise RuntimeError(
            'CUDA environment is not correctly set up. ' +
            'The original import error said: ' + str(_import_error))

    pid = os.getpid()
    if _pid == pid:  # already initialized
        return

    drv.init()

    if device is None:  # use default device
        context = cutools.make_default_context()
        device = Context.get_device()
    else:
        device = Device(device)
        context = device.make_context()
    _contexts = {device: context}
    _generators = {}
    _pools = {}
    _cublas_handles = {}
    cumisc.init(mem_alloc)
    seed(os.environ.get('CHAINER_SEED'))
    _pid = pid  # mark as initialized
    atexit.register(shutdown)
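# Hedged usage sketch (not part of the original source): initializing the
# global CUDA state for GPU 0 before any allocation or kernel call; setting
# CHAINER_SEED here is optional and only shown to fix the RNG seed.
def _demo_init():
    import os
    os.environ.setdefault('CHAINER_SEED', '0')  # optional: fixed RNG seed
    init(0)                                     # context, pools, CUBLAS for GPU 0
    return get_device()                         # current device after init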
def ensure_pycuda_context():
    global pycuda_context, pycuda_initialized
    if not pycuda_initialized:
        if Context is None:
            raise RuntimeError("PyCUDA not found or too old.")
        else:
            pycuda_context = Context.attach()
            import atexit
            atexit.register(pycuda_context.detach)
            pycuda_initialized = True
    return pycuda_context
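# Hedged usage sketch (not part of the original source): assuming the process
# already holds a CUDA context that ensure_pycuda_context() can attach to,
# a caller would invoke it before compiling and launching kernels. The trivial
# kernel below is illustrative only.
def _demo_ensure_pycuda_context():
    from pycuda.compiler import SourceModule
    ensure_pycuda_context()
    mod = SourceModule("__global__ void noop() {}")
    mod.get_function("noop")(block=(1, 1, 1), grid=(1, 1))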
def _check_arch(self, arch):
    if arch is None:
        return
    try:
        from pycuda.driver import Context
        capability = Context.get_device().compute_capability()
        if tuple(map(int, tuple(arch.split("_")[1]))) > capability:
            from warnings import warn
            warn("trying to compile for a compute capability "
                 "higher than selected GPU")
    except:
        pass
def calcM(N, Co, M_cu, U_shape, U_cu, V_shape, V_cu):
    Co = (U_shape[2] - 1) * 32 + U_shape[4]
    Ci = U_shape[3]
    GK = ceil_div(Co, 32)

    tiles = V_shape[4]
    GN = V_shape[2]

    print('GK', GK, 'GN', GN, 'tiles', tiles, 'Co', Co, 'Ci', Ci, 'N', N)

    grid = (tiles * tiles, 1, 1)  # b
    block = (32, 16, 1)  # 16 for intel...
    call_cu_kernel(
        k_calcM, grid, block,
        M_cu, U_cu, V_cu,
        Ci, 1, tiles, GN, GK)  # ,
    # cl.LocalMemory(32 * 32 * 4), cl.LocalMemory(32 * 32 * 4))
    Context.synchronize()
    timecheck('calced M_cu')
def calcU(W_shape, W_cu, U_cu):
    Ci = W_shape[0]
    kH = W_shape[1]
    kW = W_shape[2]
    Co = W_shape[3]

    # this is adapted from neon's winograd_conv.py:
    GK = ceil_div(Co, 32)

    filter_size = 1152 * Ci * GK

    grid = (GK, Ci, 1)
    block = (32, 1, 1)
    call_cu_kernel(
        k_calcU, grid, block,
        U_cu, W_cu,
        kH * kW * Co, kW * Co, kW * Co * 2, Co,
        Ci * 1152, Ci, GK)
    Context.synchronize()
    timecheck('calced U_cu')
def _init_thread_memory(self, dev_id: int, ctx: cuda.Context,
                        alloc_size: int) -> None:
    '''
    Single thread that initializes the memory for all the streams of a
    single GPU.
    '''
    ctx.push()
    size_per_batch = np.int32(np.ceil(alloc_size / self.num_stream))

    # Initialize streams
    for i in range(self.num_stream):
        self.streams[dev_id].append(cuda.Stream())

    for i in range(0, self.num_stream, 1):
        # allocate memory on device
        self.moments_device[dev_id].append(
            cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 10)))
        self.w_device[dev_id].append(
            cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9)))
        self.x_device[dev_id].append(
            cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9)))
        self.y_device[dev_id].append(
            cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9)))

        # set host memory for returned output
        self.c_moments[dev_id].append(
            cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 7)))
        self.mu[dev_id].append(
            cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))
        self.yf[dev_id].append(
            cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))

        self.m1[dev_id].append(
            cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 5)))
        self.float_value_set[dev_id](self.m1[dev_id][i], np.float32(0),
                                     size_per_batch, size_per_batch,
                                     block=self.block_size,
                                     grid=self.grid_size,
                                     stream=self.streams[dev_id][i])
        self.float_value_set[dev_id](self.m1[dev_id][i], np.float32(1),
                                     size_per_batch, np.int32(0),
                                     block=self.block_size,
                                     grid=self.grid_size,
                                     stream=self.streams[dev_id][i])
        self.x1[dev_id].append(
            cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))
        self.w1[dev_id].append(
            cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))
        self.x2[dev_id].append(
            cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))
        self.w2[dev_id].append(
            cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))

    ctx.synchronize()
    ctx.pop()
def get_cublas_handle():
    """Gets CUBLAS handle for the current device.

    Returns:
        CUBLAS handle.

    """
    global _cublas_handles

    device = Context.get_device()
    if device in _cublas_handles:
        return _cublas_handles[device]

    handle = cublas.cublasCreate()
    _cublas_handles[device] = handle
    return handle
def _set_thread_args(self, dev_id: int, ctx: cuda.Context,
                     moment: np.ndarray, w_out: np.ndarray,
                     x_out: np.ndarray, y_out: np.ndarray):
    '''
    Set the input moments for all the streams of a specific GPU.
    '''
    ctx.push()
    # number of inputs for this GPU
    max_size = moment.shape[1]

    # loop through the streams to set their input
    for i in range(0, self.num_stream, 1):
        # Size of input allocated for each stream
        size_per_batch = int(np.ceil(max_size / self.num_stream))
        # location on the original input array where the input to this
        # stream starts
        loc = np.int32(i * size_per_batch)
        if loc + size_per_batch > max_size:
            size_per_batch = max_size - loc

        self.moment_chunk_host[dev_id].append(
            np.ascontiguousarray(moment[:, loc:loc + size_per_batch],
                                 dtype=np.float32))
        self.moment_chunk_host[dev_id][i] = cuda.register_host_memory(
            self.moment_chunk_host[dev_id][i],
            cuda.mem_host_register_flags.PORTABLE)

        self.w_chunk_host[dev_id].append(
            np.ascontiguousarray(
                np.zeros_like(w_out[:, loc:loc + size_per_batch])))
        self.w_chunk_host[dev_id][i] = cuda.register_host_memory(
            self.w_chunk_host[dev_id][i],
            cuda.mem_host_register_flags.PORTABLE)

        self.x_chunk_host[dev_id].append(
            np.ascontiguousarray(
                np.zeros_like(x_out[:, loc:loc + size_per_batch])))
        self.x_chunk_host[dev_id][i] = cuda.register_host_memory(
            self.x_chunk_host[dev_id][i],
            cuda.mem_host_register_flags.PORTABLE)

        self.y_chunk_host[dev_id].append(
            np.ascontiguousarray(
                np.zeros_like(y_out[:, loc:loc + size_per_batch])))
        self.y_chunk_host[dev_id][i] = cuda.register_host_memory(
            self.y_chunk_host[dev_id][i],
            cuda.mem_host_register_flags.PORTABLE)

    ctx.synchronize()
    ctx.pop()
def compile(source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]): if not no_extern_c: source = 'extern "C" {\n%s\n}\n' % source if options is None: options = DEFAULT_NVCC_FLAGS options = options[:] if arch is None: try: from pycuda.driver import Context arch = "sm_%d%d" % Context.get_device().compute_capability() except RuntimeError: pass from pycuda.driver import CUDA_DEBUGGING if CUDA_DEBUGGING: cache_dir = False keep = True options.extend(["-g", "-G"]) if cache_dir is None: from os.path import join import appdirs cache_dir = os.path.join(appdirs.user_cache_dir("pycuda", "pycuda"), "compiler-cache-v1") from os import makedirs try: makedirs(cache_dir) except OSError, e: from errno import EEXIST if e.errno != EEXIST: raise
def has_double_support():
    from pycuda.driver import Context
    return Context.get_device().compute_capability() >= (1, 3)
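# Hedged usage sketch (not part of the original source): choosing a dtype for
# device arrays depending on whether the current device supports doubles.
def _demo_pick_dtype():
    import numpy as np
    return np.float64 if has_double_support() else np.float32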
def compile(source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]): if not no_extern_c: source = 'extern "C" {\n%s\n}\n' % source if options is None: options = DEFAULT_NVCC_FLAGS options = options[:] if arch is None: try: from pycuda.driver import Context arch = "sm_%d%d" % Context.get_device().compute_capability() except RuntimeError: pass from pycuda.driver import CUDA_DEBUGGING if CUDA_DEBUGGING: cache_dir = False keep = True options.extend(["-g", "-G"]) if cache_dir is None: from os.path import join from tempfile import gettempdir cache_dir = join( gettempdir(), "pycuda-compiler-cache-v1-%s" % _get_per_user_string()) from os import mkdir try: mkdir(cache_dir) except OSError as e: from errno import EEXIST if e.errno != EEXIST: raise if arch is not None: options.extend(["-arch", arch]) if code is not None: options.extend(["-code", code]) if 'darwin' in sys.platform and sys.maxint == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 2147483647: options.append('-m32') include_dirs = include_dirs + [_find_pycuda_include_path()] for i in include_dirs: options.append("-I" + i) return compile_plain(source, options, keep, nvcc, cache_dir)
def compile(source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], target="cubin"): assert target in ["cubin", "ptx", "fatbin"] if not no_extern_c: source = 'extern "C" {\n%s\n}\n' % source if options is None: options = DEFAULT_NVCC_FLAGS options = options[:] if arch is None: from pycuda.driver import Error try: from pycuda.driver import Context arch = "sm_%d%d" % Context.get_device().compute_capability() except Error: pass from pycuda.driver import CUDA_DEBUGGING if CUDA_DEBUGGING: cache_dir = False keep = True options.extend(["-g", "-G"]) if "PYCUDA_CACHE_DIR" in os.environ and cache_dir is None: cache_dir = os.environ["PYCUDA_CACHE_DIR"] if "PYCUDA_DISABLE_CACHE" in os.environ: cache_dir = False if cache_dir is None: from os.path import join import appdirs cache_dir = os.path.join(appdirs.user_cache_dir("pycuda", "pycuda"), "compiler-cache-v1") from os import makedirs try: makedirs(cache_dir) except OSError as e: from errno import EEXIST if e.errno != EEXIST: raise if arch is not None: options.extend(["-arch", arch]) if code is not None: options.extend(["-code", code]) if 'darwin' in sys.platform and sys.maxsize == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 9223372036854775807: options.append('-m64') elif 'win32' in sys.platform and sys.maxsize == 2147483647: options.append('-m32') include_dirs = include_dirs + [_find_pycuda_include_path()] for i in include_dirs: options.append("-I" + i) return compile_plain(source, options, keep, nvcc, cache_dir, target)
def has_stack():
    from pycuda.driver import Context
    return Context.get_device().compute_capability() >= (2, 0)
def process(iH, iW, N, Ci, Co, kH=3, kW=3):
    inittime()

    np.random.seed(123)

    oH = iH
    oW = iW
    tiles = iW // 4

    shlN = 5
    shlY, shlX, maskY, shrY, maskX, shrX, maskN, supY, supX = {
        0: (4, 5, 0x18, 3, 0x07, 0, 0x00, 0x203, 0x300),  # 4x8 yyxxx
        1: (4, 4, 0x18, 3, 0x06, 1, 0x01, 0x203, 0x201),  # 4x4 yyxxn
        2: (3, 4, 0x10, 4, 0x0c, 2, 0x03, 0x104, 0x202),  # 2x4 yxxnn
        3: (2, 4, 0x00, 0, 0x18, 3, 0x07, 0x000, 0x203),  # 1x4 xxnnn
        4: (2, 3, 0x00, 0, 0x10, 4, 0x0f, 0x000, 0x104),  # 1x2 xnnnn
        5: (2, 2, 0x00, 0, 0x00, 0, 0x1f, 0x000, 0x000),  # 1x1 nnnnn
    }.get(shlN)
    GYS = ceil_div(oH, 1 << shlY)
    GXS = ceil_div(oW, 1 << shlX)
    GN = ceil_div(N, 1 << shlN)
    # GK = ceil_div(Co, 32)
    GYS2 = GYS // 2
    GXS2 = GXS * 2
    GK = ceil_div(Co, 32)

    W = np.random.randn(Ci, kH, kW, Co).astype(np.float32)
    I = np.zeros((Ci, iH, iW, N), dtype=np.float32)
    I[:] = np.random.randn(*I.shape)

    print('Co', Co, 'iH', iH, 'iW', iW, 'N', N, 'tiles', tiles)

    W_cu = gpuarray.to_gpu(W)
    I_cu = gpuarray.to_gpu(I)

    U = np.zeros((6, 6, GK, Ci, 32,), dtype=np.float32)
    U_cu = gpuarray.to_gpu(U)

    V = np.zeros((6, 6, GN, GXS, GYS, Ci, 32), dtype=np.float32)
    V_cu = gpuarray.to_gpu(V)

    M = np.zeros((GN, 32, GK, 32, tiles, tiles, 6, 6,), dtype=np.float32)
    M_cu = gpuarray.to_gpu(M)

    O = np.zeros((GN, 32, GK, 32, tiles, tiles, 4, 4,), dtype=np.float32)
    O_cu = gpuarray.to_gpu(O)

    Context.synchronize()
    print('allocated buffers')

    start = time.time()
    for it in range(3):
        calcU(U_cu=U_cu, W_shape=W.shape, W_cu=W_cu)
        calcV(V_cu=V_cu, I_shape=I.shape, I_cu=I_cu)
        calcM(N=N, Co=Co, M_cu=M_cu, U_shape=U.shape, U_cu=U_cu,
              V_shape=V.shape, V_cu=V_cu)
        calcO(O_cu=O_cu, M_shape=M.shape, M_cu=M_cu)
    Context.synchronize()
    end = time.time()
    print('calcs done')
    print('time for all calcs:', end - start)

    start = time.time()
    O = O_cu.get()
    # cl.enqueue_copy(q, O, O_cu)
    O = O.transpose(2, 3, 4, 6, 5, 7, 0, 1).reshape(
        GK * 32, tiles * 4, tiles * 4, GN * 32)
    print('O.shape', O.shape)

    W_from_cu = np.zeros((Ci, 3, 3, Co), dtype=np.float32)
    W_from_cu = W_cu.get()

    U_from_cpu = winograd_cpu.calcU(W=W)
    U_from_cu = np.zeros((6, 6, GK, Ci, 32), dtype=np.float32)
    U_from_cu = U_cu.get()
    U_from_cu_ = U_from_cu.transpose(
        0, 1, 2, 4, 3).reshape(6, 6, GK * 32, Ci)[:, :, :Co]
    assert np.allclose(U_from_cu_, U_from_cpu, atol=1e-4)

    V_from_cpu = winograd_cpu.calcV(I=I)
    V_from_cu = np.copy(V)
    V_from_cu = V_cu.get()

    print('tiles', tiles)
    # 0  1  2   3    4    5   6
    # 6, 6, GN, GXS, GYS, Ci, 32
    V_from_cu_ = V_from_cu.transpose(
        2, 6, 0, 1, 5, 3, 4).reshape(
        GN * 32, 6, 6, Ci, tiles, tiles)[:N]
    assert np.allclose(V_from_cu_, V_from_cpu, atol=1e-3)

    #  0       1       2          3        4    5    6    7
    # [n//32][n % 32][co // 32][co % 32][th][tw][xi][nu]
    M_from_cpu = winograd_cpu.calcM(U=U_from_cu, V=V_from_cu, N=N, Co=Co)
    M_from_cu = np.copy(M)
    M_from_cu = M_cu.get()
    Context.synchronize()
    M_from_cu = M_from_cu.reshape(
        GN * 32, GK * 32, tiles, tiles, 6, 6)[:N, :Co]
    print(M_from_cu.reshape(M_from_cu.size)[:20])
    assert np.allclose(M_from_cu, M_from_cpu, atol=1e-2)

    # np.transpose(V_from_cu, [2, 6, 4, 5, 3, 0, 1])
    # V_from_cu = V_from_cu.reshape(GN * 32, 6, 6, Ci, tiles, tiles)[:N, :, :, :, :, :]

    return {'W': W, 'O': O, 'I': I}
            Ts[i_20] += An[8] / my_factorial;
            Ts[i_21] += An[9] / my_factorial;
            Ts[i_22] += An[10] / my_factorial;
            Ts[i_23] += An[11] / my_factorial;
        }
    }
}
"""

try:
    Context.get_device()
except:
    import pycuda.autoinit

mod = SourceModule(krnl, no_extern_c=True)
_gpu_expm = mod.get_function("expm")


def gpu_expm(As, Ts_vectorized, p=12):
    N = len(As)
    if Ts_vectorized.ndim != 2 or Ts_vectorized.shape[1] != 12:
        raise ValueError(Ts_vectorized.shape)
    # threadsPerBlock = 1024
    # Regardless of the value of N, for some reason this gives errors
    # (only) on the machines with the good graphics
#!/usr/bin/env python
"""
Created on Wed Sep 3 11:08:37 2014

Author: Oren Freifeld
Email: [email protected]
"""
from pycuda.compiler import SourceModule
from pycuda.driver import Context

try:
    Context.get_device()
except:
    import pycuda.autoinit


class KernelThinWrapper(object):
    def __init__(self, gpu_kernel, include_dirs=[]):
        self._gpu_kernel = gpu_kernel
        self._src_module = SourceModule(gpu_kernel, include_dirs=include_dirs)

    def _get_function_from_src_module(self, func_name):
        self.__dict__["_gpu_" + func_name] = \
            self._src_module.get_function(func_name)

    def __call__(self, *args, **kwargs):
        msg = """
        You need to customize this method in the derived class.
        The customized method will usually have 3 parts: