def __init__(self, device):
    self.api_id = get_id()
    self._device = device
    self.max_work_group_size = device.max_threads_per_block
    self.max_work_item_sizes = [
        device.max_block_dim_x,
        device.max_block_dim_y,
        device.max_block_dim_z]
    self.max_num_groups = [
        device.max_grid_dim_x,
        device.max_grid_dim_y,
        device.max_grid_dim_z]

    # there is no corresponding constant in the API at the moment
    self.local_mem_banks = 16 if device.compute_capability()[0] < 2 else 32
    self.warp_size = device.warp_size

    devdata = DeviceData(device)
    self.min_mem_coalesce_width = dict(
        (size, devdata.align_words(word_size=size)) for size in [4, 8, 16])
    self.local_mem_size = device.max_shared_memory_per_block
    self.compute_units = device.multiprocessor_count
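
# A standalone sketch of the same introspection without the wrapper class;
# pycuda.autoinit is assumed here for context creation, everything else is
# the public pycuda API used above.
import pycuda.autoinit
from pycuda.tools import DeviceData

dev = pycuda.autoinit.device
devdata = DeviceData(dev)
print("max threads per block:", dev.max_threads_per_block)
print("max block dims:", dev.max_block_dim_x, dev.max_block_dim_y, dev.max_block_dim_z)
print("max grid dims:", dev.max_grid_dim_x, dev.max_grid_dim_y, dev.max_grid_dim_z)
print("local mem size:", dev.max_shared_memory_per_block)
print("coalesce widths:", {s: devdata.align_words(word_size=s) for s in (4, 8, 16)})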
def __init__(self, device, stream, mempool):
    self._stream = stream
    self._recreate_stream = stream is None

    devdata = DeviceData(device)
    self.min_mem_coalesce_width = {}
    for size in [4, 8, 16]:
        self.min_mem_coalesce_width[size] = devdata.align_words(word_size=size)
    self.num_smem_banks = devdata.smem_granularity

    self.max_registers = device.get_attribute(
        device_attribute.MAX_REGISTERS_PER_BLOCK)
    self.max_grid_x = 2 ** log2(
        device.get_attribute(device_attribute.MAX_GRID_DIM_X))
    self.max_grid_y = 2 ** log2(
        device.get_attribute(device_attribute.MAX_GRID_DIM_Y))
    self.max_block_size = device.get_attribute(
        device_attribute.MAX_BLOCK_DIM_X)
    self.max_shared_mem = device.get_attribute(
        device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)

    if mempool is None:
        self.allocate = cuda.mem_alloc
    else:
        self._mempool = mempool
        self.allocate = mempool.allocate
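
# The 2**log2(...) pattern above rounds a device attribute down to the
# nearest power of two. A minimal sketch of the same idea using
# pycuda.tools.bitlog2 (assumes a context, e.g. via pycuda.autoinit):
import pycuda.autoinit
from pycuda.driver import device_attribute
from pycuda.tools import bitlog2

dev = pycuda.autoinit.device
max_grid_x = 2 ** bitlog2(dev.get_attribute(device_attribute.MAX_GRID_DIM_X))
print("largest power-of-two grid width:", max_grid_x)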
def _splay_backend(n, dev):
    # heavily modified from cublas
    from pycuda.tools import DeviceData
    devdata = DeviceData(dev)

    min_threads = devdata.warp_size
    max_threads = 128
    max_blocks = 4 * devdata.thread_blocks_per_mp \
            * dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)

    if n < min_threads:
        block_count = 1
        threads_per_block = min_threads
    elif n < (max_blocks * min_threads):
        block_count = (n + min_threads - 1) // min_threads
        threads_per_block = min_threads
    elif n < (max_blocks * max_threads):
        block_count = max_blocks
        grp = (n + min_threads - 1) // min_threads
        threads_per_block = ((grp + max_blocks - 1) // max_blocks) * min_threads
    else:
        block_count = max_blocks
        threads_per_block = max_threads

    # print("n:%d bc:%d tpb:%d" % (n, block_count, threads_per_block))
    return (block_count, 1), (threads_per_block, 1, 1)
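
# Usage sketch for _splay_backend (hypothetical problem sizes; assumes a
# CUDA context exists, e.g. via pycuda.autoinit, and that drv above is
# pycuda.driver):
import pycuda.autoinit
import pycuda.driver as drv

dev = pycuda.autoinit.device
for n in (10, 10000, 10000000):
    grid, block = _splay_backend(n, dev)
    print("n=%d -> grid=%s, block=%s" % (n, grid, block))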
def setDevice(ndev=None):
    '''To use CUDA or OpenCL you need a device and a context to
    establish the communication with it.
    '''
    cuda.init()
    nDevices = cuda.Device.count()
    print("Available Devices:")
    for i in range(nDevices):
        dev = cuda.Device(i)
        try:
            mem = cuda.mem_get_info()[-i - 1]
        except Exception:
            mem = 0
        print("  Device {0}: {1}, Total (MB) {2:.1f}, Free (MB) {3:.1f}".format(
            i, dev.name(), dev.total_memory() / 2.**20, mem / 2.**20))
    devNumber = 0
    if nDevices > 1:
        if ndev is None:
            devNumber = int(input("Select device number: "))
        else:
            devNumber = ndev
    dev = cuda.Device(devNumber)
    # cuda.Context.pop()  # disable previous CUDA context
    ctxCUDA = dev.make_context()
    devdata = DeviceData(dev)
    print("Using device {0}: {1}".format(devNumber, dev.name()))
    return ctxCUDA, dev, devdata
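
# Usage sketch: pick device 0 non-interactively with the setDevice defined
# above, then release the context when done.
ctx, dev, devdata = setDevice(0)
try:
    print("max threads per block:", devdata.max_threads)
finally:
    ctx.pop()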
def __init__(self, mat, dtype):
    self.dtype = np.dtype(dtype)
    self.index_dtype = np.dtype(np.int32)
    self.shape = mat.shape
    self.block_size = 128

    from scipy.sparse import coo_matrix
    coo_mat = coo_matrix(mat, dtype=self.dtype)

    self.row_gpu = gpuarray.to_gpu(coo_mat.row.astype(self.index_dtype))
    self.col_gpu = gpuarray.to_gpu(coo_mat.col.astype(self.index_dtype))
    self.data_gpu = gpuarray.to_gpu(coo_mat.data)
    self.nnz = coo_mat.nnz

    from pycuda.tools import DeviceData
    dev = drv.Context.get_device()
    devdata = DeviceData()
    max_threads = (devdata.warps_per_mp * devdata.warp_size
                   * dev.multiprocessor_count)
    max_blocks = 4 * max_threads // self.block_size
    warps_per_block = self.block_size // dev.warp_size

    if self.nnz:
        def divide_into(x, y):
            return (x + y - 1) // y

        num_units = self.nnz // dev.warp_size
        num_warps = min(num_units, warps_per_block * max_blocks)
        self.num_blocks = divide_into(num_warps, warps_per_block)
        num_iters = divide_into(num_units, num_warps)

        self.interval_size = dev.warp_size * num_iters
        self.tail = num_units * dev.warp_size
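
# Worked example of the work distribution above for a hypothetical device
# (warp_size=32, warps_per_mp=48, 16 multiprocessors), nnz=1_000_000 and
# block_size=128 -- pure Python, no GPU required:
warp_size, warps_per_mp, mp_count, block_size = 32, 48, 16, 128
nnz = 1_000_000

max_threads = warps_per_mp * warp_size * mp_count         # 24576
max_blocks = 4 * max_threads // block_size                # 768
warps_per_block = block_size // warp_size                 # 4

def divide_into(x, y):
    return (x + y - 1) // y

num_units = nnz // warp_size                              # 31250 warp-sized work units
num_warps = min(num_units, warps_per_block * max_blocks)  # 3072
num_blocks = divide_into(num_warps, warps_per_block)      # 768
num_iters = divide_into(num_units, num_warps)             # 11
interval_size = warp_size * num_iters                     # 352 nonzeros per warp
print(num_blocks, interval_size)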
def orinfo(n):
    orec = OccupancyRecord(DeviceData(), n)
    return """Occupancy record information:
    thread blocks per multiprocessor - %d
    warps per multiprocessor - %d
    limited by - %s
    occupancy - %f
    """ % (orec.tb_per_mp, orec.warps_per_mp, orec.limited_by, orec.occupancy)
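
# Usage sketch (assumes a context, e.g. via pycuda.autoinit, and that
# orinfo's DeviceData/OccupancyRecord come from pycuda.tools):
import pycuda.autoinit
from pycuda.tools import DeviceData, OccupancyRecord

print(orinfo(256))  # occupancy summary for 256-thread blocks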
def make_GPU_gradient(mesh, context):
    '''Prepare to compute gradient on the GPU w.r.t. the given mesh.
    Return gradient function.
    '''
    mx = int(getattr(mesh, 'nx', 1))
    my = int(getattr(mesh, 'ny', 1))
    mz = int(getattr(mesh, 'nz', 1))

    dxInv = np.array(1./getattr(mesh, 'dx', 1), dtype=np.float64)
    dyInv = np.array(1./getattr(mesh, 'dy', 1), dtype=np.float64)
    dzInv = np.array(1./getattr(mesh, 'dz', 1), dtype=np.float64)

    sizeof_double = 8

    with open(where + 'gradient2.cu') as fdlib:
        source = fdlib.read()
    module = SourceModule(source)

    mx_ptr = module.get_global("mx")[0]
    my_ptr = module.get_global("my")[0]
    mz_ptr = module.get_global("mz")[0]
    cuda.memcpy_htod(mx_ptr, np.array(mx, dtype=np.int32))
    cuda.memcpy_htod(my_ptr, np.array(my, dtype=np.int32))
    cuda.memcpy_htod(mz_ptr, np.array(mz, dtype=np.int32))

    dxInv_ptr = module.get_global("dxInv")[0]
    dyInv_ptr = module.get_global("dyInv")[0]
    dzInv_ptr = module.get_global("dzInv")[0]
    cuda.memcpy_htod(dxInv_ptr, dxInv)
    cuda.memcpy_htod(dyInv_ptr, dyInv)
    cuda.memcpy_htod(dzInv_ptr, dzInv)

    deriv_x = module.get_function("gradient_x")
    deriv_y = module.get_function("gradient_y")
    deriv_z = module.get_function("gradient_z")

    block, grid = mesh.get_domain_decomposition(DeviceData().max_threads)

    d_deriv_x = gpuarray.empty(shape=(1, mesh.n_nodes), dtype=np.float64)
    d_deriv_y = gpuarray.empty_like(d_deriv_x)
    d_deriv_z = gpuarray.empty_like(d_deriv_x)

    def _gradient(scalar_values):
        '''Calculate three-dimensional gradient for GPUArray scalar_values.'''
        deriv_x(scalar_values, d_deriv_x, block=block, grid=grid)
        deriv_y(scalar_values, d_deriv_y, block=block, grid=grid)
        deriv_z(scalar_values, d_deriv_z, block=block, grid=grid)
        context.synchronize()
        return (d_deriv_x, d_deriv_y, d_deriv_z)[:mesh.dimension]
    return _gradient
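
# Usage sketch with a toy mesh object. The attribute names (nx, dx,
# n_nodes, dimension, get_domain_decomposition) mirror what
# make_GPU_gradient reads above, but this ToyMesh class is invented for
# illustration, and gradient2.cu must exist at `where` for this to run.
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

class ToyMesh:
    nx, ny, nz = 64, 64, 64
    dx = dy = dz = 0.1
    n_nodes = nx * ny * nz
    dimension = 3

    def get_domain_decomposition(self, max_threads):
        n_blocks = (self.n_nodes + max_threads - 1) // max_threads
        return (max_threads, 1, 1), (n_blocks, 1)

mesh = ToyMesh()
gradient = make_GPU_gradient(mesh, pycuda.autoinit.context)
phi = gpuarray.to_gpu(np.random.rand(1, mesh.n_nodes))
ex, ey, ez = gradient(phi)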
def _compile_gpu(self):
    codegen = CudaKernelGenerator(self)

    # verify all gpuarrays have equal length
    num = [len(self[key]) for key in codegen.args]
    assert num.count(num[0]) == len(num), "Variables have unequal length."
    num = num[0]

    try:
        mod = SourceModule(codegen.src, options=["--ptxas-options=-v"])
        kernel = mod.get_function(self.__class__.__name__)
        kernel.prepare(codegen.arg_ctype)
    except Exception:
        for i, line in enumerate(codegen.src.split('\n')):
            print("{}: {}".format(i, line))
        raise

    deviceData = DeviceData()
    maxThreads = int(deviceData.registers // kernel.num_regs)
    maxThreads = int(2**int(np.log(maxThreads) / np.log(2)))
    threadsPerBlock = int(min(256, maxThreads, deviceData.max_threads))
    numBlocks = (num - 1) // threadsPerBlock + 1
    deviceNumBlocks = 6 * drv.Context.get_device().MULTIPROCESSOR_COUNT

    block = (threadsPerBlock, 1, 1)
    grid = (int(min(numBlocks, deviceNumBlocks)), 1)
    address = [self[key].gpudata for key in codegen.args]

    self.gpu = SimpleNamespace(
        args=codegen.args,
        arg_address=address,
        arg_ctype=codegen.arg_ctype,
        block=block,
        grid=grid,
        kernel=kernel,
        num_thread=num,
        src=codegen.src)
    self.update = self._update_gpu
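
# Worked example of the register-pressure sizing above, pure Python:
# suppose the compiled kernel reports num_regs = 63 and the device offers
# 65536 registers per block with a 1024-thread cap (hypothetical figures).
import numpy as np

registers, num_regs, max_threads = 65536, 63, 1024
maxThreads = registers // num_regs                          # 1040 threads fit by registers
maxThreads = int(2 ** int(np.log(maxThreads) / np.log(2)))  # round down to 1024
threadsPerBlock = min(256, maxThreads, max_threads)         # capped at 256
print(threadsPerBlock)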
def __init__(self, mat, is_symmetric, dtype):
    from pycuda.tools import DeviceData
    devdata = DeviceData()

    # all row indices in the data structure generation code are
    # "unpermuted" unless otherwise specified
    self.dtype = np.dtype(dtype)
    self.index_dtype = np.int32
    self.packed_index_dtype = np.uint32
    self.threads_per_packet = devdata.max_threads

    h, w = self.shape = mat.shape
    if h != w:
        raise ValueError("only square matrices are supported")

    self.rows_per_packet = (devdata.shared_memory - 100) \
            // (2 * self.dtype.itemsize)

    self.block_count = (h + self.rows_per_packet - 1) // self.rows_per_packet

    # get metis partition --------------------------------------------------
    from scipy.sparse import csr_matrix
    csr_mat = csr_matrix(mat, dtype=self.dtype)

    from pymetis import part_graph
    if not is_symmetric:
        # make sure adjacency graph is undirected
        adj_mat = csr_mat + csr_mat.T
    else:
        adj_mat = csr_mat

    while True:
        cut_count, dof_to_packet_nr = part_graph(
            int(self.block_count),
            xadj=adj_mat.indptr,
            adjncy=adj_mat.indices)

        # build packet_nr_to_dofs
        packet_nr_to_dofs = {}
        for i, packet_nr in enumerate(dof_to_packet_nr):
            try:
                dof_packet = packet_nr_to_dofs[packet_nr]
            except KeyError:
                packet_nr_to_dofs[packet_nr] = dof_packet = []
            dof_packet.append(i)

        packet_nr_to_dofs = [packet_nr_to_dofs.get(i)
                             for i in range(len(packet_nr_to_dofs))]

        too_big = False
        for packet_dofs in packet_nr_to_dofs:
            if len(packet_dofs) >= self.rows_per_packet:
                too_big = True
                break

        if too_big:
            old_block_count = self.block_count
            self.block_count = int(2 + 1.05 * self.block_count)
            print("Metis produced a big block at block count "
                  "%d--retrying with %d" % (old_block_count, self.block_count))
            continue

        break

    assert len(packet_nr_to_dofs) == self.block_count

    # permutations, base rows ----------------------------------------------
    (new2old_fetch_indices, old2new_fetch_indices,
     packet_base_rows) = self.find_simple_index_stuff(packet_nr_to_dofs)

    # find local row cost and remaining_coo --------------------------------
    local_row_costs, remaining_coo = self.find_local_row_costs_and_remaining_coo(
        csr_mat, dof_to_packet_nr, old2new_fetch_indices)
    local_nnz = np.sum(local_row_costs)

    assert remaining_coo.nnz == csr_mat.nnz - local_nnz

    # find thread assignment for each block --------------------------------
    thread_count = len(packet_nr_to_dofs) * self.threads_per_packet
    thread_assignments, thread_costs = self.find_thread_assignment(
        packet_nr_to_dofs, local_row_costs, thread_count)

    max_thread_costs = np.max(thread_costs)

    # build data structure --------------------------------------------------
    from .pkt_build import build_pkt_data_structure
    build_pkt_data_structure(
        self, packet_nr_to_dofs, max_thread_costs,
        old2new_fetch_indices, csr_mat, thread_count,
        thread_assignments, local_row_costs)

    self.packet_base_rows = gpuarray.to_gpu(packet_base_rows)
    self.new2old_fetch_indices = gpuarray.to_gpu(new2old_fetch_indices)
    self.old2new_fetch_indices = gpuarray.to_gpu(old2new_fetch_indices)

    from .coordinate import CoordinateSpMV
    self.remaining_coo_gpu = CoordinateSpMV(remaining_coo, dtype)
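
# Worked example of the packet sizing above: with 48 kB of shared memory,
# float64 data and the 100-byte safety margin, each packet holds
# (49152 - 100) // (2 * 8) = 3065 rows, so a 100_000-row matrix starts
# with ceil(100000 / 3065) = 33 packets (hypothetical device figures).
shared_memory, itemsize, h = 49152, 8, 100_000
rows_per_packet = (shared_memory - 100) // (2 * itemsize)
block_count = (h + rows_per_packet - 1) // rows_per_packet
print(rows_per_packet, block_count)  # 3065 33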
from pycuda import gpuarray as gu
from pycuda import driver
from pycuda.cumath import fabs
from pycuda.tools import DeviceData
from pytom.tompy.tools import paste_in_center
from typing import Union, Tuple
from pycuda.reduction import ReductionKernel
from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule
from skcuda.misc import sum, max, min
from skcuda.fft import fft, ifft, Plan
from skcuda.linalg import conj

max_threads = DeviceData().max_threads

cc_mod = SourceModule("""
__global__ void update_scores_angles(float *scores, float *angles, float *ccc_map,
                                     float angleId, int num_elements, int dimx)
{
    // each thread walks dimx consecutive elements
    const int idx = (threadIdx.x + blockIdx.x * dimx) * dimx;

    for (int i = 0; i < dimx; i++) {
        if (idx + i < num_elements) {
            if (scores[idx + i] < ccc_map[idx + i]) {
                scores[idx + i] = ccc_map[idx + i];
                angles[idx + i] = angleId;
            }
        }
    }
}
""")
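
# Usage sketch for the kernel above (hypothetical sizes; assumes a CUDA
# context, e.g. via pycuda.autoinit). One block of dimx threads covers
# dimx*dimx elements, so the grid is sized accordingly.
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

update_scores_angles = cc_mod.get_function('update_scores_angles')

dimx = 32
num_elements = dimx ** 3
scores = gpuarray.zeros(num_elements, dtype=np.float32)
angles = gpuarray.zeros(num_elements, dtype=np.float32)
ccc_map = gpuarray.to_gpu(np.random.rand(num_elements).astype(np.float32))

nblocks = (num_elements + dimx * dimx - 1) // (dimx * dimx)
update_scores_angles(scores, angles, ccc_map, np.float32(7),
                     np.int32(num_elements), np.int32(dimx),
                     block=(dimx, 1, 1), grid=(nblocks, 1))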
        'gridDim.z']
    upVar = [str(cuB[0]), str(cuB[1]), str(cuB[2]),
             str(cuG[0]), str(cuG[1]), str(cuG[2])]
    dicVarOptim = dict(zip(downVar, upVar))
    for i in downVar:
        kFile = kFile.replace(i, dicVarOptim[i])
    if compiling:
        kFile = SourceModule(kFile, include_dirs=[myDir])
    return kFile


def getFreeMemory(show=True):
    '''Return the free memory of the device. Very useful for keeping an
    eye on available device memory.
    '''
    Mb = 1024. * 1024.
    Mbytes = float(cuda.mem_get_info()[0]) / Mb
    if show:
        print("Free Global Memory: %f Mbytes" % Mbytes)
    return Mbytes


ctx, device = setDevice()
devdata = DeviceData(device)
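
# Usage sketch: report free memory before and after a large allocation
# (256 MB chosen arbitrarily; cuda is pycuda.driver as above).
getFreeMemory(show=True)
a_gpu = cuda.mem_alloc(256 * 1024 * 1024)
getFreeMemory(show=True)
a_gpu.free()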
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda.tools import DeviceData

dd = DeviceData()
print(dd.shared_memory)
print(dd.max_threads)
print(dir(dd))

rows = 1000
cols = 1000
nagents = 10000

WALL_COLLISION_REWARD = -1.1
ROBOT_COLLISION_REWARD = -3
GOAL_REWARD = 2

view_size = 11
binwidth = 100

from math import ceil
binr = ceil(rows / binwidth)
binc = ceil(cols / binwidth)
nbins = binr * binc


def findNextPowerOf2(n):
    # Smallest power of two >= n: strip low bits of n-1 down to its
    # highest set bit, then shift left once.
    n = n - 1
    while n & (n - 1):
        n = n & (n - 1)
    return n << 1
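
# Quick sanity checks for findNextPowerOf2 (pure Python, no GPU needed):
assert findNextPowerOf2(1000) == 1024
assert findNextPowerOf2(1024) == 1024
assert findNextPowerOf2(1025) == 2048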