def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m, phi, grid_idcs,
                    prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`numbapro` installation. It is typically
    slower compared to :func:`kern_MKL_sparse` but it depends on your hardware.

    Per step the device performs two ``gemv`` calls followed by one ``axpy``:
    ``delta = op(int_m) * phi``, ``delta += rho_inv[step] * op(dec_m) * phi``
    and ``phi += dX[step] * delta``, where ``op`` is whatever ``trans='T'``
    selects in cuBLAS.

    All scalars handed to cuBLAS are cast to the precision selected by
    ``config['CUDA_precision']``, so that a 64 bit run is not silently
    truncated to single precision.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs: not referenced by this kernel; accepted so that the
        signature matches the other integration kernels
      prog_bar (object,optional): handle to :class:`ProgressBar` object

    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration

    Raises:
      Exception: if ``config['CUDA_precision']`` is neither 32 nor 64, or if
        the :mod:`numbapro` CUDA libraries can not be imported.
    """

    precision = config['CUDA_precision']
    if precision == 32:
        calc_precision = np.float32
    elif precision == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " +
                        "installed.\nCan not use GPU.")

    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(calc_precision), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(calc_precision), stream)
    cu_curr_phi = cuda.to_device(phi.astype(calc_precision), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)

    # Loop-invariant scalars in the same precision as the device arrays.
    one = calc_precision(1.0)
    zero = calc_precision(0.0)

    for step in xrange(nsteps):
        if prog_bar:
            prog_bar.update(step)

        # delta_phi = op(int_m) * phi   (beta = 0 overwrites delta_phi)
        cubl.gemv(trans='T', m=m, n=n, alpha=one, A=cu_int_m,
                  x=cu_curr_phi, beta=zero, y=cu_delta_phi)
        # delta_phi += rho_inv[step] * op(dec_m) * phi   (beta = 1 accumulates)
        cubl.gemv(trans='T', m=m, n=n,
                  alpha=calc_precision(rho_inv[step]), A=cu_dec_m,
                  x=cu_curr_phi, beta=one, y=cu_delta_phi)
        # phi += dX[step] * delta_phi
        cubl.axpy(alpha=calc_precision(dX[step]), x=cu_delta_phi,
                  y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()
def induced_velocity3(x, xvort, gam, vel):
    """Accumulate into ``vel`` the velocity induced at ``x[i]`` by all vortices.

    One thread handles one row ``i`` of ``x`` (index taken from
    ``cuda.grid(1)``); threads whose index is past ``x.shape[0]`` do nothing.
    For every vortex ``j`` the squared distance is regularised with
    ``eps**2`` and the contribution ``gam[j] * (dy, -dx) / rsq`` is added to
    ``vel[i, 0]`` and ``vel[i, 1]``, which are zeroed first.

    Args:
      x: evaluation points, read as ``x[i, 0]`` and ``x[i, 1]``
      xvort: vortex positions, read as ``xvort[j, 0]`` and ``xvort[j, 1]``
      gam: per-vortex strength, read as ``gam[j]``
      vel: output array, written as ``vel[i, 0]`` and ``vel[i, 1]``
    """
    # NOTE(review): ``eps`` is not defined in this function; a local
    # ``eps = float32(1.e-2)`` appears to be commented out in the original,
    # so the value is presumably supplied by the enclosing module -- confirm.
    i = cuda.grid(1)
    if i >= x.shape[0]:
        return

    vel[i, 0] = float32(0.)
    vel[i, 1] = float32(0.)
    for j in range(xvort.shape[0]):
        dx = x[i, 0] - xvort[j, 0]
        dy = x[i, 1] - xvort[j, 1]
        rsq = dx**2 + dy**2 + eps**2
        vel[i, 0] += gam[j] * dy / rsq
        vel[i, 1] += -gam[j] * dx / rsq
from numbapro import cuda, vectorize, float32, void
import numpy
import time


@cuda.jit(void(float32, float32[:], float32[:], float32[:]))
def saxpy(a, x, y, out):
    """CUDA kernel: ``out[i] = a * x[i] + y[i]`` for one element per thread."""
    # Short for cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    i = cuda.grid(1)
    if i >= out.size:
        # Threads past the end of the output have nothing to do.
        return
    out[i] = a * x[i] + y[i]


@vectorize([float32(float32, float32, float32)], target='gpu')
def vec_saxpy(a, x, y):
    """Scalar form of saxpy; ``vectorize`` applies it element-wise."""
    return a * x + y


# Problem size and host-side input/output buffers.
n = 16 * 1024 * 1024
a = numpy.float32(2.0)
x = numpy.arange(n, dtype=numpy.float32)
y = numpy.arange(n, dtype=numpy.float32)
out = numpy.empty_like(x)

start_time = time.time()

# Launch configuration: enough blocks of ``size_block`` threads to cover n.
size_block = 1024
size_grid = (n - 1) // size_block + 1
saxpy[size_grid, size_block](a, x, y, out)
from numbapro import vectorize, float32


# NOTE: the name shadows the builtin ``sum``; it is kept because it is this
# module's public name.
@vectorize([float32(float32, float32)], target='parallel')
def sum(a, b):
    """Return ``a + b``; ``vectorize`` applies it element-wise to float32."""
    total = a + b
    return total
i = cuda.grid(1) # Map i to array elements if i >= out.size: # Out of range? return # Do actual work out[i] = a * x[i] + y[i] """ Vectorize turns a scalar function into a elementwise operation over the input arrays. """ @vectorize([float32(float32, float32, float32)], target='gpu') def vec_saxpy(a, x, y): ### Task 1 ### # Complete the vectorize version # Hint: this is a scalar function of # float32(float32 a, float32 x, float32 y) return a * x + y # CPU code # --------- NUM_BLOCKS = 1 NUM_THREADS = 32 NELEM = NUM_BLOCKS * NUM_THREADS
def kern_CUDA_sparse(nsteps, dX, rho_inv, int_m, dec_m, phi, grid_idcs,
                     prog_bar=None):
    """`NVIDIA CUDA cuSPARSE <https://developer.nvidia.com/cusparse>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`numbapro` installation.

    Per step the device performs two ``csrmv`` calls followed by one
    ``axpy``: ``delta = op(int_m) * phi``,
    ``delta += rho_inv[step] * op(dec_m) * phi`` and
    ``phi += dX[step] * delta``, where ``op`` is whatever ``trans='T'``
    selects in cuSPARSE.

    All scalars handed to cuSPARSE/cuBLAS are cast to the precision selected
    by ``config['CUDA_precision']``, so that a 64 bit run is not silently
    truncated to single precision.

    Note:
      Currently some bug in :mod:`numbapro` introduces unnecessary array
      copies and slows down the execution tremendously.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs: not referenced by this kernel; accepted so that the
        signature matches the other integration kernels
      prog_bar (object,optional): handle to :class:`ProgressBar` object

    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration

    Raises:
      Exception: if ``config['CUDA_precision']`` is neither 32 nor 64, or if
        the :mod:`numbapro` CUDA libraries can not be imported.
    """

    precision = config['CUDA_precision']
    if precision == 32:
        calc_precision = np.float32
    elif precision == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_sparse(): Unknown precision specified.")

    print ("kern_CUDA_sparse(): Warning, the performance is slower than " +
           "dense cuBLAS or any type of MKL.")

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib import cusparse  # @UnresolvedImport
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_sparse(): Numbapro CUDA libaries not " +
                        "installed.\nCan not use GPU.")

    cusp = cusparse.Sparse()
    cubl = Blas()
    m, n = int_m.shape

    # The matrices are read through their .nnz/.data/.indptr/.indices
    # attributes, i.e. they must be given in a CSR-style sparse container.
    int_m_nnz = int_m.nnz
    int_m_csrValA = cuda.to_device(int_m.data.astype(calc_precision))
    int_m_csrRowPtrA = cuda.to_device(int_m.indptr)
    int_m_csrColIndA = cuda.to_device(int_m.indices)

    dec_m_nnz = dec_m.nnz
    dec_m_csrValA = cuda.to_device(dec_m.data.astype(calc_precision))
    dec_m_csrRowPtrA = cuda.to_device(dec_m.indptr)
    dec_m_csrColIndA = cuda.to_device(dec_m.indices)

    cu_curr_phi = cuda.to_device(phi.astype(calc_precision))
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)

    descr = cusp.matdescr()
    descr.indexbase = cusparse.CUSPARSE_INDEX_BASE_ZERO

    # Loop-invariant scalars in the same precision as the device arrays.
    one = calc_precision(1.0)
    zero = calc_precision(0.0)

    for step in xrange(nsteps):
        # Only refresh the progress bar on every fifth step.
        if prog_bar and (step % 5 == 0):
            prog_bar.update(step)

        # delta_phi = op(int_m) * phi   (beta = 0 overwrites delta_phi)
        cusp.csrmv(trans='T', m=m, n=n, nnz=int_m_nnz, descr=descr,
                   alpha=one, csrVal=int_m_csrValA,
                   csrRowPtr=int_m_csrRowPtrA, csrColInd=int_m_csrColIndA,
                   x=cu_curr_phi, beta=zero, y=cu_delta_phi)
        # delta_phi += rho_inv[step] * op(dec_m) * phi   (beta = 1 accumulates)
        cusp.csrmv(trans='T', m=m, n=n, nnz=dec_m_nnz, descr=descr,
                   alpha=calc_precision(rho_inv[step]), csrVal=dec_m_csrValA,
                   csrRowPtr=dec_m_csrRowPtrA, csrColInd=dec_m_csrColIndA,
                   x=cu_curr_phi, beta=one, y=cu_delta_phi)
        # phi += dX[step] * delta_phi
        cubl.axpy(alpha=calc_precision(dX[step]), x=cu_delta_phi,
                  y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()
''' Demonstrate broadcasting when a scalar is provided as an argument to a vectorize function. Please read NumPy Broadcasting documentation for details about broadcasting: http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html ''' from __future__ import print_function import numpy as np from numbapro import vectorize, float32 @vectorize([float32(float32, float32, float32)], target='parallel') def truncate(x, xmin, xmax): """ Truncate x[:] to [xmin, xmax] interval """ if x < xmin: x = xmin elif x > xmax: x = xmax return x def main(): x = np.arange(100, dtype=np.float32) print('x = %s' % x) xmin = np.float32(20) # as float32 type scalar xmax = np.float32(70) # as float32 type scalar # The scalar arguments are broadcasted into an array. # This process creates arrays of zero strides. # The resulting array will contain exactly one element despite it # has a shape that matches that of `x`.
''' Demonstrate broadcasting when a scalar is provided as an argument to a vectorize function. Please read NumPy Broadcasting documentation for details about broadcasting: http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html ''' from __future__ import print_function import numpy as np from numbapro import vectorize, float32 @vectorize([float32(float32, float32, float32)], target='parallel') def truncate(x, xmin, xmax): """ Truncate x[:] to [xmin, xmax] interval """ if x < xmin: x = xmin elif x > xmax: x = xmax return x def main(): x = np.arange(100, dtype=np.float32) print('x = %s' % x) xmin = np.float32(20) # as float32 type scalar xmax = np.float32(70) # as float32 type scalar # The scalar arguments are broadcasted into an array. # This process creates arrays of zero strides.