Exemplo n.º 1
0
def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m,
                    phi, grid_idcs, prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`numbapro` installation. It is typically slower
    compared to :func:`kern_MKL_sparse` but it depends on your hardware.

    The floating point type selected by ``config['CUDA_precision']`` (32 or 64)
    is used for the device arrays *and* for every scalar factor handed to
    cuBLAS, so a 64 bit run is not silently degraded by single precision
    step sizes or densities.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` as a dense 2-D array
      dec_m (numpy.array): decay  matrix :eq:`dec_matrix` as a dense 2-D array
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs (object): not used by this kernel; kept in the signature so
        that it stays parallel to :func:`kern_CUDA_sparse`
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    Raises:
      Exception: if ``config['CUDA_precision']`` is neither 32 nor 64, or if
        the :mod:`numbapro` CUDA libraries can not be imported
    """

    if config['CUDA_precision'] == 32:
        calc_precision = np.float32
    elif config['CUDA_precision'] == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " +
                        "installed.\nCan not use GPU.")
    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(calc_precision), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(calc_precision), stream)
    cu_curr_phi = cuda.to_device(phi.astype(calc_precision), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)

    # Loop-invariant scalars, created once in the working precision.
    one = calc_precision(1.0)
    zero = calc_precision(0.0)

    # ``range`` instead of ``xrange`` keeps the loop usable on Python 2 and 3.
    for step in range(nsteps):
        if prog_bar:
            prog_bar.update(step)
        # beta = 0: delta_phi is overwritten with the interaction term.
        cubl.gemv(trans='T', m=m, n=n, alpha=one, A=cu_int_m,
                  x=cu_curr_phi, beta=zero, y=cu_delta_phi)
        # beta = 1: the decay term, scaled by 1/rho, is added on top.
        cubl.gemv(trans='T', m=m, n=n, alpha=calc_precision(rho_inv[step]),
                  A=cu_dec_m, x=cu_curr_phi, beta=one, y=cu_delta_phi)
        # Euler step: phi <- phi + dX * delta_phi
        cubl.axpy(alpha=calc_precision(dX[step]), x=cu_delta_phi,
                  y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()
Exemplo n.º 2
0
def induced_velocity3(x, xvort, gam, vel):
    """Accumulate, for one evaluation point per thread, the velocity
    contributions of all vortices.

    For thread index ``i`` (from ``cuda.grid(1)``) and every vortex ``j``::

        dx  = x[i, 0] - xvort[j, 0]
        dy  = x[i, 1] - xvort[j, 1]
        rsq = dx**2 + dy**2 + eps**2
        vel[i, 0] +=  gam[j] * dy / rsq
        vel[i, 1] += -gam[j] * dx / rsq

    ``vel[i, 0]`` and ``vel[i, 1]`` are reset to zero before the summation.
    Threads whose index is not below ``x.shape[0]`` do nothing.

    Args:
      x: array indexed as ``x[i, 0]``, ``x[i, 1]`` (evaluation points)
      xvort: array indexed as ``xvort[j, 0]``, ``xvort[j, 1]`` (vortex positions)
      gam: array indexed as ``gam[j]`` (one weight per vortex)
      vel: output array indexed as ``vel[i, 0]``, ``vel[i, 1]``
    """
    # NOTE: ``eps`` is not defined inside this function; it has to be
    # available in the enclosing module scope.
    tid = cuda.grid(1)
    if tid >= x.shape[0]:
        return

    vel[tid, 0] = float32(0.)
    vel[tid, 1] = float32(0.)
    for k in range(xvort.shape[0]):
        dx = x[tid, 0] - xvort[k, 0]
        dy = x[tid, 1] - xvort[k, 1]
        rsq = dx**2 + dy**2 + eps**2
        vel[tid, 0] += gam[k] * dy / rsq
        vel[tid, 1] += -gam[k] * dx / rsq
from numbapro import cuda, vectorize, float32, void
import numpy
import time

@cuda.jit(void(float32, float32[:], float32[:], float32[:]))
def saxpy(a, x, y, out):
    """CUDA kernel: ``out[i] = a * x[i] + y[i]`` for the calling thread's ``i``.

    Each thread handles the single element addressed by its 1-D grid index;
    threads whose index lies outside ``out`` return without writing.
    """
    # cuda.grid(1) is short for
    # cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    idx = cuda.grid(1)
    if idx >= out.size:
        return
    out[idx] = a * x[idx] + y[idx]


@vectorize([float32(float32, float32, float32)], target='gpu')
def vec_saxpy(a, x, y):
    """Scalar SAXPY ``a * x + y``; ``vectorize`` applies it element-wise."""
    scaled = a * x
    return scaled + y


# Number of elements in each input/output array.
n = 16 * 1024 * 1024

# Host-side inputs: a float32 scalar and two identical float32 ramps.
a = numpy.float32(2.0)
x = numpy.arange(n, dtype=numpy.float32)
y = numpy.arange(n, dtype=numpy.float32)
out = numpy.empty_like(x)

start_time = time.time()

# Launch configuration: enough blocks of `size_block` threads to cover n
# (integer ceiling division).
size_block = 1024
size_grid = (n + size_block - 1) // size_block
saxpy[size_grid, size_block](a, x, y, out)
Exemplo n.º 4
0
from numbapro import vectorize, float32

@vectorize([float32(float32, float32)], target='parallel')
def sum(a, b):
    """Return ``a + b``; ``vectorize`` applies it element-wise.

    NOTE: the name shadows the builtin ``sum``; it is kept because it is
    this function's public name.
    """
    total = a + b
    return total
Exemplo n.º 5
0
    i = cuda.grid(1)
    # Map i to array elements
    if i >= out.size:
        # Out of range?
        return
    # Do actual work
    out[i] = a * x[i] + y[i]


"""
Vectorize turns a scalar function into an
elementwise operation over the input arrays.
"""


@vectorize([float32(float32, float32, float32)], target='gpu')
def vec_saxpy(a, x, y):
    """Task 1: vectorize version of SAXPY.

    Written as a scalar function ``float32(float32 a, float32 x, float32 y)``
    that returns ``a * x + y``.
    """
    scaled = a * x
    return scaled + y


# Host (CPU) side
# ---------------

# Total element count is the product of the two sizes below
# (presumably the block/thread counts of a kernel launch -- confirm
# against the code that uses them).
NUM_THREADS = 32
NUM_BLOCKS = 1
NELEM = NUM_THREADS * NUM_BLOCKS
Exemplo n.º 6
0
def kern_CUDA_sparse(nsteps, dX, rho_inv, int_m, dec_m,
                    phi, grid_idcs, prog_bar=None):
    """`NVIDIA CUDA cuSPARSE <https://developer.nvidia.com/cusparse>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`numbapro` installation.

    The floating point type selected by ``config['CUDA_precision']`` (32 or 64)
    is used for the device arrays *and* for every scalar factor handed to
    cuSPARSE/cuBLAS, so a 64 bit run is not silently degraded by single
    precision step sizes or densities.

    Note:
      Currently some bug in :mod:`numbapro` introduces unnecessary array copies and
      slows down the execution tremendously.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m: interaction matrix :eq:`int_matrix`; must expose the CSR
        attributes ``shape``, ``nnz``, ``data``, ``indptr`` and ``indices``
      dec_m: decay  matrix :eq:`dec_matrix`; must expose the CSR attributes
        ``nnz``, ``data``, ``indptr`` and ``indices``
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs (object): not used by this kernel; kept in the signature so
        that it stays parallel to :func:`kern_CUDA_dense`
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    Raises:
      Exception: if ``config['CUDA_precision']`` is neither 32 nor 64, or if
        the :mod:`numbapro` CUDA libraries can not be imported
    """
    if config['CUDA_precision'] == 32:
        calc_precision = np.float32
    elif config['CUDA_precision'] == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_sparse(): Unknown precision specified.")
    print ("kern_CUDA_sparse(): Warning, the performance is slower than " + 
           "dense cuBLAS or any type of MKL.")
    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib import cusparse  # @UnresolvedImport
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_sparse(): Numbapro CUDA libaries not " + 
                        "installed.\nCan not use GPU.")
    cusp = cusparse.Sparse()
    cubl = Blas()
    m, n = int_m.shape

    # Upload the three CSR component arrays of each matrix separately.
    int_m_nnz = int_m.nnz
    int_m_csrValA = cuda.to_device(int_m.data.astype(calc_precision))
    int_m_csrRowPtrA = cuda.to_device(int_m.indptr)
    int_m_csrColIndA = cuda.to_device(int_m.indices)

    dec_m_nnz = dec_m.nnz
    dec_m_csrValA = cuda.to_device(dec_m.data.astype(calc_precision))
    dec_m_csrRowPtrA = cuda.to_device(dec_m.indptr)
    dec_m_csrColIndA = cuda.to_device(dec_m.indices)

    cu_curr_phi = cuda.to_device(phi.astype(calc_precision))
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)

    descr = cusp.matdescr()
    descr.indexbase = cusparse.CUSPARSE_INDEX_BASE_ZERO

    # Loop-invariant scalars, created once in the working precision.
    one = calc_precision(1.0)
    zero = calc_precision(0.0)

    # ``range`` instead of ``xrange`` keeps the loop usable on Python 2 and 3.
    for step in range(nsteps):
        # The progress bar is only refreshed on every fifth step.
        if prog_bar and (step % 5 == 0):
            prog_bar.update(step)
        # beta = 0: delta_phi is overwritten with the interaction term.
        cusp.csrmv(trans='T', m=m, n=n, nnz=int_m_nnz,
                   descr=descr,
                   alpha=one,
                   csrVal=int_m_csrValA,
                   csrRowPtr=int_m_csrRowPtrA,
                   csrColInd=int_m_csrColIndA,
                   x=cu_curr_phi, beta=zero, y=cu_delta_phi)
        # beta = 1: the decay term, scaled by 1/rho, is added on top.
        cusp.csrmv(trans='T', m=m, n=n, nnz=dec_m_nnz,
                   descr=descr,
                   alpha=calc_precision(rho_inv[step]),
                   csrVal=dec_m_csrValA,
                   csrRowPtr=dec_m_csrRowPtrA,
                   csrColInd=dec_m_csrColIndA,
                   x=cu_curr_phi, beta=one, y=cu_delta_phi)
        # Euler step: phi <- phi + dX * delta_phi
        cubl.axpy(alpha=calc_precision(dX[step]), x=cu_delta_phi,
                  y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()
'''
Demonstrate broadcasting when a scalar is provided as an argument to a 
vectorize function.

Please read NumPy Broadcasting documentation for details about broadcasting:
http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html
'''

from __future__ import print_function
import numpy as np
from numbapro import vectorize, float32

@vectorize([float32(float32, float32, float32)], target='parallel')
def truncate(x, xmin, xmax):
    """Clamp ``x`` to the interval ``[xmin, xmax]``.

    Returns ``xmin`` when ``x < xmin``, otherwise ``xmax`` when ``x > xmax``,
    otherwise ``x`` unchanged.
    """
    if x < xmin:
        return xmin
    if x > xmax:
        return xmax
    return x

def main():
    """Prepare the inputs for the scalar-broadcasting demonstration.

    Builds a float32 array ``x`` with the values 0..99, prints it, and
    defines the float32 scalars ``xmin`` and ``xmax``.

    NOTE(review): only the setup part of this function is visible here; the
    code that the trailing comments describe is not shown.
    """
    x = np.arange(100, dtype=np.float32)
    print('x = %s' % x)
    xmin = np.float32(20)  # as float32 type scalar
    xmax = np.float32(70)  # as float32 type scalar

    # The scalar arguments are broadcast into an array.
    # This process creates arrays with zero strides.
    # The resulting array contains exactly one element even though
    # its shape matches that of `x`.
Exemplo n.º 8
0
'''
Demonstrate broadcasting when a scalar is provided as an argument to a 
vectorize function.

Please read NumPy Broadcasting documentation for details about broadcasting:
http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html
'''

from __future__ import print_function
import numpy as np
from numbapro import vectorize, float32


@vectorize([float32(float32, float32, float32)], target='parallel')
def truncate(x, xmin, xmax):
    """Clamp ``x`` to the interval ``[xmin, xmax]``.

    The lower bound is checked first: a value below ``xmin`` yields ``xmin``,
    a value above ``xmax`` yields ``xmax``, anything else is returned as is.
    """
    if x < xmin:
        return xmin
    if x > xmax:
        return xmax
    return x


def main():
    x = np.arange(100, dtype=np.float32)
    print('x = %s' % x)
    xmin = np.float32(20)  # as float32 type scalar
    xmax = np.float32(70)  # as float32 type scalar

    # The scalar arguments are broadcasted into an array.
    # This process creates arrays of zero strides.