Python Out示例，pycuda.driver.Out Python示例

示例#1

0

显示文件

    def forward1d(sig, fourier_pts, eps=None):
        """Evaluate a 1-D discrete Fourier sum of ``sig`` on the GPU.

        For every output index ``idx`` the kernel accumulates
        ``sum_i exp(-1j * (i - sz // 2) * fourier_pts[idx]) * sig[i]``.

        Args:
            sig: Input signal; converted to ``complex64``.
            fourier_pts: Frequencies to evaluate; converted to ``complex64``.
                NOTE(review): the kernel indexes this with the same index as
                the output, which has ``sig.size`` entries, so it presumably
                must hold at least ``sig.size`` values -- confirm with callers.
            eps: Unused; kept for interface compatibility.

        Returns:
            A tuple ``(res, 0)`` where ``res`` is a ``complex64`` array shaped
            like ``sig``.
        """
        # The kernel works on single-precision complex data only.
        sig = sig.astype(np.complex64)
        fourier_pts = fourier_pts.astype(np.complex64)

        sz = np.uint32(sig.size)
        res = np.zeros_like(sig).astype(np.complex64)
        # Scratch buffer the kernel fills with the centred sample indices.
        grid = np.zeros_like(sig).astype(np.complex64)

        # Nothing to compute, and a zero-sized launch grid is invalid.
        if sig.size == 0:
            return res, 0

        # the kernel
        mod = SourceModule("""
        #include <pycuda-complex.hpp>
        #include <stdio.h>
        __global__ void dft1(pycuda::complex<float> *signal, pycuda::complex<float> *fourier_pts, pycuda::complex<float> *grid,int sz, pycuda::complex<float> *res)
        {
            int idx = (threadIdx.x + blockDim.x * blockIdx.x) + 
            (threadIdx.y + blockDim.y * blockIdx.y) + 
            (threadIdx.z + blockDim.z * blockIdx.z);

            // Threads past the end of the signal have no output slot.
            if (idx >= sz) {
                return;
            }

            // initialize grid
            for (int i=0; i<sz; i++){
                grid[i] = i - (sz/2);
            }
            
            pycuda::complex<float> j(0, -1);
            pycuda::complex<float> tmp(0, 0);
    
            for (int i=0; i<sz; i++){
                tmp += exp(j * grid[i] * fourier_pts[idx]) * signal[i];
            }
            res[idx] = tmp;
        }
        """)

        # One thread per output sample. A 2-D block would only make several
        # threads compute and write the very same output element.
        bdim = (32, 1, 1)

        # Ceil division with integer arithmetic: launch dimensions must be
        # plain integers, and "/" yields a float on Python 3.
        gridm = ((int(sz) + bdim[0] - 1) // bdim[0], 1)

        func = mod.get_function("dft1")
        func(cuda.In(sig),
             cuda.In(fourier_pts),
             cuda.In(grid),
             sz,
             cuda.Out(res),
             block=bdim,
             grid=gridm)
        return res, 0

示例#2

0

显示文件

def gpu_curand_init(seed):
    """(Re)create the module-level random-generator state buffer.

    If a state buffer already exists it is released with
    ``gpu_curand_deinit()`` first. A fresh ``np.intp`` buffer with one slot
    per thread is then handed to the ``_gpu_curand_init`` kernel as an
    output argument, launched as a single block of ``_NUM_THREADS`` threads.

    Args:
        seed: Seed value; passed to the kernel as ``np.int32``.
    """
    global _rng_state

    # Drop any previously initialised state before building a new one.
    if _rng_state is not None:
        gpu_curand_deinit()

    seed_arg = np.int32(seed)
    thread_count = _NUM_THREADS
    _rng_state = np.zeros(thread_count, np.intp)

    launch_config = {"block": (thread_count, 1, 1), "grid": (1, 1)}
    _gpu_curand_init(seed_arg, cuda.Out(_rng_state), **launch_config)

示例#3

0

显示文件

 def run(self, x):
     """Launch ``self.prg`` on ``x`` and return the first ``n_row`` results.

     ``x`` is uploaded to the device, the kernel is launched with one block
     per slice and ``slice_height`` threads per block, and the result is
     copied back into a ``float32`` host buffer.

     NOTE(review): the attribute names suggest a sliced-ELLPACK sparse
     matrix-vector product -- confirm against the kernel source.
     """
     dev_x = drv.to_device(x)

     # One output slot per (slice, row-in-slice) pair; the tail beyond
     # n_row is cut off before returning.
     padded_rows = self.slice_height * self.slice_count
     sell_y = np.empty(padded_rows, dtype=np.float32)

     kernel_args = (
         self.dev_sell_sliceptr,
         self.dev_sell_slicecol,
         self.dev_sell_colidx,
         self.dev_sell_val,
         dev_x,
         np.int32(self.slice_height),
         drv.Out(sell_y),
     )
     self.prg(*kernel_args,
              block=(self.slice_height, 1, 1),
              grid=(self.slice_count, 1))

     return sell_y[:self.n_row]

示例#4

0

显示文件

文件： thread_SMBA_test.py 项目： wardsimon/spinwaves

    def kernel(self):
        """Drain ``self.work_queue`` and launch one texture kernel per item.

        The inputs are unpacked from ``self.born_args``, uploaded to the GPU,
        and ``cuda_texture_func`` is called for every ``qxi`` taken from the
        queue until the queue is empty.

        NOTE(review): this is Python 2 code (print statements, unparenthesised
        tuples in comprehensions). ``texref``, ``cuda_texture_func`` and
        ``ctemp`` are not defined in this block -- confirm they exist at
        module level before relying on this method.
        """
        density, x, y, z, Qx, Qy, Qz, result = self.born_args
        # Lengths of each coordinate array, as 32-bit ints for the kernel.
        nx, ny, nz, nqx, nqy, nqz = [
            numpy.int32(len(v)) for v in x, y, z, Qx, Qy, Qz
        ]
        # Device copies of all input arrays.
        cx, cy, cz, cQx, cQy, cQz, cdensity = [
            gpuarray.to_gpu(v) for v in x, y, z, Qx, Qy, Qz, density
        ]

        # Device buffer sized like one result frame; only the commented-out
        # code path below reads it.
        cframe = cuda.mem_alloc(result[0].nbytes)
        # NOTE(review): this binds the scalar `nx` to the texture, not a
        # matrix -- looks suspicious, confirm the intended argument.
        cuda.matrix_to_texref(nx, texref, order="C")
        texref.set_filter_mode(cuda.filter_mode.LINEAR)
        # Only used by the commented-out launch below.
        n = int(1 * nqy * nqz)
        print 'fn in kernel'
        while True:
            # Non-blocking fetch: stop as soon as the queue has no more work.
            try:
                qxi = numpy.int32(self.work_queue.get(block=False))
            except Queue.Empty:
                break

            print "%d of %d on %d\n" % (qxi, nqx, self.gpu),
            cuda_texture_func(nx,
                              ny,
                              nz,
                              nqx,
                              nqy,
                              nqz,
                              cdensity,
                              cx,
                              cy,
                              cz,
                              cQx,
                              cQy,
                              cQz,
                              qxi,
                              cuda.Out(result),
                              texrefs=[texref])

            #self.cudaBorn(nx,ny,nz,nqx,nqy,nqz,
            #cdensity,cx,cy,cz,cQx,cQy,cQz,qxi,cframe,
            #**cuda_partition(n))
            ## Delay fetching result until the kernel is complete
            #cuda_sync()

            ## Fetch result back to the CPU
            #cuda.memcpy_dtoh(result[qxi], cframe)

            print "%d %s\n" % (qxi, ctemp.get())
        # Drop the references to the device allocations.
        del cx, cy, cz, cQx, cQy, cQz, cdensity, cframe

示例#5

0

显示文件

文件： game_of_life.py 项目： hata-m/cuda_game_of_life

def calc_next_world_gpu(world, next_world):
    """Compute one Game of Life generation on the GPU.

    The board is treated as a torus: neighbours wrap around at the edges.

    Args:
        world: 2-D array holding the current generation (0 = dead,
            1 = alive). The kernel reads it as C ``int`` -- presumably it
            must be a contiguous ``int32`` array; confirm with callers.
        next_world: Array of the same shape, overwritten in place with the
            next generation.
    """
    height, width = world.shape
    # Define the CUDA kernel.
    mod = SourceModule("""
    __global__ void get_next_world(int *world, int *nextWorld, int height, int width){
        int x = threadIdx.x + blockIdx.x * blockDim.x;
        int y = threadIdx.y + blockIdx.y * blockDim.y;

        if (x >= width || y >= height) {
            return;
        }

        const int index = y * width + x;
        const int current_value = world[index];
        int next_value;

        // C's % keeps the sign of the dividend, so (x - 1) % width is -1 at
        // x == 0. Adding the extent first keeps every wrapped index >= 0.
        const int up    = (y - 1 + height) % height;
        const int down  = (y + 1) % height;
        const int left  = (x - 1 + width) % width;
        const int right = (x + 1) % width;

        int numlive = 0;
        numlive += world[up   * width + left ];
        numlive += world[up   * width + x    ];
        numlive += world[up   * width + right];
        numlive += world[y    * width + left ];
        numlive += world[y    * width + right];
        numlive += world[down * width + left ];
        numlive += world[down * width + x    ];
        numlive += world[down * width + right];

        if (current_value == 0 && numlive == 3){
            next_value = 1;
        }else if (current_value == 1 && numlive >= 2 && numlive <= 3){
            next_value = 1;
        }else{
            next_value = 0;
        }
        nextWorld[index] = next_value; 
    }
    """)
    set_next_cell_value_GPU = mod.get_function("get_next_world")
    block = (BLOCKSIZE, BLOCKSIZE, 1)
    # Ceil division so partially filled blocks at the borders are launched.
    grid = ((width + block[0] - 1) // block[0],
            (height + block[1] - 1) // block[1])
    set_next_cell_value_GPU(cuda.In(world),
                            cuda.Out(next_world),
                            numpy.int32(height),
                            numpy.int32(width),
                            block=block,
                            grid=grid)

示例#6

0

显示文件

文件： myCudaModule.py 项目： adwaye/MS

def onestepIteration(dist, timestep, maxit):
    """Run ``maxit`` explicit diffusion steps on a 2-D image via the GPU.

    Before the first step and after every step, each border row/column is
    overwritten with its inner neighbour. Each step calls the external
    ``diffIteration`` kernel on the flattened ``float32`` image.

    Args:
        dist: 2-D input array; it is copied, not modified.
        timestep: Step size, passed to the kernel as ``float32``.
        maxit: Number of iterations.

    Returns:
        The array produced by the last step, reshaped to the input shape.
        With ``maxit == 0`` this is the untouched all-zero output buffer.
    """
    image = 1 * dist  # multiplying by one yields an independent copy
    forme = image.shape
    if len(forme) > 2:
        sys.exit('Only works on gray images')

    n_rows, n_cols = forme[0], forme[1]
    aSize = n_rows * n_cols
    xdim = np.int32(n_rows)
    ydim = np.int32(n_cols)

    def _copy_inner_to_border(arr):
        # Order matters for the corner cells: rows first, then columns.
        arr[0, :] = arr[1, :]
        arr[xdim - 1, :] = arr[xdim - 2, :]
        arr[:, ydim - 1] = arr[:, ydim - 2]
        arr[:, 0] = arr[:, 1]

    _copy_inner_to_border(image)

    # The kernel works on flat float32 buffers.
    image = image.reshape(aSize, order='C').astype(np.float32)
    final = np.zeros(aSize).astype(np.float32)

    # 1024 threads per block, enough blocks to cover every pixel
    # (ceiling of aSize / 1024).
    threads_per_block = 1024
    block_count = (aSize + threads_per_block - 1) // threads_per_block
    launch = {"block": (threads_per_block, 1, 1), "grid": (block_count, 1, 1)}

    for _ in range(maxit):
        diffIteration(drv.In(image),
                      drv.Out(final),
                      ydim,
                      xdim,
                      np.float32(timestep),
                      **launch)
        final = final.reshape(forme, order='C')
        _copy_inner_to_border(final)
        # The result of this step becomes the input of the next one.
        image = final.reshape(aSize, order='C').astype(np.float32)

    return final.reshape(forme, order='C')

示例#7

0

显示文件

文件： display.py 项目： ferryastika/feature-extraction-pycuda

def hitungcuda(a5):
    """Multiply ``a5`` element-wise with ``f5r`` and ``f5i`` on the GPU.

    The products are written into ``r5r`` and ``r5i``. Those four arrays are
    not parameters; they are looked up outside this function.

    NOTE(review): the launch is hard-coded to a 5x5 grid of 68x4 blocks and
    the kernel has no bounds check, so all arrays presumably hold at least
    340 * 20 floats -- confirm.
    """
    from pycuda.compiler import SourceModule

    # Explicit context handling (cuda.init / make_context / pop) is
    # intentionally not done here.
    kernel_source = """
    __global__ void conv5(float *r5r, float *r5i, float *a5, float *f5r, float *f5i)
    {
        const int i = blockDim.x * blockIdx.x + threadIdx.x;
        const int j = blockDim.y * blockIdx.y + threadIdx.y;
        int Idx = i + j * blockDim.x * gridDim.x;
        r5r[Idx] = a5[Idx] * f5r[Idx];
        r5i[Idx] = a5[Idx] * f5i[Idx];
    }
    """
    module = SourceModule(kernel_source)
    conv5 = module.get_function("conv5")

    outputs = (cuda.Out(r5r), cuda.Out(r5i))
    inputs = (cuda.In(a5), cuda.In(f5r), cuda.In(f5i))
    conv5(*(outputs + inputs), block=(68, 4, 1), grid=(5, 5))

示例#8

0

显示文件

    def matmul(self, M, N, timed=False):
        """
        Multiply M by N with the compiled kernel and return the product P.

        When ``timed`` is true the launch is bracketed by two CUDA events and
        the method returns ``(P, milliseconds)`` instead of just ``P``.

        M and N are assumed to be square matrices of the same dimension whose
        dtype is compatible with that of the kernel.
        """
        n = np.shape(M)[0]
        P = np.empty((n, n), dtype=self.dtype)  # uninitialised output buffer

        # One tile_width x tile_width block per output tile, rounded up so
        # partially filled tiles at the border are covered too.
        tile = self.tile_width
        tiles_per_side = int(np.ceil(n / float(tile)))
        launch_config = {
            "block": (tile, tile, 1),
            "grid": (tiles_per_side, tiles_per_side, 1),
        }

        def launch():
            self.kernelfunc(cuda.In(M),
                            cuda.In(N),
                            cuda.Out(P),
                            np.int32(n),
                            **launch_config)

        if not timed:
            launch()
            return P

        start = cuda.Event()
        end = cuda.Event()
        start.record()
        launch()
        end.record()
        end.synchronize()
        milisecs = start.time_till(end)
        return P, milisecs

示例#9

0

显示文件

def calculate_rates(lat: np.ndarray, lon: np.ndarray, pop: np.ndarray,
                    parameters: dict) -> tuple:
    """Compute pairwise distances and rates for all locations on the GPU.

    A CUDA context is created on device 0 for the duration of the call and
    is always popped again, even if kernel lookup, parameter parsing or the
    launch itself raises.

    Args:
        lat: Per-location values passed to the kernel as its first input.
        lon: Per-location values passed to the kernel as its second input.
        pop: Per-location values passed to the kernel as its third input.
            NOTE(review): the kernel source is not visible here; the arrays
            presumably need a dtype matching the kernel signature -- confirm.
        parameters: Optional model parameters ``p0``..``p3``; missing keys
            default to 1.0, 1.0, 1.0 and -2.0 respectively.

    Returns:
        ``(distances, rates)``: two ``float32`` arrays of shape
        ``(count, count)`` where ``count == len(lat)``.
    """
    cuda.init()
    device = cuda.Device(0)
    context = device.make_context()
    # Everything after make_context() sits inside the try block so the
    # context cannot leak when a later step fails.
    try:
        kernel_fn = get_kernel()

        p0 = float(parameters.get("p0", 1.0))
        p1 = float(parameters.get("p1", 1.0))
        p2 = float(parameters.get("p2", 1.0))
        p3 = float(parameters.get("p3", -2.0))

        count = len(lat)
        distances = np.zeros((count, count), dtype=np.float32)
        rates = np.zeros((count, count), dtype=np.float32)

        BLOCK_DIM = 32
        GRID_DIM = (count + BLOCK_DIM - 1) // BLOCK_DIM

        # A zero-sized grid is not a valid launch; with no locations the
        # empty result arrays are already correct.
        if count > 0:
            kernel_fn(np.uint32(count),
                      cuda.In(lat),
                      cuda.In(lon),
                      cuda.In(pop),
                      np.float32(p0),
                      np.float32(p1),
                      np.float32(p2),
                      np.float32(p3),
                      cuda.Out(distances),
                      cuda.Out(rates),
                      block=(BLOCK_DIM, BLOCK_DIM, 1),
                      grid=(GRID_DIM, GRID_DIM))
    finally:
        context.pop()

    return distances, rates

示例#10

0

显示文件

def loopboxes_overlaps(vertex, gt_vertex, sample_range, ignore_union):
    """Compute the overlap matrix between two sets of boxes on the GPU.

    Only the first 8 columns of each input are used and handed to the
    external ``cuda_overlap`` kernel as ``float32``.

    Args:
        vertex: Array of shape ``(K, >=8)``.
        gt_vertex: Array of shape ``(N, >=8)``.
        sample_range: Passed to the kernel as ``float32``.
        ignore_union: Passed to the kernel as ``int32``.

    Returns:
        A ``float64`` array of shape ``(K, N)``.
    """
    K, dim = vertex.shape
    N, dim2 = gt_vertex.shape
    assert dim >= 8
    assert dim2 >= 8
    vertex = vertex[:, :8]
    gt_vertex = gt_vertex[:, :8]
    overlaps = np.zeros((K, N), dtype=np.float32)

    # With no boxes on either side the empty result is already correct, and
    # a grid dimension of zero would be an invalid launch.
    if K == 0 or N == 0:
        return overlaps.astype(np.float64)

    with cuda_context():
        cuda_overlap(cuda.In(vertex.astype(np.float32)),
                     cuda.In(gt_vertex.astype(np.float32)),
                     cuda.Out(overlaps), np.int32(K), np.int32(N), np.int32(8),
                     np.float32(sample_range), np.int32(ignore_union),
                     block=(THREAD_NUM, 1, 1),
                     # Floor division: grid dimensions must be integers and
                     # "/" yields a float on Python 3.
                     grid=(K // THREAD_NUM + 1, N, 1))
    # np.float was only an alias of the builtin float (float64) and has been
    # removed from recent NumPy releases.
    return overlaps.astype(np.float64)

示例#11

0

显示文件

    def test_vector_types(self):
        """Check that a ``float3`` passed by value reaches every thread.

        A single block of 400 threads stores the same ``float3`` into each
        slot of the destination array; afterwards every slot must equal it.
        """
        kernel_source = """
        __global__ void set_them(float3 *dest, float3 x)
        {
          const int i = threadIdx.x;
          dest[i] = x;
        }
        """
        set_them = SourceModule(kernel_source).get_function("set_them")

        a = gpuarray.vec.make_float3(1, 2, 3)
        dest = np.empty((400), gpuarray.vec.float3)
        launch_shape = (400, 1, 1)

        set_them(drv.Out(dest), a, block=launch_shape)

        matches = (dest == a)
        assert matches.all()

示例#12

0

显示文件

def gpuAdd(array1, array2):
    """Add two arrays element-wise on the GPU and return the sum.

    Both inputs are converted to contiguous ``float32`` arrays. The kernel
    uses a global thread index with a bounds check, so arrays of any size
    are handled, not only the first 32 elements.

    Args:
        array1: First operand; its shape determines the result shape.
        array2: Second operand; must have the same number of elements.

    Returns:
        A ``float32`` array shaped like ``array1``.

    Raises:
        ValueError: If the operands differ in element count.
    """
    array1 = np.ascontiguousarray(array1, dtype=np.float32)
    array2 = np.ascontiguousarray(array2, dtype=np.float32)
    if array1.size != array2.size:
        raise ValueError("gpuAdd: operands must have the same number of "
                         "elements (%d != %d)" % (array1.size, array2.size))

    dest = np.zeros(array1.shape, dtype=np.float32)
    count = dest.size
    # A zero-sized grid is not a valid launch.
    if count == 0:
        return dest

    mod = SourceModule("""
    __global__ void add_them(float *dest, float *a, float *b, int n)
    {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        dest[i] = a[i] + b[i];
      }
    }
    """)
    add_them = mod.get_function("add_them")

    # One thread per element, in 1-D blocks; ceil division covers the tail.
    threads = 256
    blocks = (count + threads - 1) // threads
    add_them(drv.Out(dest), drv.In(array1), drv.In(array2), np.int32(count),
             block=(threads, 1, 1), grid=(blocks, 1))
    return dest

示例#13

0

显示文件

def euclidean_dists(a, bs, n):
    """Run the ``parallel_euclidean_dist`` kernel for one point against ``bs``.

    Args:
        a: Array whose first two elements are passed to the kernel.
        bs: Array passed to the kernel unchanged.
        n: Number of results; also the number of threads in the single block.

    Returns:
        A ``float64`` array of length ``n`` filled by the kernel.
    """
    # Zero-initialised output buffer.
    distances = np.zeros(n, dtype=np.float64)

    # Copy a[0], a[1] into a fresh C-ordered array; passing the slice
    # directly triggers an error about ndarray contiguity.
    query_point = a[:2].copy(order='C')

    launch = {"block": (n, 1, 1), "grid": (1, 1)}
    parallel_euclidean_dist(drv.Out(distances),
                            drv.In(query_point),
                            drv.In(bs),
                            **launch)

    return distances

示例#14

0

显示文件

def get_next_state_gpu(state, next_state):
    """Compute one Game of Life generation on the GPU.

    The board is treated as a torus: neighbours wrap around at the edges.

    Args:
        state: 2-D array holding the current generation (0 = dead,
            non-zero = alive). The kernel reads it as C ``int`` --
            presumably a contiguous ``int32`` array; confirm with callers.
        next_state: Array of the same shape, overwritten in place with the
            next generation.
    """
    height, width = state.shape

    mod = SourceModule("""
        __global__ void get_next_state(int *state, int *nextState, int height, int width)
        {
            unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
            unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
            int sum = 0;
            int val, nextVal;

            if (ix >= width || iy >= height) {
                return;
            }

            unsigned int idx = iy * width + ix;
            val = state[idx];

            // With unsigned indices, (ix - 1) wraps to 2^32 - 1 at ix == 0,
            // and that value modulo width is only width - 1 when width is a
            // power of two. Adding the extent before subtracting avoids the
            // underflow for every board size.
            unsigned int left  = (ix + width - 1) % width;
            unsigned int right = (ix + 1) % width;
            unsigned int up    = (iy + height - 1) % height;
            unsigned int down  = (iy + 1) % height;

            sum += state[up   * width + left ];
            sum += state[up   * width + ix   ];
            sum += state[up   * width + right];
            sum += state[iy   * width + left ];
            sum += state[iy   * width + right];
            sum += state[down * width + left ];
            sum += state[down * width + ix   ];
            sum += state[down * width + right];

            if (val == 0 && sum == 3) {
                nextVal = 1;
            }
            else if (val != 0 && (sum >= 2 && sum <= 3)) {
                nextVal = 1;
            }
            else {
                nextVal = 0;
            }
            nextState[idx] = nextVal;
        }
        """)
    kernel_func = mod.get_function("get_next_state")

    blk_dim = (32, 32, 1)
    # Ceil division so partially filled blocks at the borders are launched.
    grid_dim = ((width + blk_dim[0] - 1) // blk_dim[0],
                (height + blk_dim[1] - 1) // blk_dim[1], 1)
    kernel_func(
        drv.In(state), drv.Out(next_state),
        np.int32(height), np.int32(width),
        block=blk_dim, grid=grid_dim)

示例#15

0

显示文件

def lerpImage(imageA, imageB, mu, outfile):
    """Linearly interpolate two images on the GPU and save the result.

    Each output channel is ``a + mu * (b - a)``.

    Args:
        imageA: Path (or file object) of the first image.
        imageB: Path (or file object) of the second image; must have the
            same pixel dimensions as the first.
        mu: Interpolation weight; 0 gives ``imageA``, 1 gives ``imageB``.
        outfile: Destination the RGB result is saved to.

    Raises:
        ValueError: If the two images differ in size.
    """
    # The kernel hard-codes three channels per pixel, so force both inputs
    # to RGB; greyscale, palette or RGBA files would otherwise be misread.
    imA = Image.open(imageA).convert("RGB")
    imB = Image.open(imageB).convert("RGB")
    if imA.size != imB.size:
        raise ValueError("lerpImage: images differ in size: %r vs %r" %
                         (imA.size, imB.size))

    # read pixels as floats
    pxA = np.array(imA).astype(np.float32)
    pxB = np.array(imB).astype(np.float32)

    # the kernel function
    kernel = """
    __global__ void lerpImage(float *lerped, float *a, float *b, float mu, int check){
        int i = (threadIdx.x) + blockDim.x * blockIdx.x;
        if(i < check) {
            lerped[i*3]= a[i*3] + mu * (b[i*3]-a[i*3]);
            lerped[i*3+1]= a[i*3+1] + mu * (b[i*3+1]-a[i*3+1]);
            lerped[i*3+2]= a[i*3+2] + mu * (b[i*3+2]-a[i*3+2]);
        }
    }
    """

    # define block and grid: one thread per pixel
    dim = imA.size[0] * imA.size[1]
    checkSize = np.int32(dim)
    BLOCK_SIZE = 1024
    block = (BLOCK_SIZE, 1, 1)
    grid = (dim // BLOCK_SIZE + 1, 1, 1)

    # Init lerped pixels
    lerpedPx = np.zeros_like(pxA)

    # Compile and get kernel function
    mod = SourceModule(kernel)
    func = mod.get_function("lerpImage")
    func(cuda.Out(lerpedPx),
         cuda.In(pxA),
         cuda.In(pxB),
         np.float32(mu),
         checkSize,
         block=block,
         grid=grid)

    # Convert back to ints and save. Clipping keeps a weight outside [0, 1]
    # from wrapping around in the uint8 conversion.
    lerpedPx = np.clip(lerpedPx, 0, 255).astype(np.uint8)
    imOut = Image.fromarray(lerpedPx, mode="RGB")
    imOut.save(outfile)
    # Parenthesised single-argument form prints the same text on Python 2
    # and Python 3.
    print("Wrote to image file %s" % outfile)

示例#16

0

显示文件

def compute(volume, offset):
    """Build, compile and run the ``compute`` CUDA kernel for a volume.

    The kernel source is assembled from ``kernel.cu``, ``vectors.cu`` and
    ``complex2.cu`` in the ``cuda`` directory next to this file; the
    ``%DEFINES%`` and ``%CUFILES%`` placeholders in ``kernel.cu`` are
    replaced with the generated ``#define`` block and the helper sources.

    Args:
        volume: Sequence ``(width, height, depth)``.
            NOTE(review): the grid uses truncating division by the 32x32
            block size, so width and height presumably must be multiples of
            32 -- confirm.
        offset: Sequence ``(x, y, z)``; its negated components become the
            ``offx``/``offy``/``offz`` defines.

    Returns:
        A flat ``float32`` array with ``width * height * depth`` elements.
    """
    bsize = (32, 32, 1)
    gsize = (int(volume[0] / bsize[0]), int(volume[1] / bsize[1]),
             int(volume[2]))

    defines = [
        ('SCALE', 1),
        ('WIDTH', volume[0]),
        ('HEIGHT', volume[1]),
        ('bwidth', bsize[0]),
        ('bheight', bsize[1]),
        ('offx', -offset[0]),
        ('offy', -offset[1]),
        ('offz', -offset[2]),
        ('DEPTH', volume[2]),
        ('bdepth', 1),  # unused by the kernel
    ]
    DEFINES = ''.join('\n#define ' + name + ' ' + str(value)
                      for name, value in defines) + '\n'

    # Non optimal method
    path = os.path.split(__file__)[0] + '/cuda/'

    def _read_source(filename):
        # "with" closes the file even if reading fails.
        with open(path + filename, 'r') as handle:
            return handle.read()

    kernel_buf = _read_source('kernel.cu')
    complex2_buf = _read_source('complex2.cu')
    vectors_buf = _read_source('vectors.cu')

    # Import cu files inside the kernel and copy the defines
    cu_buffer = vectors_buf + '\n' + complex2_buf
    kernel_buf = kernel_buf.replace('%DEFINES%',
                                    DEFINES).replace('%CUFILES%', cu_buffer)

    mod = SourceModule(kernel_buf,
                       "nvcc",
                       include_dirs=["/usr/local/cuda/include"],
                       no_extern_c=True)
    # Named differently from the enclosing function to avoid shadowing it.
    compute_kernel = mod.get_function("compute")

    # Output array
    dest = np.zeros(volume[0] * volume[1] * volume[2]).astype(np.float32)

    compute_kernel(drv.Out(dest), block=bsize, grid=gsize)
    context.synchronize()

    return dest

示例#17

0

显示文件

def nearest_neighbor(dataset):
    """Run the ``parallel_nn`` kernel on a pair of point sets.

    Args:
        dataset: Pair ``(a, b)`` of 2-D arrays. Row 2 of each is dropped
            before the data is handed to the kernel.

    Returns:
        An ``int32`` array with one entry per column of ``b``, filled by
        the kernel.
    """
    a, b = dataset
    # Number of columns. Floor division keeps this an integer on Python 3,
    # where "/" would produce a float that is invalid both as an array shape
    # and as a block dimension.
    n = int(b.size // len(b))
    as_xy = np.delete(a, 2, 0)
    bs_xy = np.delete(b, 2, 0)

    dest = np.zeros((n, ), dtype=np.int32)

    parallel_nn(drv.Out(dest),
                drv.In(as_xy),
                drv.In(bs_xy),
                np.int32(n),
                block=(n, 1, 1),
                grid=(1, 1))

    return dest

示例#18

0

显示文件

def cuda_add():
    """Launch the ``multiply_them`` kernel on two random 10-element vectors.

    The first vector is printed before the launch and is passed as an
    in/out argument, so whatever the kernel writes to it is copied back.

    Returns:
        The first input vector after the launch.
        NOTE(review): the separate output buffer is filled but not
        returned -- confirm this is intended.
    """
    multiply_them = mod.get_function("multiply_them")

    sample_count = 10
    a = numpy.random.randn(sample_count).astype(numpy.float32)
    b = numpy.random.randn(sample_count).astype(numpy.float32)
    print(a)

    result_buffer = numpy.zeros_like(a)
    scale = 2
    kernel_args = (cuda.Out(result_buffer),
                   cuda.InOut(a),
                   cuda.In(b),
                   numpy.float64(scale))
    multiply_them(*kernel_args, block=(sample_count, 1, 1), grid=(1, 1))
    return a

示例#19

0

显示文件

def overlap_coord_transfer(gt_boxes):
    """Convert boxes to the vertex layout via the ``cuda_transfer`` kernel.

    Args:
        gt_boxes: ``None``, an empty sequence, or a 2-D array of shape
            ``(K, n)``.

    Returns:
        * an empty array when ``gt_boxes`` is ``None``;
        * ``gt_boxes`` itself when it is empty;
        * the first 8 columns when the input already has 8 or more columns;
        * otherwise a ``float64`` array of shape ``(K, 9)`` filled by the
          kernel (slots the kernel leaves untouched stay at -1).
    """
    if gt_boxes is None:
        return np.array([])
    if len(gt_boxes) <= 0:
        return gt_boxes
    K, n = gt_boxes.shape
    if n >= 8:
        return gt_boxes[:, :8]
    vertex = np.full((K, 9), -1.0, dtype=np.float32)

    with cuda_context():
        cuda_transfer(cuda.In(gt_boxes.astype(np.float32)),
                      cuda.Out(vertex),
                      np.int32(n), np.int32(K),
                      block=(THREAD_NUM, 1, 1),
                      # Floor division: grid dimensions must be integers and
                      # "/" yields a float on Python 3.
                      grid=(K // THREAD_NUM + 1, 1))
    # np.float was only an alias of the builtin float (float64) and has been
    # removed from recent NumPy releases.
    return vertex.astype(np.float64)

示例#20

0

显示文件

文件： dipolar.py 项目： chrisjbillington/dipolar_gas

def h_of_p(px, py, E_k_n, U_k, mu, q, g, theta, debug=0):
    """Evaluate ``h(p)`` on the GPU via the ``h_of_p_GPU`` kernel.

    The reduced momentum grids are scaled by ``q`` and uploaded together
    with the remaining array inputs; the scalars are passed as doubles and
    the grid sizes and debug flag as 32-bit ints.

    Returns:
        An ``(N_kx, N_ky)`` array of the default float dtype, filled by the
        kernel.
    """
    scaled_kx = q * reduced_kx
    scaled_ky = q * reduced_ky
    h_p = np.zeros((N_kx, N_ky))

    # 16x16 threads per block; one extra block per axis covers the remainder.
    threads = (16, 16, 1)
    blocks = (int(N_kx/16 + 1), int(N_ky/16 + 1))

    array_inputs = [drv.In(arr) for arr in
                    (px, py, scaled_kx, scaled_ky, E_k_n, U_k)]
    scalar_inputs = [np.double(value) for value in (mu, q, g, theta)]
    int_inputs = [np.int32(value) for value in (N_kx, N_ky, debug)]

    h_of_p_GPU(drv.Out(h_p),
               *(array_inputs + scalar_inputs + int_inputs),
               block=threads, grid=blocks)
    return h_p

示例#21

0

显示文件

    def computeCorrespondence(self):
        """
        Match every point of the current result cloud to a target point.

        The CUDA correspondence kernel fills a host-side target buffer and
        the device-side distance array; the distances are then summed on
        the GPU.

        :return: total distance and a PointCloud with the matched points
        """
        super(ICPParallel, self).computeCorrespondence()

        # One xyz row per source point, filled in by the kernel.
        matched = np.zeros((self.src.num, 3), dtype=np.float32)
        launch = {"block": (self.numCore, 1, 1)}
        self.computeCorrespondenceCuda(cuda.In(self.result.points),
                                       cuda.Out(matched),
                                       self.distances_gpu,
                                       **launch)

        total_distance = gpuarray.sum(self.distances_gpu).get()
        return total_distance, PointCloud(matched)

示例#22

0

显示文件

文件： equalizer.py 项目： vdrg/t2concurrente

def hsv2rgb(img):
    """Convert an image with the ``hsv_rgb`` GPU kernel.

    Args:
        img: Image array whose first two axes are (height, width); it is
            passed to the kernel as ``float32``.

    Returns:
        A ``uint8`` array with the same shape as ``img``.
    """
    hsv2rgb_func = mod.get_function("hsv_rgb")
    width = img.shape[1]
    height = img.shape[0]
    block = (16, 16, 1)
    # Integer ceil division. ceil(width / 16) truncates before rounding on
    # Python 2 and may return a float depending on which ceil() is in
    # scope; grid dimensions must be integers.
    grid = ((width + block[0] - 1) // block[0],
            (height + block[1] - 1) // block[1],
            1)

    result = np.empty_like(img, dtype=np.uint8)
    hsv2rgb_func(cuda.In(img.astype(np.float32)),
                 cuda.Out(result),
                 np.int32(width),
                 np.int32(height),
                 grid=grid,
                 block=block)

    return result

示例#23

0

显示文件

 def _process_total_test(self):
     """Sum a random vector on the CPU and with the ``cuda_total`` kernel.

     Returns:
         ``(sum_cpu, sum_gpu)`` so the caller can compare the two results.
     """
     nElements = np.int32(BLOCK_SIZE * 16 + 10)
     # Integer arithmetic throughout: "/" yields a float on Python 3, which
     # is valid neither as a grid dimension nor as an array shape.
     nBlocks = int(nElements) // BLOCK_SIZE + 1
     grid_dimensions = (nBlocks, 1, 1)
     a = np.random.randn(nElements).astype(np.float32)
     sum_cpu = np.sum(a)
     partialsum_gpu = np.zeros((nBlocks, 1), dtype=np.float32)
     self.cuda_total(cuda_driver.In(a), cuda_driver.Out(partialsum_gpu),
                     np.uint32(nElements), grid=grid_dimensions,
                     block=(BLOCK_SIZE, 1, 1))
     cuda_driver.Context.synchronize()
     # Sum result from GPU
     # Parenthesised single-argument prints behave the same on Python 2
     # and Python 3.
     print(nBlocks)
     print(partialsum_gpu)
     # Only the first ceil(nBlocks / 2) partial sums are added up.
     # NOTE(review): presumably each block reduces two blocks' worth of
     # elements -- confirm against the kernel. The slice bound must be an
     # integer.
     used_blocks = (nBlocks + 1) // 2
     sum_gpu = np.sum(partialsum_gpu[0:used_blocks])
     return sum_cpu, sum_gpu

示例#24

0

显示文件

文件： sparkcudatestmap.py 项目： angeliababy/spark_learn

def gpufunc(xdr_data):
    """Copy a batch of floats through the GPU and return them as an iterator.

    A CUDA context is created on device 0 for the duration of the call and
    is always popped again, even when compilation or the launch fails.

    Args:
        xdr_data: Iterable of numbers; materialised as a ``float32`` array.

    Returns:
        An iterator over the output values, each wrapped in a 0-d array.
    """
    inp = np.asarray(list(xdr_data), dtype=np.float32)
    N = len(inp)
    out = np.zeros(N, dtype=np.float32)

    print(inp, out)
    print("B")

    # A zero-sized grid is not a valid launch; nothing to do for empty input.
    if N == 0:
        return iter([])

    # GPU run
    nTheads = 256 * 4
    nBlocks = (N + nTheads - 1) // nTheads
    drv.init()
    dev = drv.Device(0)
    contx = dev.make_context()
    try:
        # The element count is declared as "int" so that it matches the
        # 4-byte np.int32 passed below; a size_t parameter would be 8 bytes
        # wide on 64-bit platforms.
        mod = SourceModule("""
__global__ void func(float *a, float *b, int N)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= N)
  {
    return;
  }
  a[i] = b[i];
}
""")

        func = mod.get_function("func")

        start = timer()
        func(drv.Out(out),
             drv.In(inp),
             np.int32(N),
             block=(nTheads, 1, 1),
             grid=(nBlocks, 1))
        out1 = [np.asarray(x) for x in out]
    finally:
        contx.pop()
    del contx
    del inp
    run_time = timer() - start
    print("gpu run time %f seconds " % run_time)
    return iter(out1)

示例#25

0

显示文件

文件： mandelbrot2.py 项目： Spaceunit/NLA

def zoom_on_square(eclick, erelease):
    """Zoom the view onto the rectangle dragged between two mouse events.

    ``eclick`` and ``erelease`` are the press and release events. The
    rectangle's centre becomes the new view centre (``x0``, ``y0``), the
    mean of its two side lengths scales the new ``side``, the image ``M``
    is recomputed by the ``mandel`` kernel and the plot is redrawn.
    """
    global n, side, x0, y0, my_obj, M, power

    # Normalise the drag so (x1, y1) is the lower corner and (x2, y2) the
    # upper corner, whichever direction the mouse moved.
    xs = (eclick.xdata, erelease.xdata)
    ys = (eclick.ydata, erelease.ydata)
    x1, y1 = min(xs), min(ys)
    x2, y2 = max(xs), max(ys)

    print(" The button you used were: %s %s" %
          (eclick.button, erelease.button))
    for corner_x, corner_y in ((x1, y1), (x2, y2)):
        print(' Nx=%d, Ny=%d, x0=%f, y0=%f' % (corner_x, corner_y, x0, y0))

    # Map pixel coordinates to plane coordinates using the current view.
    x_1 = x0 + side * (x1 - n / 2.) / n
    y_1 = y0 + side * (y1 - n / 2.) / n
    x_2 = x0 + side * (x2 - n / 2.) / n
    y_2 = y0 + side * (y2 - n / 2.) / n

    # The new centre is the midpoint of the selected rectangle.
    x0 = (x_2 + x_1) / 2.
    y0 = (y_2 + y_1) / 2.

    # Average of the 2 rectangle sides
    side = side * (x2 - x1 + y2 - y1) / n / 2

    kernel_args = (np.float64(x0),
                   np.float64(y0),
                   np.float64(side),
                   np.int32(loops),
                   np.int32(power),
                   drv.Out(M))
    mandel(*kernel_args,
           block=(n_block, n_block, 1),
           grid=(n_grid, n_grid, 1))

    my_obj = plt.imshow(M,
                        origin='lower',
                        cmap=cmaps[i_cmap],
                        aspect='equal')
    my_obj.set_data(M)

    marker = Rectangle((1 - .1, 1 - .1),
                       0.2,
                       0.2,
                       alpha=1,
                       facecolor='none',
                       fill=None)
    ax.add_patch(marker)

    title = 'Side=%.2e, x=%.2e, y=%.2e, %s, Loops=%d' % (
        side, x0, y0, cmaps[i_cmap], loops)
    ax.set_title(title)
    plt.draw()

示例#26

0

显示文件

def generate_hat(num_samples):
    """Sample the ``mexican_hat`` kernel at ``num_samples`` evenly spaced points.

    The samples cover [-4*sigma, 4*sigma) with sigma = 1 and are computed
    by a single block of ``num_samples`` threads.

    Background for the chosen range:
    * the math suggests 16 samples is the width of the QRS complex, and
      measuring the QRS complex for record 9004 gives 16 samples;
    * the correlated peak was measured 7 samples after the start of QRS;
    * Mexican hats seem to hold a non-zero value between -4 and 4 with
      sigma = 1.

    Returns:
        A ``float32`` array of length ``num_samples``.
    """
    sigma = 1.0
    upper = 4 * sigma
    lower = -upper

    hat = numpy.zeros(num_samples, dtype=numpy.float32)
    step = (upper - lower) / num_samples

    launch = {"grid": (1, 1), "block": (num_samples, 1, 1)}
    mexican_hat(cuda.Out(hat),
                numpy.float32(sigma),
                numpy.float32(lower),
                numpy.float32(step),
                **launch)
    return hat

示例#27

0

显示文件

    def test_simple_kernel(self):
        """Check an element-wise product kernel against NumPy.

        Two random 400-element ``float32`` vectors are multiplied by a
        single block of 400 threads; the result must match ``a * b``
        exactly.
        """
        kernel_source = """
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x;
          dest[i] = a[i] * b[i];
        }
        """
        multiply_them = SourceModule(kernel_source).get_function(
            "multiply_them")

        a = np.random.randn(400).astype(np.float32)
        b = np.random.randn(400).astype(np.float32)
        dest = np.zeros_like(a)

        kernel_args = (drv.Out(dest), drv.In(a), drv.In(b))
        multiply_them(*kernel_args, block=(400, 1, 1))

        residual = la.norm(dest - a * b)
        assert residual == 0

示例#28

0

显示文件

    def cuda_interpolate(self, channel, m, size_result):
        cols = size_result[0]
        rows = size_result[1]

        kernel_code = """
		texture<float, 2> tex;
	
		__global__ void interpolation(float *dest, float *m0, float *m1)
		{
			int idx = threadIdx.x + blockDim.x * blockIdx.x;
			int idy = threadIdx.y + blockDim.y * blockIdx.y;
	
			if (( idx < %(NCOLS)s ) && ( idy < %(NDIM)s )) {
				dest[%(NDIM)s * idx + idy] = tex2D(tex, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
			}
		}
		"""

        kernel_code = kernel_code % {'NCOLS': cols, 'NDIM': rows}
        mod = SourceModule(kernel_code)

        interpolation = mod.get_function("interpolation")
        texref = mod.get_texref("tex")

        channel = channel.astype("float32")
        drv.matrix_to_texref(channel, texref, order="F")
        texref.set_filter_mode(drv.filter_mode.LINEAR)

        bdim = (16, 16, 1)
        dx, mx = divmod(cols, bdim[0])
        dy, my = divmod(rows, bdim[1])

        gdim = ((dx + (mx > 0)) * bdim[0], (dy + (my > 0)) * bdim[1])

        dest = np.zeros((rows, cols)).astype("float32")
        m0 = (m[0, :] - 1).astype("float32")
        m1 = (m[1, :] - 1).astype("float32")

        interpolation(drv.Out(dest),
                      drv.In(m0),
                      drv.In(m1),
                      block=bdim,
                      grid=gdim,
                      texrefs=[texref])

        return dest.astype("uint8")

示例#29

0

显示文件

文件： magjoint_poseestimator.py 项目： Roboy/ball_in_socket_estimator

 def minimizeFunc(self, x):
     """Return the summed field error for the orientation ``x``.

     ``x`` is interpreted as xyz Euler angles in degrees. The selected
     sensor positions are rotated and written to ``self.input``, the
     ``interpol`` kernel fills ``self.output``, and the error is the sum of
     the norms of (output - rotated target) over all selected sensors.

     Returns:
         A one-element list holding the accumulated error.
     """
     rotation = R.from_euler('xyz', x, degrees=True)

     # Rotated sensor positions become the kernel input.
     for slot, sensor_id in enumerate(self.selection):
         self.input[slot] = rotation.apply(self.sensor_pos[sensor_id])

     selected_count = len(self.selection)
     self.interpol(np.int32(selected_count),
                   drv.In(self.input),
                   drv.Out(self.output),
                   texrefs=[self.texref],
                   block=self.bdim,
                   grid=self.gdim)

     b_error = 0
     for slot in range(selected_count):
         expected = rotation.apply(self.b_target[slot])
         b_error += np.linalg.norm(self.output[slot] - expected)
     return [b_error]

示例#30

0

显示文件

文件： Final_QuadTempo.py 项目： bshong2850/QuadTree_2D_Smoke_Super_resolution

        def Max(input_data, input_N):
            """Repeatedly halve a square array with ``reduction_max``.

            Each pass launches the kernel with 2x2 blocks and writes an
            array with half the side length; the passes stop as soon as the
            output side length equals ``data_shape_size``.

            Returns:
                The first ``float32`` output whose side length matches
                ``data_shape_size``.
            """
            block_side = 2
            current, side = input_data, input_N
            while True:
                blocks_per_side = int(side / block_side)
                reduced = np.zeros((side // 2, side // 2), np.float32)
                reduction_max(drv.In(current),
                              np.int32(side),
                              drv.Out(reduced),
                              block=(block_side, block_side, 1),
                              grid=(blocks_per_side, blocks_per_side))
                if reduced.shape[0] == data_shape_size:
                    return reduced
                # Feed this pass's output into the next reduction pass.
                current = reduced
                side = reduced.shape[0]