Example #1
# Preamble reconstructed (the excerpt begins mid-kernel; setup inferred from
# the usage below and from the other examples on this page -- the A, B, C
# values and the contents of x are assumptions):
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
from pycuda.compiler import SourceModule
from string import Template

import scikits.cuda.misc as misc

A, B, C = 3, 4, 5  # assumed example dimensions
x = np.asarray(np.random.rand(A, B, C), np.float64)

func_mod_template = Template(
    """
// Macro for converting subscripts to a linear index:
#define INDEX(a, b, c) a*${B}*${C}+b*${C}+c

__global__ void func(double *x, unsigned int N) {
    // Obtain the linear index corresponding to the current thread:
    unsigned int idx = blockIdx.y*${max_threads_per_block}*${max_blocks_per_grid}+
                       blockIdx.x*${max_threads_per_block}+threadIdx.x;

    // Convert the linear index to subscripts:
    unsigned int a = idx/(${B}*${C});
    unsigned int b = (idx%(${B}*${C}))/${C};
    unsigned int c = (idx%(${B}*${C}))%${C};

    // Use the subscripts to access the array:
    if (idx < N) {
        if (b == 0)
           x[INDEX(a,b,c)] = 100;
    }
}
"""
)

max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs(pycuda.autoinit.device)
block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device, x.shape)
max_blocks_per_grid = max(max_grid_dim)

func_mod = SourceModule(
    func_mod_template.substitute(
        max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid, A=A, B=B, C=C
    )
)
func = func_mod.get_function("func")
x_gpu = gpuarray.to_gpu(x)
func(x_gpu.gpudata, np.uint32(x_gpu.size), block=block_dim, grid=grid_dim)
x_np = x.copy()
x_np[:, 0, :] = 100

print "Success status: ", np.allclose(x_np, x_gpu.get())
Example #2
# Setup (imports and the definitions of A, B, C, and x) is the same as in
# Example #1 above; the excerpt begins mid-kernel, reconstructed here:

func_mod_template = Template("""
// Macro for converting subscripts to linear index:
#define INDEX(a, b, c) a*${B}*${C}+b*${C}+c

__global__ void func(double *x, unsigned int N) {
    // Obtain the linear index corresponding to the current thread:
    unsigned int idx = blockIdx.y*${max_threads_per_block}*${max_blocks_per_grid}+
                       blockIdx.x*${max_threads_per_block}+threadIdx.x;

    // Convert the linear index to subscripts:
    unsigned int a = idx/(${B}*${C});
    unsigned int b = (idx%(${B}*${C}))/${C};
    unsigned int c = (idx%(${B}*${C}))%${C};

    // Use the subscripts to access the array:
    if (idx < N) {
        if (b == 0)
           x[INDEX(a,b,c)] = 100;
    }
}
""")

max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs(
    pycuda.autoinit.device)
block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device,
                                                   x.shape)
max_blocks_per_grid = max(max_grid_dim)

func_mod = SourceModule(
    func_mod_template.substitute(max_threads_per_block=max_threads_per_block,
                                 max_blocks_per_grid=max_blocks_per_grid,
                                 A=A, B=B, C=C))
func = func_mod.get_function('func')
x_gpu = gpuarray.to_gpu(x)
func(x_gpu.gpudata, np.uint32(x_gpu.size), block=block_dim, grid=grid_dim)
x_np = x.copy()
x_np[:, 0, :] = 100

print('Success status:', np.allclose(x_np, x_gpu.get()))
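
For context (added; not from the original page): get_dev_attrs queries the
device limits and select_block_grid_sizes picks block/grid tuples that cover
every array element with one thread each. A minimal sketch of the underlying
arithmetic, using a hypothetical cover() helper and assumed limits of 512
threads per block and 65535 blocks per grid dimension (not the library's
exact logic):

import math

def cover(n, max_threads_per_block=512, max_blocks_per_grid=65535):
    # One thread per element; spill into blockIdx.y once one grid row fills.
    blocks_needed = math.ceil(n / max_threads_per_block)
    grid_x = min(blocks_needed, max_blocks_per_grid)
    grid_y = math.ceil(blocks_needed / max_blocks_per_grid)
    return (max_threads_per_block, 1, 1), (grid_x, grid_y)

For a 1000x1000 array this yields a (512, 1, 1) block and a grid wide enough
for all 1954 blocks, which the idx formula in the kernels then linearizes.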
Example #3

import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from string import Template
import numpy as np

from scikits.cuda.misc import get_dev_attrs, select_block_grid_sizes

# Device selected by PyCUDA:
dev = pycuda.autoinit.device

# Allocate input and output arrays:
a = np.asarray(np.random.rand(1000, 1000), np.float32)
b = np.empty_like(a)

# Determine device constraints and block/grid sizes:
max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
block_dim, grid_dim = select_block_grid_sizes(dev, a.shape)
max_blocks_per_grid = max(max_grid_dim)

# Perform element-wise operation on input matrix:
func_mod_template = Template("""
__global__ void func(${float} *a, ${float} *b, unsigned int N) {
    unsigned int idx = blockIdx.y*${max_threads_per_block}*${max_blocks_per_grid}+
                       blockIdx.x*${max_threads_per_block}+threadIdx.x;

    if (idx < N)
        b[idx] = 5*a[idx];
}
""")

func_mod = SourceModule(func_mod_template.substitute(float='float',
                                                     max_threads_per_block=str(max_threads_per_block),