Пример #1
0
    def __init__(self, vol_bnds, voxel_size):

        # Define voxel volume parameters.
        self._vol_bnds = vol_bnds  # 3x2, rows: (x, y, z), columns: (min, max) in world coordinates in meters
        self._voxel_size = voxel_size  # in meters (determines volume discretization and resolution)
        self._trunc_margin = self._voxel_size * 5  # truncation on SDF

        # Adjust volume bounds.
        self._vol_dim = np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) /
                                self._voxel_size).copy(order='C').astype(
                                    int)  # ensure C-order contigous
        self._vol_bnds[:,
                       1] = self._vol_bnds[:,
                                           0] + self._vol_dim * self._voxel_size
        self._vol_origin = self._vol_bnds[:, 0].copy(order='C').astype(
            np.float32)  # ensure C-order contigous
        print("Voxel volume size: {:d} x {:d} x {:d}".format(
            self._vol_dim[0], self._vol_dim[1], self._vol_dim[2]))

        # Initialize pointers to voxel volume in CPU memory.
        self._tsdf_vol_cpu = np.ones(self._vol_dim).astype(np.float32)
        self._weight_vol_cpu = np.zeros(self._vol_dim).astype(
            np.float32
        )  # for computing the cumulative moving average of observations per voxel
        self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)

        # Copy voxel volumes to GPU.
        if TSDF_GPU_MODE:
            self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
            cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
            self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
            cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
            self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
            cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)

            # Cuda kernel function (C++)
            self._cuda_src_mod = SourceModule("""
              __global__ void integrate(float * tsdf_vol,
                                        float * weight_vol,
                                        float * color_vol,
                                        float * vol_dim,
                                        float * vol_origin,
                                        float * cam_intr,
                                        float * cam_pose,
                                        float * other_params,
                                        float * color_im,
                                        float * depth_im) {

                // Get voxel index.
                int gpu_loop_idx = (int) other_params[0];
                int max_threads_per_block = blockDim.x;
                int block_idx = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
                int voxel_idx = gpu_loop_idx * gridDim.x * gridDim.y * gridDim.z * max_threads_per_block + block_idx * max_threads_per_block + threadIdx.x;
                
                int vol_dim_x = (int)vol_dim[0];
                int vol_dim_y = (int)vol_dim[1];
                int vol_dim_z = (int)vol_dim[2];

                if (voxel_idx > vol_dim_x * vol_dim_y * vol_dim_z)
                    return;

                // Get voxel grid coordinates.
                float voxel_x = floorf(((float)voxel_idx) / ((float)(vol_dim_y * vol_dim_z)));
                float voxel_y = floorf(((float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z)) / ((float)vol_dim_z));
                float voxel_z = (float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z - ((int)voxel_y) * vol_dim_z);

                // Voxel grid coordinates to world coordinates.
                float voxel_size = other_params[1];
                float pt_x = vol_origin[0] + voxel_x * voxel_size;
                float pt_y = vol_origin[1] + voxel_y * voxel_size;
                float pt_z = vol_origin[2] + voxel_z * voxel_size;

                // World coordinates to camera coordinates.
                float tmp_pt_x = pt_x - cam_pose[0*4+3];
                float tmp_pt_y = pt_y - cam_pose[1*4+3];
                float tmp_pt_z = pt_z - cam_pose[2*4+3];
                float cam_pt_x = cam_pose[0*4+0] * tmp_pt_x + cam_pose[1*4+0] * tmp_pt_y + cam_pose[2*4+0] * tmp_pt_z;
                float cam_pt_y = cam_pose[0*4+1] * tmp_pt_x + cam_pose[1*4+1] * tmp_pt_y + cam_pose[2*4+1] * tmp_pt_z;
                float cam_pt_z = cam_pose[0*4+2] * tmp_pt_x + cam_pose[1*4+2] * tmp_pt_y + cam_pose[2*4+2] * tmp_pt_z;

                // Camera coordinates to image pixels.
                int pixel_x = (int) roundf(cam_intr[0*3+0] * (cam_pt_x / cam_pt_z) + cam_intr[0*3+2]);
                int pixel_y = (int) roundf(cam_intr[1*3+1] * (cam_pt_y / cam_pt_z) + cam_intr[1*3+2]);

                // Skip if outside view frustum.
                int im_h = (int) other_params[2];
                int im_w = (int) other_params[3];
                if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z < 0)
                    return;

                // Skip invalid depth.
                float depth_value = depth_im[pixel_y*im_w+pixel_x];
                if (depth_value == 0)
                    return;

                // Integrate TSDF.
                float trunc_margin = other_params[4];
                float depth_diff = depth_value-cam_pt_z;
                if (depth_diff < -trunc_margin)
                    return;
                float dist = fmin(1.0f, depth_diff / trunc_margin);
                float w_old = weight_vol[voxel_idx];
                float obs_weight = other_params[5];
                float w_new = w_old + obs_weight;
                weight_vol[voxel_idx] = w_new;
                tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx] * w_old + dist) / w_new;

                // Integrate color.
                float old_color = color_vol[voxel_idx];
                float old_b = floorf(old_color / (256 * 256));
                float old_g = floorf((old_color - old_b * 256 * 256) / 256);
                float old_r = old_color - old_b * 256 * 256 - old_g * 256;
                float new_color = color_im[pixel_y*im_w+pixel_x];
                float new_b = floorf(new_color / (256 * 256));
                float new_g = floorf((new_color - new_b * 256 * 256) / 256);
                float new_r = new_color - new_b * 256 * 256 - new_g * 256;
                new_b = fmin(roundf((old_b*w_old + new_b) / w_new), 255.0f);
                new_g = fmin(roundf((old_g*w_old + new_g) / w_new), 255.0f);
                new_r = fmin(roundf((old_r*w_old + new_r) / w_new), 255.0f);
                color_vol[voxel_idx] = new_b * 256 * 256 + new_g * 256 + new_r;

              }""")

            self._cuda_integrate = self._cuda_src_mod.get_function("integrate")

            # Determine block/grid size on GPU.
            gpu_dev = cuda.Device(0)
            self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
            n_blocks = int(
                np.ceil(
                    float(np.prod(self._vol_dim)) /
                    float(self._max_gpu_threads_per_block)))
            grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X,
                             int(np.floor(np.cbrt(n_blocks))))
            grid_dim_y = min(gpu_dev.MAX_GRID_DIM_Y,
                             int(np.floor(np.sqrt(n_blocks / grid_dim_x))))
            grid_dim_z = min(
                gpu_dev.MAX_GRID_DIM_Z,
                int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))))
            self._max_gpu_grid_dim = np.array(
                [grid_dim_x, grid_dim_y, grid_dim_z]).astype(int)
            self._n_gpu_loops = int(
                np.ceil(
                    float(np.prod(self._vol_dim)) / float(
                        np.prod(self._max_gpu_grid_dim) *
                        self._max_gpu_threads_per_block)))
Пример #2
0
    if ndarray.ndim >= 2:
        cuda.TextureReference.set_address_mode(
            tex_ref, 1, address_mode)
    if ndarray.ndim >= 3:
        cuda.TextureReference.set_address_mode(
            tex_ref, 2,  address_mode)
    cuda.TextureReference.set_filter_mode(
        tex_ref, filter_mode)
    tex_ref.set_flags(tex_ref.get_flags(
    ) & ~cuda.TRSF_NORMALIZED_COORDINATES & ~cuda.TRSF_READ_AS_INTEGER)


with open(os.path.join(os.path.dirname(__file__), 'resize.cu')) as f:
    _read_data = f.read()

_mod = SourceModule(_read_data)
_tex_ref = _mod.get_texref('sourceTex')
_kernel = _mod.get_function('resize')


def resize_gpu(src_vol, dst_vol=None, dst_shape=None, scaling=None):

    if dst_shape is None:
        assert scaling
        dst_shape = [np.int(np.round(i * scaling)) for i in src_vol.shape]

    if dst_vol is None:
        dst_vol = gpuarray.GPUArray(dst_shape, np.float32)

    ndarray_to_float_tex(_tex_ref, src_vol)
Пример #3
0
def elmvis(Xraw,
           A,
           slowdown=10,
           report=5,
           maxtime=24 * 60 * 60,
           tol=0,
           batch=None,
           maxiter=None,
           maxupdate=None,
           maxstall=None,
           cossim=None,
           silent=False):
    """ELMVIS+ function running in GPU memory.
    """
    X = Xraw / np.linalg.norm(Xraw, axis=1)[:,
                                            None]  # unit-length version of X
    Xh = np.dot(A, X)  # X_hat, predicted value of X
    N, d = X.shape
    I = np.arange(N)  # index of samples

    # set default values
    if cossim is None: cossim = np.trace(X.T.dot(A).dot(X)) / N
    if maxiter is None: maxiter = N * N * N
    if maxupdate is None: maxupdate = N * N
    if maxstall is None: maxstall = N * N

    if not silent:
        print "original similarity: ", cossim

    # init GPU
    dt = X.dtype.type
    try:
        linalg.init()
    except ImportError as e:
        print e
    devA = gpuarray.to_gpu(A.astype(dt))
    devX = gpuarray.to_gpu(X.astype(dt))
    devXi1 = gpuarray.empty((d, ), dtype=dt)
    devXh = linalg.dot(devA, devX)
    devAi = gpuarray.empty((N, 2), dtype=dt)
    devDelta = gpuarray.empty((2, d), dtype=dt)
    result = gpuarray.empty((d, ), dtype=dt)

    # swap kernel
    kernel = """
        __global__ void diff(%s *A, %s *Y, %s *AY, %s *result, long d, long N, long i1, long i2) {
            long j = blockDim.x * blockIdx.x + threadIdx.x;
            %s yi1 = Y[i1*d + j];
            %s yi2 = Y[i2*d + j];
            result[j] = (A[i1*N + i1] * (yi2 - yi1) + 2*AY[i1*d + j]) * (yi2 - yi1) +
                        (A[i2*N + i2] * (yi1 - yi2) + 2*(AY[i2*d + j] + A[i2*N + i1]*(yi2 - yi1))) * (yi1 - yi2);
        }
        """
    if dt is np.float64:
        kernel = kernel % ("double", "double", "double", "double", "double",
                           "double")
    else:
        kernel = kernel % ("float", "float", "float", "float", "float",
                           "float")
    mod_diff = SourceModule(kernel)
    dev_diff = mod_diff.get_function("diff")
    dev_diff.prepare("PPPPllll")
    block = result._block
    grid = (int(np.ceil(1.0 * result.shape[0] / block[0])), 1)

    t0 = tlast = time()
    stall = 0
    iters = 0
    updates = 0
    updates_last = 0
    iters_last = 0
    ups_max = 0

    while (iters < maxiter) and (stall < maxstall):
        iters += 1
        stall += 1

        # get two different random numbers
        i1, i2 = np.random.randint(0, N, size=2)
        while i1 == i2:
            i1, i2 = np.random.randint(0, N, size=2)

        dev_diff.prepared_call(grid, block, devA.gpudata, devX.gpudata,
                               devXh.gpudata, result.gpudata, d, N, i1, i2)
        diff = np.sum(result.get())

        if diff > tol:
            stall = 0
            devAi[:, 0] = devA[:, i1]
            devAi[:, 1] = devA[:, i2]
            devDelta[0, :] = devX[i1, :] - devX[i2, :]
            devDelta[1, :] = devX[i2, :] - devX[i1, :]
            linalg.add_dot(devAi, devDelta, devXh, alpha=-1)

            tI = I[i1]
            I[i1] = I[i2]
            I[i2] = tI

            devXi1[:] = devX[i1, :]
            devX[i1] = devX[i2]
            devX[i2] = devXi1

            cossim += diff / N
            updates += 1
            if updates > maxupdate:
                break

        t = time()
        if t - tlast > report:
            ups = (updates - updates_last) * 1.0 / (t - tlast)
            ips = (iters - iters_last) * 1.0 / (t - tlast)
            if not silent:
                print "%d iters | %d updates | %.0f iters/s | %.0f updates/s | cos similarity = %.4f" % (
                    iters, updates, ips, ups, cossim)

            updates_last = updates
            iters_last = iters
            tlast = t
            ups_max = max(ups, ups_max)
            if ups < ups_max / slowdown:
                break

        if t - t0 > maxtime:
            break

    ips = iters * 1.0 / (time() - t0)
    ups = updates * 1.0 / (time() - t0)
    Xraw[:] = Xraw[I]

    cossim = np.trace(X.T.dot(A).dot(X)) / N
    if not silent:
        print "final similarity: ", cossim

    info = {
        'cossim': cossim,
        'iters': iters,
        'updates': updates,
        'ips': ips,
        'ups': ups
    }
    return I, info
Пример #4
0
@author: tristan
'''

import os
import numpy as np
from pycuda.compiler import SourceModule
from gpu.gpuSimulation import useCachedKernels

fluxCode = open(
    os.path.join(os.path.dirname(__file__), './fluxCalculations.cu'), 'r')

try:

    # Put the kernel code into a SourceModule
    if useCachedKernels:
        fluxModule = SourceModule(fluxCode.read())
    else:
        fluxModule = SourceModule(fluxCode.read(), cache_dir=False)
    fluxCode.close()

    # Create reference to the specific functions in the SourceModule
    FluxSolverFn = fluxModule.get_function("FluxSolver")
    BuildRFn = fluxModule.get_function("buildRValues")

    # Create callable functions
    def FluxSolver(FluxesGPU, UIntPtsGPU, BottomIntPtsGPU, propSpeedsGPU, m, n,
                   blockDims, gridDims):

        FluxSolverFn(FluxesGPU,
                     UIntPtsGPU,
                     BottomIntPtsGPU,
Пример #5
0
class ImageFilter:

    src_module = """
    __global__ void grayscale_filter(unsigned char *red,
                                     unsigned char *green,
                                     unsigned char *blue,
                                     unsigned int height,
                                     unsigned int width) {
        unsigned int row = threadIdx.y + blockIdx.y * blockDim.y;
        unsigned int col = threadIdx.x + blockIdx.x * blockDim.x;
        unsigned int index = col + row * width;

        // If boundary hit don't keep going (i.e. grid * block exceed max)
        if ((row > height) || (col > width)) return;

        // luminosity method
        unsigned char intensity = static_cast<unsigned char>(
            red[index] * 0.3 + green[index] * 0.59 + blue[index] * 0.11
        );

        red[index] = intensity;
        green[index] = intensity;
        blue[index] = intensity;
    }
    """ # Could have multiple __global__ kernels in here!

    def __init__(self, image_array, dim_block=32):
        self.module = SourceModule(self.src_module)
        self.image_array = image_array
        self.dim_block = dim_block
        self.block_size = dim_block**2  # square block used i.e. (16, 16, 1)
        self.grid_size = None  # last filter grid used
        # number of rows, number of columns, and pixel vector size - here its 4 for rgba
        self.height, self.width, self.pixel_dimension = self.image_array.shape
        self.image_size = self.height * self.width

    @property
    def grayscale(self):
        """ Convert Image to Grayscale: luminosity of -> (0.3 * R) + (0.59 * G) + (0.11 * B) """

        # Copy dimension as to not write over address for future call.
        red = np.copy(self.image_array[:, :, 0])
        green = np.copy(self.image_array[:, :, 1])
        blue = np.copy(self.image_array[:, :, 2])

        # Adjust grid to specified block size
        dim_grid_x = math.ceil(self.width / self.dim_block)
        dim_grid_y = math.ceil(self.height / self.dim_block)

        # Determine max grid
        max_grid_dim_x = pycuda.autoinit.device.get_attribute(
            cuda.device_attribute.MAX_GRID_DIM_X)
        max_grid_dim_y = pycuda.autoinit.device.get_attribute(
            cuda.device_attribute.MAX_GRID_DIM_Y)

        # If grid determined from block size exceeds max grid, we have issues.
        if (max_grid_dim_x * max_grid_dim_y) < (dim_grid_x * dim_grid_y):
            raise ValueError('ERROR :: Image demensions :: Grid exceeds max')

        # for easy sanity check tracking
        self.grid_size = dim_grid_x * dim_grid_y

        # Call specific function from CUDA kernel
        grayscale_filter = self.module.get_function('grayscale_filter')

        # Use grayscale function is specific grid, block, and array for color channels
        grayscale_filter(cuda.InOut(red),
                         cuda.InOut(green),
                         cuda.InOut(blue),
                         np.uint32(self.height),
                         np.uint32(self.width),
                         block=(self.dim_block, self.dim_block, 1),
                         grid=(dim_grid_x, dim_grid_y))

        # Allocates array and it will not take the time to set the element values.
        grayscale_image_array = np.empty_like(self.image_array.copy())
        grayscale_image_array[:, :, 0] = red
        grayscale_image_array[:, :, 1] = green
        grayscale_image_array[:, :, 2] = blue

        return grayscale_image_array
Пример #6
0
	mcopy = cuda.Memcpy3D()
	mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1]
	mcopy.src_height = mcopy.height = ny
	mcopy.depth = nz

	memcopy(mcopy, set_c(f,(None,-1,-1)), cex_gpu)
	memcopy(mcopy, set_c(f,(-1,None,-1)), cey_gpu)
	memcopy(mcopy, set_c(f,(-1,-1,None)), cez_gpu)
	memcopy(mcopy, set_c(f,(None,0,0)), chx_gpu)
	memcopy(mcopy, set_c(f,(0,None,0)), chy_gpu)
	memcopy(mcopy, set_c(f,(0,0,None)), chz_gpu)

	# prepare kernels
	from pycuda.compiler import SourceModule
	mod = SourceModule(kernels)
	update_e = mod.get_function("update_e")
	update_h = mod.get_function("update_h")
	update_src = mod.get_function("update_src")
	tex = mod.get_texref("tex")
	tey = mod.get_texref("tey")
	tez = mod.get_texref("tez")
	thx = mod.get_texref("thx")
	thy = mod.get_texref("thy")
	thz = mod.get_texref("thz")
	tcex = mod.get_texref("tcex")
	tcey = mod.get_texref("tcey")
	tcez = mod.get_texref("tcez")
	tchx = mod.get_texref("tchx")
	tchy = mod.get_texref("tchy")
	tchz = mod.get_texref("tchz")
Пример #7
0
local_gpu_setup_kernel = pycuda.compiler.SourceModule(
    cuda_full_observables_production.cuda_full_observables_production_code,
    no_extern_c=True).get_function('setup_kernel')
local_rng_states = drv.mem_alloc(
    np.int32(num_blocks * block_dim) * pycuda.characterize.sizeof(
        'curandStateXORWOW', '#include <curand_kernel.h>'))
local_gpu_setup_kernel(np.int32(int(num_blocks * block_dim)),
                       local_rng_states,
                       np.uint64(0),
                       np.uint64(0),
                       grid=(int(num_blocks), 1),
                       block=(int(block_dim), 1, 1))

# get observables function
gpu_observables_func = SourceModule(
    cuda_full_observables_production.cuda_full_observables_production_code,
    no_extern_c=True).get_function(
        'gpu_full_observables_production_with_arrays')

a_selected_sampler = []

for par_name in l_grid_parameters:
    a_selected_sampler.append(a_samples[:, d_parameter_to_index[par_name]])

a_selected_sampler = np.asarray(a_selected_sampler)

print np.mean(a_selected_sampler, axis=1)
print np.cov(a_selected_sampler)

num_bins_s1_th2f = 70
num_bins_s2_th2f = 70
bin_edges_s1_th2 = np.linspace(3, 70, num_bins_s1_th2f + 1)
boundaryConditionsModule = SourceModule("""

    __global__ void applyWallBoundaries(float *meshU, int m, int n)
    {
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        
        if (row < 2 && col < n)
        {
            meshU[row*n*3 + col*3] = meshU[(3-row)*n*3 + col*3];
            meshU[row*n*3 + col*3 + 2] = -meshU[(3-row)*n*3 + col*3 + 2];
        }
        else if (row > m-3 && row < m && col < n)
        {
            meshU[row*n*3 + col*3] = meshU[(2*m-5-row)*n*3 + col*3];
            meshU[row*n*3 + col*3 + 2] = -meshU[(2*m-5-row)*n*3 + col*3 + 2];
        }
        
        if (col < 2 && row < m)
        {
            meshU[row*n*3 + col*3] = meshU[row*n*3 + (3-col)*3];
            meshU[row*n*3 + col*3 + 1] = -meshU[row*n*3 + (3-col)*3 + 1];
        }
        else if (col > n-3 && col < n && row < m)
        {
            meshU[row*n*3 + col*3] = meshU[row*n*3 + (2*n-5-col)*3];
            meshU[row*n*3 + col*3 + 1] = -meshU[row*n*3 + (2*n-5-col)*3 + 1];
        }
    }
    
    __global__ void applyOpenBoundaries(float *meshU, int m, int n)
    {
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        
        if (row < 2 && col < n)
        {
            meshU[row*n*3 + col*3] = meshU[2*n*3 + col*3];
            meshU[row*n*3 + col*3 + 1] = meshU[2*n*3 + col*3 + 1];
            meshU[row*n*3 + col*3 + 2] = meshU[2*n*3 + col*3 + 2];
        }
        else if (row > m-3 && row < m && col < n)
        {
            meshU[row*n*3 + col*3] = meshU[(m-3)*n*3 + col*3];
            meshU[row*n*3 + col*3 + 1] = meshU[(m-3)*n*3 + col*3 + 1];
            meshU[row*n*3 + col*3 + 2] = meshU[(m-3)*n*3 + col*3 + 2];
        }
        
        if (col < 2 && row < m)
        {
            meshU[row*n*3 + col*3] = meshU[row*n*3 + 2*3];
            meshU[row*n*3 + col*3 + 1] = meshU[m*n*3 + 2*3 + 1];
            meshU[row*n*3 + col*3 + 2] = meshU[m*n*3 + 2*3 + 2];
        }
        else if (col > n-3 && col < n && row < m)
        {
            meshU[row*n*3 + col*3] = meshU[row*n*3 + (n-3)*3];
            meshU[row*n*3 + col*3 + 1] = meshU[m*n*3 + (n-3)*3 + 1];
            meshU[row*n*3 + col*3 + 2] = meshU[m*n*3 + (n-3)*3 + 2];
        }
    }

""")
Пример #9
0
def write_results(prediction,
                  confidence,
                  num_classes,
                  cuda_code,
                  nms_conf=0.2):
    #confidence == 0.5
    #num_classes = 80
    #prediction = prediction.numpy()
    #print(type(prediction[0,1,0]))
    #tmp =(prediction[:,:,4] > confidence).float().unsqueeze(2)
    #print((prediction[:,:,4] > confidence).shape)
    #shape threshhold from [1,10647] => [1,10647,1]
    #print(tmp.shape)
    threshhold = (prediction[:, :, 4] > confidence)
    #print(threshhold.shape)
    conf_mask = np.expand_dims(threshhold, axis=2)
    #print(conf_mask.shape)
    prediction = prediction * conf_mask
    #print(prediction)
    box_corner = np.empty_like(prediction)
    box_corner[:, :, 0] = (prediction[:, :, 0] - prediction[:, :, 2] / 2)
    box_corner[:, :, 1] = (prediction[:, :, 1] - prediction[:, :, 3] / 2)
    box_corner[:, :, 2] = (prediction[:, :, 0] + prediction[:, :, 2] / 2)
    box_corner[:, :, 3] = (prediction[:, :, 1] + prediction[:, :, 3] / 2)
    prediction[:, :, :4] = box_corner[:, :, :4]
    print(type(prediction))
    batch_size = prediction.shape[0]

    write = False
    THETA = nms_conf

    for ind in range(batch_size):
        image_pred = prediction[ind]  #image Tensor

        arr = image_pred[:, 5:5 + num_classes]
        #max_conf, max_conf_score = image_pred[:,5:5+num_classes].max(axis= 1)
        max_conf = np.max(arr, axis=1)
        max_conf_score = np.argmax(arr, axis=1)
        #print(max_conf)
        #print(max_conf_score)
        #print(max_conf.shape)
        #print(max_conf_score.shape)
        #break
        max_conf = np.expand_dims(max_conf, axis=1)
        max_conf_score = np.expand_dims(max_conf_score, axis=1)
        max_conf_score = np.asarray(max_conf_score, dtype=np.float32)
        seq = (image_pred[:, :5], max_conf, max_conf_score)
        #print(max_conf)
        #print(max_conf_score)
        #print(max_conf.shape)
        #print(max_conf_score.shape)
        #break
        image_pred = np.concatenate(seq, 1)
        #print(image_pred.shape)
        non_zero_ind = np.nonzero(image_pred[:, 4])
        non_zero_ind = non_zero_ind[0]
        #print(non_zero_ind)
        image_pred_ = image_pred[np.squeeze(non_zero_ind), :]
        try:
            image_pred_ = image_pred[np.squeeze(non_zero_ind), :]
            #print(image_pred_)
        except:
            continue
        #print(image_pred_.shape)
        if image_pred_.shape[0] == 0:
            continue


#

#Get the various classes detected in the image
        img_classes = unique(image_pred_[:,
                                         -1])  # -1 index holds the class index
        #print(img_classes)

        for cls in img_classes:
            #perform NMS

            #get the detections with one particular class
            image_pred_reshaped = np.expand_dims(image_pred_[:, -1] == cls,
                                                 axis=1)
            cls_mask = image_pred_ * image_pred_reshaped
            class_mask_ind = np.nonzero(cls_mask[:, -2])
            class_mask_ind = np.squeeze(class_mask_ind)
            image_pred_class = image_pred_[class_mask_ind]
            print(image_pred_class)

            #sort the detections such that the entry with the maximum objectness
            #confidence is at the top

            conf_sort_index = np.sort(image_pred_class[:, 4])[::-1]
            boxes = image_pred_class[:, 0:4].copy()
            conf_scores = image_pred_class[:, 4].copy()
            #print(conf_sort_index)
            cuda_code = string.Template(cuda_code)
            cuda_code = cuda_code.substitute(THETA=THETA)
            modules = SourceModule(cuda_code)
            # python function will change array's value, so use .copy()
            cuda_start = time.time()
            cuda_results = cuda_nms(modules, boxes, conf_scores, nms_conf)
            cuda_end = time.time()
            print("CUDA results:", cuda_results)
            print("CUDA version takes {} seconds".format(cuda_end -
                                                         cuda_start))
            print(cuda_results)
            index = cuda_results[0]
            image_pred_class = np.expand_dims(image_pred_class[index], axis=0)
            print(image_pred_class)
            batch_ind = np.zeros((image_pred_class.shape[0], 1))
            #print(tmp)
            #batch_ind = tmp.fill_(ind)      #Repeat the batch_id for as many detections of the class cls in the image
            seq = batch_ind, image_pred_class

            if not write:

                output = np.concatenate(seq, 1)
                write = True
            else:
                out = np.concatenate(seq, 1)
                output = np.concatenate([output, out])
    try:
        return output
    except:
        return 0
Пример #10
0
from time import time
# this is a naive parallel prefix-sum kernel that uses shared memory
naive_ker = SourceModule("""
__global__ void naive_prefix(double *vec, double *out)
{
     __shared__ double sum_buf[1024];     
     int tid = threadIdx.x;     
     sum_buf[tid] = vec[tid];
     
     // begin parallel prefix sum algorithm
     
     int iter = 1;
     for (int i=0; i < 10; i++)
     {
         __syncthreads();
         if (tid >= iter )
         {
             sum_buf[tid] = sum_buf[tid] + sum_buf[tid - iter];            
         }
         
         iter *= 2;
     }
         
    __syncthreads();
    out[tid] = sum_buf[tid];
    __syncthreads();
        
}
""")
naive_gpu = naive_ker.get_function("naive_prefix")
mod = SourceModule("""

__device__ void subst(float *a, float *b)
{
     a[0] = b[0];
     a[1] = b[1];
}

__device__ void cadd(float *a, float *b, float *c)
{
    c[0] = a[0] + b[0];
    c[1] = a[1] + b[1];
}

__device__ void cmul(float *a, float *b, float *c)
{
    c[0] = a[0] * b[0] - a[1] * b[1];
    c[1] = a[0] * b[1] + a[1] * b[0];
}

 __global__ void manderblot(float *x, float *y, int *image, int N, int roop, float threshold)
{
    int idx = blockDim.x*blockIdx.x + threadIdx.x;
    float z[2] = {0, 0};
    float _z[2] = {0, 0};
    int i;
    float c[2] = {x[idx % N], y[idx / N]};
    for (i = 0; i < roop; ++i)
    {
        cmul(z,z,_z);
        cadd(_z, c, _z);
        if (_z[0] * _z[0] + _z[1] * _z[1] > threshold * threshold){
            image[idx] = i;
            return;
        }
        subst(z, _z);
    }
    image[idx] = 0;
    return;
}

""")
def _get_lut_bprop_kernel(dtype, deterministic=False):
    """
    Builds the bprop kernel for lookup table layers based on templated code.
    If the deterministic version is requested, an index buffer must be passed
    as an argument. This index buffer re-orders items in the input tensor
    so that word_ids are sorted. This is required since we need to be sure that
    each thread only updates weights for one word id.

    Arguments:
        dtype (np.dtype): The data which the kernel will operate on.
        deterministic (boolean): Builds the deterministic kernel when this is
            set to True.
    """
    if not deterministic:
        code = r"""
__global__ void lut_bprop(
    int* inputs, %(type)s* dW, %(type)s* errors, const int nin,
    const int embedding_dim, const int vocab_size, const int pad_idx)
{
    const int tid  = threadIdx.x;
    const int bid  = blockIdx.x;

    int word_id = inputs[bid];
    int error_row = bid * embedding_dim;
    int output_row = word_id * embedding_dim;

    if(word_id != pad_idx)
    {
        for(int i = tid; i < embedding_dim; i += blockDim.x)
        {
            atomicAdd(&dW[output_row + i], errors[error_row + i]);
        }
    }
}
"""

        code = code % {"type": _ew_types[dtype]["type"]}

        module = SourceModule(code, options=["--use_fast_math"])
        kernel = module.get_function("lut_bprop")
        kernel.prepare("PPPIIIi")
    else:
        code = r"""
__global__ void lut_bprop(
    int* inputs, int* index_buffer, %(type)s* dW, %(type)s* errors,
    const int nin, const int embedding_dim, const int vocab_size,
    const int pad_idx)
{
    const int tid  = threadIdx.x;
    const int bid  = blockIdx.x;

    int index_position = bid;
    int index = index_buffer[index_position];
    int word_id = inputs[index];

    if((bid == 0 || word_id != inputs[index_buffer[bid - 1]]) && word_id != pad_idx)
    {
        int output_row = word_id * embedding_dim;

        do {
            int error_row = index * embedding_dim;

            for(int i = tid; i < embedding_dim; i += blockDim.x)
            {
                dW[output_row + i] += errors[error_row + i];
            }

            index_position++;
            if(index_position == gridDim.x)
            {
                break;
            }
            index = index_buffer[index_position];
        } while(inputs[index] == word_id);
    }
}
"""

        code = code % {"type": _ew_types[dtype]["type"]}

        module = SourceModule(code, options=["--use_fast_math"])
        kernel = module.get_function("lut_bprop")
        kernel.prepare("PPPPIIIi")

    kernel.name = "lut_bprop"
    return kernel
def _get_sorting_kernel(kernel_id, block_size):
    """
    Builds kernels used for sorting inputs. There are several kernels here
    corresponding to the steps in the algorithm. The algorithm works by
    determining the sorted position for each input item. This is done with
    a bucket sort algorithm, where each word_id is a bucket. The first step
    determines the size of each bucket (number of occurences of each word_id).
    Next, a prefix some is computed over the list of bucket sizes to find
    where each bucket will be placed in the output buffer. Finally, each thread
    places it's index into the correct sorted position based on the bucket
    start index (computed from the prefix sum) and that thread's offset into
    the bucket (which is taken from the output of the atomic add done in the
    first step.)

    Arguments:
        kernel_id (Integer): Which step to build the kernel for [0, 4]
        block_size (Integer): Number of threads per block for the prefix sum
            kernels.
    """
    code = r"""
#define THREADS %(threads)s
#define STORE_BLOCKSUM %(store_blocksum)s
__global__ void sort_inputs0(
        int* inputs, int* index_buffer, int* offset_buffer, int* word_counts, const int vocab_size,
        const int input_length)
{
    const int tid = threadIdx.x + (blockDim.x * blockIdx.x);
    int word_id;

    if(tid < input_length)
    {
        word_id = inputs[tid];
        offset_buffer[tid] = atomicAdd(&word_counts[word_id], 1);
    }
}

__device__ void scan(int* buffer, int* blocksum, int global_length)
{
    const int tid = (threadIdx.x << 1) + 1;
    const int gid = ((threadIdx.x + (blockIdx.x * blockDim.x)) << 1) + 1;

    __shared__ int local_counts[THREADS * 2];
    local_counts[tid] = buffer[gid];
    local_counts[tid - 1] = buffer[gid - 1];

    #pragma unroll
    for(int skip = 1; skip <= THREADS; skip <<= 1)
    {
        int mask = (skip << 1) - 1;
        if((tid & mask) == mask)
        {
            local_counts[tid] += local_counts[tid - skip];
        }

        __syncthreads();
    }

    if(tid == (THREADS * 2 - 1))
    {
#if STORE_BLOCKSUM
        blocksum[blockIdx.x] = local_counts[tid];
#endif
        local_counts[tid] = 0;
    }

    #pragma unroll
    for(int skip = THREADS; skip > 0; skip >>= 1)
    {
        int mask = (skip << 1) - 1;
        if((tid & mask) == mask)
        {
            int temp = local_counts[tid - skip];
            local_counts[tid - skip] = local_counts[tid];
            local_counts[tid] += temp;
        }

        __syncthreads();
    }

    if(gid < global_length)
    {
        buffer[gid] = local_counts[tid];
        buffer[gid - 1] = local_counts[tid - 1];
    }
}

__global__ void sort_inputs1(
        int* inputs, int* index_buffer, int* offset_buffer, int* word_counts, const int vocab_size,
        const int input_length)
{
    scan(word_counts, word_counts + vocab_size, vocab_size);
}

__global__ void sort_inputs2(
        int* inputs, int* index_buffer, int* offset_buffer, int* word_counts, const int vocab_size,
        const int input_length)
{
    scan(word_counts + vocab_size, 0, blockDim.x);
}

__global__ void sort_inputs3(
        int* inputs, int* index_buffer, int* offset_buffer, int* word_counts, const int vocab_size,
        const int input_length)
{
    const int gid = (threadIdx.x + (blockIdx.x * blockDim.x)) << 1;

    if(gid < vocab_size)
    {
        word_counts[gid] += word_counts[vocab_size + blockIdx.x];
        word_counts[gid + 1] += word_counts[vocab_size + blockIdx.x];
    }
}

__global__ void sort_inputs4(
        int* inputs, int* index_buffer, int* offset_buffer, int* word_counts, const int vocab_size,
        const int input_length)
{
    const int tid = threadIdx.x + (blockDim.x * blockIdx.x);
    int word_id;

    if(tid < input_length)
    {
        word_id = inputs[tid];
        int sorted_position = word_counts[word_id] + offset_buffer[tid];
        index_buffer[sorted_position] = tid;
    }
}
"""
    code = code % {
        "threads": block_size,
        "store_blocksum": (1 if kernel_id == 1 else 0)
    }
    module = SourceModule(code, options=["--use_fast_math"])

    function_name = "sort_inputs" + str(kernel_id)
    kernel = module.get_function(function_name)
    kernel.prepare("PPPPII")
    kernel.name = "sort_inputs"
    return kernel
Пример #14
0
#===================================================
#===  A = alpha*A + beta*B
#=== input A, B, alpha, beta
#=== output A
#===================================================

mod = SourceModule \
    (
        """
#include<stdio.h>
#include<math.h>
#define INDEX(a, b, yshape) (a)*(yshape) + (b)

__global__ void matrixAddition(float *A,float *B, float alpha, float beta, int ydim)
{
unsigned int idx = threadIdx.x+(blockIdx.x*(blockDim.x*blockDim.y));
  
  unsigned int a      = idx/ydim;
  unsigned int b      = idx%ydim;
  
  A[INDEX(a, b, ydim)] = alpha*A[INDEX(a, b, ydim)]+beta*B[INDEX(a, b, ydim)];

}
"""
    )

matrixAddition = mod.get_function("matrixAddition")


def matAdd(A, B, alpha, beta):
    forme1 = A.shape
Пример #15
0
	cf = np.ones_like(f)*0.5

	ex_gpu = cuda.to_device(f)
	ey_gpu = cuda.to_device(f)
	ez_gpu = cuda.to_device(f)
	hx_gpu = cuda.to_device(f)
	hy_gpu = cuda.to_device(f)
	hz_gpu = cuda.to_device(f)

	cex_gpu = cuda.to_device( set_c(cf,(None,-1,-1)) )
	cey_gpu = cuda.to_device( set_c(cf,(-1,None,-1)) )
	cez_gpu = cuda.to_device( set_c(cf,(-1,-1,None)) )

	# prepare kernels
	from pycuda.compiler import SourceModule
	mod = SourceModule( kernels.replace('TPB',str(tpb)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)) )
	update_h = mod.get_function("update_h")
	update_e = mod.get_function("update_e")
	update_src = mod.get_function("update_src")

	update_h.prepare("PPPPPP", block=(tpb,1,1))
	update_e.prepare("PPPPPPPPP", block=(tpb,1,1))
	update_src.prepare("fP", block=(nz,1,1))

	# prepare for plot
	from matplotlib.pyplot import *
	ion()
	imsh = imshow(np.ones((nx,ny),'f').T, cmap=cm.hot, origin='lower', vmin=0, vmax=0.005)
	colorbar()

	# measure kernel execution time
Пример #16
0
        //d_Result[gmemPos] = 128;
        smemPos += smemStride;
        gmemPos += gmemStride;
    }
}
'''

template = string.Template(template)
code = template.substitute(KERNEL_RADIUS=KERNEL_RADIUS,
                           KERNEL_W=KERNEL_W,
                           COLUMN_TILE_H=COLUMN_TILE_H,
                           COLUMN_TILE_W=COLUMN_TILE_W,
                           ROW_TILE_W=ROW_TILE_W,
                           KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED)

module = SourceModule(code)
convolutionRowGPU = module.get_function('convolutionRowGPU')
convolutionColumnGPU = module.get_function('convolutionColumnGPU')
d_Kernel_rows = module.get_global('d_Kernel_rows')[0]
d_Kernel_columns = module.get_global('d_Kernel_columns')[0]


# Helper functions for computing alignment...
def iDivUp(a, b):
    # Round a / b to nearest higher integer value
    a = numpy.int32(a)
    b = numpy.int32(b)
    return (a / b + 1) if (a % b != 0) else (a / b)


def iDivDown(a, b):
Пример #17
0
Sampler = SourceModule("""
#include <stdio.h>

__device__ int cudarand(long long seed)
{
    if (seed == 0)
    {
        seed = 1;
    }
    long long temp=(48271 * seed + 0) % 2147483647;
    return temp;
}

__global__ void rand_Negative_binomial(float *randomseed, int *target, int * matrix_scale,float *r, float *p)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx<matrix_scale[0])
    {
        int seed = cudarand(randomseed[idx] * 2147483647.0);
        seed=cudarand(seed);
        int current_index = idx/matrix_scale[1];
        int suc = 0.0;
        float fail = 0.0;
        int total_r = r[current_index];
        float prob = p[current_index];
        while(total_r>fail)
        {
            seed = cudarand(seed);
            float temp = seed/2147483647.0;
            if (temp<prob)
            {
                suc++;
            }
            else
            {
                fail++;
            }
        }
        target[idx]=suc;
    }
}

""")
        Ts[i_12]+=An[5]/my_factorial;        
        
        
    }
        


    
}
}
"""
try:
    Context.get_device()
except:
    import pycuda.autoinit
mod = SourceModule(krnl, no_extern_c=True)
_gpu_expm = mod.get_function("expm")


def gpu_expm(As, Ts_vectorized, p=12):
    N = len(As)
    if Ts_vectorized.ndim != 2 or Ts_vectorized.shape[1] != 6:
        raise ValueError(Ts_vectorized.shape)

    threadsPerBlock = 512
    nBlocks = int(np.ceil(float(N) / float(threadsPerBlock)))

    _gpu_expm(As.gpu,
              Ts_vectorized.gpu,
              np.int32(N),
              np.int32(p),
Пример #19
0
import pycuda.gpuarray as gpuarray

s = cuda.Event()
e = cuda.Event()
s.record()

code = """
	__global__ void add_one(int n, int start, float *x)
	{
		int index = start + blockIdx.x * blockDim.x + threadIdx.x;
		int stride = blockDim.x * gridDim.x;
		for (int i = index; i < n; i += stride)
			x[i] += 1.0;
	}
	"""
mod = SourceModule(code)
add_one = mod.get_function("add_one")

N = np.int32(1e8)
nStreams = 2
streamSize = np.int32(N / nStreams)

x = np.ones(N, dtype=np.float32)

x_gpu = gpuarray.empty(N, dtype=np.float32)

#cuda.memcpy_htod(x_gpu, x)

stream = []
for i in range(nStreams):
    stream.append(cuda.Stream())
Пример #20
0
def build_sparse_transition_model_at_T(T, T_gpu, vel_data_gpu, params, bDimx, params_gpu, xs_gpu, ys_gpu, ac_angles,
                                       results, sumR_sa, save_file_for_each_a=False):
    gsize = int(params[0])
    num_actions = int(params[1])
    nrzns = int(params[2])

    all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu, all_Yi_gpu = vel_data_gpu

    results_gpu_list = []
    sumR_sa_gpu_list = []
    for i in range(num_actions):
        results_gpu_list.append(cuda.mem_alloc(results.nbytes))
        sumR_sa_gpu_list.append(cuda.mem_alloc(sumR_sa.nbytes))
    for i in range(num_actions):
        cuda.memcpy_htod(results_gpu_list[i], results)
        cuda.memcpy_htod(sumR_sa_gpu_list[i], sumR_sa)

    print("alloted mem in inner func")


    # let one thread access a state centre. access coresponding velocities, run all actions
    # TODO: dt may not be int for a genral purpose code

    mod = SourceModule("""
    __device__ int32_t get_thread_idx()
            // assigns idx to thread with which it accesses the flattened 3d vxrzns matrix
            // for a given T and a given action. 
            // runs for both 2d and 3d grid
            // TODO: may have to change this considering cache locality
        {
            // here i, j, k refer to a general matrix M[i][j][k]
            int32_t i = threadIdx.x;
            int32_t j = blockIdx.y;
            int32_t k = blockIdx.x;
            int32_t idx = k + (j*gridDim.x)  + (i*gridDim.x*gridDim.y)+ blockIdx.z*blockDim.x*gridDim.x*gridDim.y;
            return idx;
        }
    __device__ int32_t state1D_from_thread(int32_t T)
    {   
        // j ~ blockIdx.x
        // i ~ blockIdx.y 
        // The above three consitute a spatial state index from i and j of grid
        // last term is for including time index as well.
        return (blockIdx.x + (blockIdx.y*gridDim.x) + (T*gridDim.x*gridDim.y) ); 
    }
    __device__ int32_t state1D_from_ij(int32_t*  posid, int32_t T)
    {
        // posid = {i , j}
        // state id = j + i*dim(i) + T*dim(i)*dim(j)
        return (posid[1] + posid[0]*gridDim.x + (T*gridDim.x*gridDim.y) ) ; 
    }
    __device__ bool is_edge_state(int32_t i, int32_t j)
    {
        // n = gsize -1 that is the last index of the domain assuming square domain
        int32_t n = gridDim.x - 1;
        if (i == 0 || i == n || j == 0 || j == n ) 
            {
                return true;
            }
        else return false;
    }
    __device__ bool is_terminal(int32_t i, int32_t j, float* params)
    {
        int32_t i_term = params[8];         // terminal state indices
        int32_t j_term = params[9];
        if(i == i_term && j == j_term)
        {
            return true;
        }
        else return false;
    }
    __device__ bool my_isnan(int s)
    {
    // By IEEE 754 rule, NaN is not equal to NaN
    return s != s;
    }
    __device__ void get_xypos_from_ij(int32_t i, int32_t j, float* xs, float* ys, float* x, float* y)
    {
        *x = xs[j];
        *y = ys[gridDim.x - 1 - i];
        return;
    }
    __device__ float get_angle_in_0_2pi(float theta)
    {
        float f_pi = 3.141592;
        if (theta < 0)
        {
            return theta + (2*f_pi);
        }
        else
        {
            return theta;
        }  
    }
    __device__ float calculate_reward_const_dt(float* xs, float* ys, int32_t i_old, int32_t j_old, float xold, float yold, int32_t* newposids, float* params, float vnet_x, float vnet_y )
    {
        // xold and yold are centre of old state (i_old, j_old)
        float dt = params[4];
        float r1, r2, theta1, theta2, theta, h;
        float dt_new;
        float xnew, ynew;
        if (newposids[0] == i_old && newposids[1] == j_old)
        {
            dt_new = dt;
        }
        else
        {
            get_xypos_from_ij(newposids[0], newposids[1], xs, ys, &xnew, &ynew); //get centre of new states
            h = sqrtf((xnew - xold)*(xnew - xold) + (ynew - yold)*(ynew - yold));
            r1 = h/(sqrtf((vnet_x*vnet_x) + (vnet_y*vnet_y)));
            theta1 = get_angle_in_0_2pi(atan2f(vnet_y, vnet_x));
            theta2 = get_angle_in_0_2pi(atan2f(ynew - yold, xnew - xold));
            theta = fabsf(theta1 -theta2);
            r2 = fabsf(sinf(theta));
            dt_new = r1 + r2;
            if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
            {
                params[24] = r1;
                params[25] = r2;
            }
        }
        return -dt_new;
    }
    __device__ void move(float ac_angle, float vx, float vy, float* xs, float* ys, int32_t* posids, float* params, float* r )
    {
            int32_t n = params[0] - 1;      // gsize - 1
            // int32_t num_actions = params[1];
            // int32_t nrzns = params[2];
            float F = params[3];
            float dt = params[4];
            float r_outbound = params[5];
            float r_terminal = params[6];
            float Dj = fabsf(xs[1] - xs[0]);
            float Di = fabsf(ys[1] - ys[0]);
            float r_step = 0;
            *r = 0;
            int32_t i0 = posids[0];
            int32_t j0 = posids[1];
            float vnetx = F*cosf(ac_angle) + vx;
            float vnety = F*sinf(ac_angle) + vy;
            float x, y;
            get_xypos_from_ij(i0, j0, xs, ys, &x, &y); // x, y stores centre coords of state i0,j0
            float xnew = x + (vnetx * dt);
            float ynew = y + (vnety * dt);
            //checks TODO: remove checks once verified
            if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
            {
                params[12] = x;
                params[13] = y;
                params[14] = vnetx;
                params[15] = vnety;
                params[16] = xnew;
                params[17] = ynew;
                params[18] = ac_angle;
            }
            if (xnew > xs[n])
                {
                    xnew = xs[n];
                    *r += r_outbound;
                }
            else if (xnew < xs[0])
                {
                    xnew = xs[0];
                    *r += r_outbound;
                }
            if (ynew > ys[n])
                {
                    ynew =  ys[n];
                    *r += r_outbound;
                }
            else if (ynew < ys[0])
                {
                    ynew =  ys[0];
                    *r += r_outbound;
                }
            // TODO:xxDONE check logic wrt remainderf. remquof had issue
            int32_t xind, yind;
            //float remx = remquof((xnew - xs[0]), Dj, &xind);
            //float remy = remquof(-(ynew - ys[n]), Di, &yind);
            float remx = remainderf((xnew - xs[0]), Dj);
            float remy = remainderf(-(ynew - ys[n]), Di);
            xind = ((xnew - xs[0]) - remx)/Dj;
            yind = (-(ynew - ys[n]) - remy)/Di;
            if ((remx >= 0.5 * Dj) && (remy >= 0.5 * Di))
                {
                    xind += 1;
                    yind += 1;
                }
            else if ((remx >= 0.5 * Dj && remy < 0.5 * Di))
                {
                    xind += 1;
                }
            else if ((remx < 0.5 * Dj && remy >= 0.5 * Di))
                {
                    yind += 1;
                }
            if (!(my_isnan(xind) || my_isnan(yind)))
                {
                    posids[0] = yind;
                    posids[1] = xind;
                    if (is_edge_state(posids[0], posids[1]))     //line 110
                        {
                            *r += r_outbound;
                        }
                    
                    if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
                    {
                        params[26] = 9999;
                    }
                }
            r_step = calculate_reward_const_dt(xs, ys, i0, j0, x, y, posids, params, vnetx, vnety);
            //TODO: change back to normal when needed
            //r_step = -dt;
            *r += r_step; //TODO: numerical check remaining
            if (is_terminal(posids[0], posids[1], params))
                {
                    *r += r_terminal;
                }
            
            if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1)
            {
                params[19] = xnew;
                params[20] = ynew;
                params[21] = yind;
                params[22] = xind;
                params[23] = *r;
                //params[17] = ynew;
                //params[18] = ac_angle;
            }
    }


    __device__ void extract_velocity(float* vx, float* vy, int32_t T, float* all_u_mat, float* all_v_mat, float* all_ui_mat, float* all_vi_mat, float* all_Yi, float* params)
    {
        int32_t nrzns = params[2];
        int32_t nmodes = params[7];              
        
        int32_t sp_uvi, str_uvi, sp_Yi, str_Yi; //startpoints and strides for accessing all_ui_mat, all_vi_mat and all_Yi
        float sum_x = 0;
        float sum_y = 0;
        float vx_mean, vy_mean;

        //thread index. also used to access resultant vxrzns[nrzns, gsize, gsize]
        int32_t idx = get_thread_idx();

        //rzn index to identify which of the 5k rzn it is. used to access all_Yi.
        int32_t rzn_id = (blockIdx.z * blockDim.x)  + threadIdx.x ;

        //mean_id is the index used to access the flattened all_u_mat[t,i,j].
        int32_t mean_id = state1D_from_thread(T);

        //to access all_ui_mat and all_vi_mat
        str_uvi = gridDim.x * gridDim.y;
        sp_uvi = (T * nmodes * str_uvi) + (gridDim.x * blockIdx.y) + (blockIdx.x);

        // to access all_Yi
        sp_Yi = (T * nrzns * nmodes) + (rzn_id * nmodes);

        vx_mean = all_u_mat[mean_id];
        for(int i = 0; i < nmodes; i++)
        {
            sum_x += all_ui_mat[sp_uvi + (i*str_uvi)]*all_Yi[sp_Yi + i];
        }

        vy_mean = all_v_mat[mean_id];
        for(int i = 0; i < nmodes; i++)
        {
            sum_y += all_vi_mat[sp_uvi + (i*str_uvi)]*all_Yi[sp_Yi + i];
        }
        
        *vx = vx_mean + sum_x;
        *vy = vy_mean + sum_y;
     
        return;
    }


    //test: changer from float* to float ac_angle
    __global__ void transition_calc(float* T_arr, float* all_u_mat, float* all_v_mat, float* all_ui_mat, float* all_vi_mat, float* all_Yi,
                                    float ac_angle, float* xs, float* ys, float* params, float* sumR_sa, float* results)
                                            // resutls directions- 1: along S2;  2: along S1;    3: along columns towards count
    {
        int32_t gsize = params[0];          // size of grid along 1 direction. ASSUMING square grid.
        int32_t num_actions = params[1];    
        int32_t nrzns = params[2];
        float F = params[3];
        float dt = params[4];
        float r_outbound = params[5];
        float r_terminal = params[6];
        int32_t nmodes = params[7];              
        int32_t i_term = params[8];         // terminal state indices
        int32_t j_term = params[9];
        int32_t nT = params[10];
        int32_t is_stationary = params[11];
        int32_t T = (int32_t)T_arr[0];
        int32_t idx = get_thread_idx();

        float vx, vy;

        if(idx < gridDim.x*gridDim.y*nrzns)
        {
            int32_t posids[2] = {blockIdx.y, blockIdx.x};    //static declaration of array of size 2 to hold i and j values of S1. 
            int32_t sp_id;      //sp_id is space_id. S1%(gsize*gsize)

            //  Afer move() these will be overwritten by i and j values of S2
            float r;              // to store immediate reward

            extract_velocity(&vx, &vy, T, all_u_mat, all_v_mat, all_ui_mat, all_vi_mat, all_Yi, params);

            //move(*ac_angle, vx, vy, xs, ys, posids, params, &r);
            move(ac_angle, vx, vy, xs, ys, posids, params, &r);
            int32_t S1, S2;
            if (is_stationary == 1)
            {
                T = 0;
                S1 = state1D_from_thread(T);     //get init state number corresponding to thread id
                S2 = state1D_from_ij(posids, T);   //get successor state number corresponding to posid and next timestep T+1        
            }
            else
            {
                S1 = state1D_from_thread(T);     //get init state number corresponding to thread id
                S2 = state1D_from_ij(posids, T+1);   //get successor state number corresponding to posid and next timestep T+1        
                sp_id = S1%(gsize*gsize);
            }
            //writing to sumR_sa. this array will later be divided by num_rzns, to get the avg
            float a = atomicAdd(&sumR_sa[sp_id], r); //TODO: try reduction if this is slow overall
            results[idx] = S2;
            __syncthreads();
            /*if (threadIdx.x == 0 && blockIdx.z == 0)
            {
                sumR_sa[S1] = sumR_sa[S1]/nrzns;    //TODO: change name to R_sa from sumR_sa since were not storing sum anymore
            }
           */
        }//if ends
        return;
    }
        """)

    # sumR_sa2 = np.empty_like(sumR_sa, dtype = np.float32)
    # cuda.memcpy_dtoh(sumR_sa2, sumR_sa_gpu)
    # print("sumR_sa",sumR_sa)
    # print("sumR_sa",sumR_sa2[0:10001])
    # T = np.array(T64, dtype = np.float32)
    params2 = np.empty_like(params).astype(np.float32)
    func = mod.get_function("transition_calc")
    for i in range(num_actions):
        print('T', T, " call kernel for action: ",i)
        func(T_gpu, all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu, all_Yi_gpu, ac_angles[i], xs_gpu, ys_gpu, params_gpu, sumR_sa_gpu_list[i], results_gpu_list[i],
             block=(bDimx, 1, 1), grid=(gsize, gsize, (nrzns // bDimx) + 1))
        if i == 0:
            cuda.memcpy_dtoh(params2, params_gpu)
            print("params check:",)
            print(  '\nangle= ', params2[18],
                    '\nx =' ,params2[12],
                '\ny =' ,params2[13] ,
                    '\nvnetx = ',params2[14],
                    '\nvnety =', params2[15],
                    '\nxnew =', params2[16],
                    '\nynew =', params2[17],
                    '\nxnewupd =', params2[19],
                    '\nynewupd =', params2[20],
                    '\nyind i=', params2[21],
                    '\nxind j=', params2[22],
                    '\nr- =', params2[23],
                    '\nr1+ =', params2[24],
                    '\nr2+ =', params2[25],
                    '\nenter_isnan =', params2[26]
                )

    results2_list = []
    sum_Rsa2_list = []
    for i in range(num_actions):
        results2_list.append(np.empty_like(results))
        sum_Rsa2_list.append(np.empty_like(sumR_sa))

    # SYNCHRONISATION - pycuda does it implicitly.

    for i in range(num_actions):
        cuda.memcpy_dtoh(results2_list[i], results_gpu_list[i])
        cuda.memcpy_dtoh(sum_Rsa2_list[i], sumR_sa_gpu_list[i])
        print("memcpy_dtoh for action: ", i)


    for i in range(num_actions):
        sum_Rsa2_list[i] = sum_Rsa2_list[i] / nrzns

    # print("sumR_sa2\n",sumR_sa2,"\n\n")

    # print("results_a0\n",results2_list[0].T[50::int(gsize**2)])
    print("OK REACHED END OF cuda relevant CODE\n")

    # make a list of inputs, each elelment for an action. and run parallal get_coo_ for each action
    # if save_file_for_each_a is true then each file must be named appopriately.
    if save_file_for_each_a == True:
        f1 = 'COO_Highway2D_T' + str(T) + '_a'
        f3 = '_of_' + str(num_actions) + 'A.npy'
        inputs = [(results2_list[i], nrzns, T, f1 + str(i) + f3) for i in range(num_actions)]
    else:
        inputs = [(results2_list[i], nrzns, T, None) for i in range(num_actions)]

    # coo_list_a is a list of coo for each each action for the given timestep.
    with Pool(num_actions) as p:
        coo_list_a = p.starmap(get_COO_, inputs)
    # print("coo print\n", coo.T[4880:4900, :])
    print("\n\n")
    # print("time taken by cuda compute and transfer\n", (t2 - t1) / 60)
    # print("time taken for post processing to coo on cpu\n",(t3 - t2) / 60)

    return coo_list_a, sum_Rsa2_list
    ex_gpu = cuda.to_device(f)
    ey_gpu = cuda.to_device(f)
    ez_gpu = cuda.to_device(f)
    hx_gpu = cuda.to_device(f)
    hy_gpu = cuda.to_device(f)
    hz_gpu = cuda.to_device(f)

    cex_gpu = cuda.to_device(set_c(cf, (None, -1, -1)))
    cey_gpu = cuda.to_device(set_c(cf, (-1, None, -1)))
    cez_gpu = cuda.to_device(set_c(cf, (-1, -1, None)))

    # prepare kernels
    from pycuda.compiler import SourceModule
    mod = SourceModule(
        kernels.replace('TPB', str(tpb)).replace('nyz', str(ny * nz)).replace(
            'nx', str(nx)).replace('ny', str(ny)).replace('nz', str(nz)))
    update_h = mod.get_function("update_h")
    update_e = mod.get_function("update_e")
    update_src = mod.get_function("update_src")
    thx = mod.get_texref("thx")
    thy = mod.get_texref("thy")
    thz = mod.get_texref("thz")
    tcex = mod.get_texref("tcex")
    tcey = mod.get_texref("tcey")
    tcez = mod.get_texref("tcez")

    thx.set_address(hx_gpu, f.nbytes)
    thy.set_address(hy_gpu, f.nbytes)
    thz.set_address(hz_gpu, f.nbytes)
    tcex.set_address(cex_gpu, cf.nbytes)
Пример #22
0
mod = SourceModule("""
    #include <math.h>

    // Updates matrix A by adding a scalar c multiplied by another matrix B (i.e. A -= c * B)
    // Only operates on the rows specified by batch: A[batch_j] -= lr * B[batch_i]
    // batch is assumed to 1 x p, A and B have q columns, c is a scalar
    __global__ void BatchMatSubtractInplaceKernel(const int p, const int q, const float c, float *A, 
    const float *B, const int *batch) {
      int batch_index = blockIdx.y * blockDim.y + threadIdx.y;
      if (batch_index >= p) return;
      int row = batch[batch_index];
      int col = blockIdx.x * blockDim.x + threadIdx.x;
      if (col >= q) return;

      atomicAdd(&A[row * q + col], -c * B[row * q + col]);
    }

    // Perform the update step for b/b_tilde using gradient descent
    // a[batch] -= lr * b[batch]
    __global__ void BatchVecSubtractInplaceKernel(const int p, const float lr, float *a, const float *b, const int *batch) {
      int batch_index = blockIdx.x * blockDim.x + threadIdx.x;
      if (batch_index >= p) return;

      int ind = batch[batch_index];
      atomicAdd(&a[ind], -lr * b[ind]);
    }

    // For matrix A and vector b, multiply the i'th row of A by b[i] and store the result into the j'th row of C       
    // Performs this operations only for the corresponding arrays of integers batch_i, batch_j
    // batch is assumed to be 1 x p, A has q columns
    __global__ void BatchMatVecRowMultKernel(const int p, const int q, const float *A, const float *b, float *C, 
    const int *batch_i, const int *batch_j){           
      int batch_index = blockIdx.y * blockDim.y + threadIdx.y;
      if (batch_index >= p) return;
      int col = blockIdx.x * blockDim.x + threadIdx.x;
      if (col >= q) return;                              
      int row_A = batch_i[batch_index];
      int row_C = batch_j[batch_index];                                                  
                                                                                                                       
      C[row_C * q + col] = A[row_A * q + col] * b[batch_index];                                                                    
    }          

    // Copies the values from vector a to b, operating only on item indices specified by batch
    // batch is assumed to be 1 x p
    __global__ void BatchCopyVectorKernel(const int p, const float *a, float *b, const int *batch) {
      int batch_index = blockIdx.x * blockDim.x + threadIdx.x;
      if (batch_index >= p) return;

      int ind = batch[batch_index];
      b[ind] = a[ind];
    }

    // result = np.array([W[i].dot(W_tilde[j]) for i, j in batch])
    // result must start off as a zero array
    __global__ void BatchMatColDotKernel(const int p, const int q, const float *W, const float *W_tilde, 
    const int *batch_i, const int *batch_j, float *result) {
      int batch_index = blockIdx.y * blockDim.y + threadIdx.y;
      if (batch_index >= p) return;
      int col = blockIdx.x * blockDim.x + threadIdx.x;
      if (col >= q) return; 

      int row_W = batch_i[batch_index];
      int row_W_tilde = batch_j[batch_index];

      atomicAdd(&result[batch_index], W[row_W * q + col] * W_tilde[row_W_tilde * q + col]);
    }
    """)
Пример #23
0
    def ping_array(self):
        return self._ping_cuda.array(0,0)

    @property
    def pong_array(self):
        return self._pong_cuda.array(0,0)

Density = Slab(GridWidth, GridHeight, 1, gl.GL_LINEAR)

mod = SourceModule("""
surface<void, 2> surf;
__global__ void kernel(int width, int height)
{
    // Calculate surface coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < 400 && y < 400) {
        float data = x / 400.f;
        // Write to output surface
        surf2Dwrite(data, surf, x*4, y);
    }
}
""")

kernel_function = mod.get_function('kernel')
surface_ref = mod.get_surfref('surf')
# surface_ref.set_array(Density.ping_array,0)
surface_ref.set_array(Density.ping_array)

def Program(fragment):
    program = gloo.Program("vertex_passthrough.vert", fragment, count=4)
    program['Position'] = [(-1,-1), (-1,+1), (+1,-1), (+1,+1)]
def CudaColor(inPath, outPath):

    totalT0 = time.clock()

    im = Image.open(inPath)
    px = numpy.array(im)
    px = px.astype(numpy.float32)

    getAndConvertT1 = time.clock()

    allocT0 = time.clock()
    d_px = cuda.mem_alloc(px.nbytes)
    cuda.memcpy_htod(d_px, px)

    allocT1 = time.clock()

    #Kernel declaration
    kernelT0 = time.clock()

    #Kernel grid and block size
    BLOCK_SIZE = 1024
    block = (1024, 1, 1)
    checkSize = numpy.int32(im.size[0] * im.size[1])
    grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1)

    #Kernel text
    kernel = """
 
    __global__ void co( float *inIm, int check, int color){
 
        int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ;
        if(idx*3 < check*3)
        { 
			if(color == 0)
			{
				inIm[idx*3+1] = inIm[idx*3+1]-255;
				inIm[idx*3+2] = inIm[idx*3+2]-255;
			}
			else if(color == 1)
			{
				inIm[idx*3] = inIm[idx*3]-255;
				inIm[idx*3+2] = inIm[idx*3+2]-255;
			}
			else if(color == 2)
			{
				inIm[idx*3] = inIm[idx*3]-255;
				inIm[idx*3+1] = inIm[idx*3+1]-255;
			}
			
			if(inIm[idx*3] < 0)
				inIm[idx*3] = 0;
			if(inIm[idx*3] > 255)
				inIm[idx*3] = 255;
				
			if(inIm[idx*3+1] < 0)
				inIm[idx*3+1] = 0;
			if(inIm[idx*3+1] > 255)
				inIm[idx*3+1] = 255;
				
			if(inIm[idx*3+2] < 0)
				inIm[idx*3+2] = 0;
			if(inIm[idx*3+2] > 255)
				inIm[idx*3+2] = 255;
        }
    }
    """

    color = int(
        raw_input("Enter the color of the filter (0-Red;1-Green;2-Blue): "))
    print
    color = numpy.int32(color)
    #Compile and get kernel function
    mod = SourceModule(kernel)
    func = mod.get_function("co")
    func(d_px, checkSize, color, block=block, grid=grid)

    kernelT1 = time.clock()

    #Get back data from gpu
    backDataT0 = time.clock()

    coPx = numpy.empty_like(px)
    cuda.memcpy_dtoh(coPx, d_px)
    coPx = (numpy.uint8(coPx))

    backDataT1 = time.clock()

    #Save image
    storeImageT0 = time.clock()
    pil_im = Image.fromarray(coPx, mode="RGB")

    pil_im.save(outPath)

    totalT1 = time.clock()

    getAndConvertTime = getAndConvertT1 - totalT0
    allocTime = allocT1 - allocT0
    kernelTime = kernelT1 - kernelT0
    backDataTime = backDataT1 - backDataT0
    storeImageTime = totalT1 - storeImageT0
    totalTime = totalT1 - totalT0

    print "Color Filter"
    print "Image size : ", im.size
    print "Time taken to get and convert image data: ", getAndConvertTime
    print "Time taken to allocate memory on the GPU: ", allocTime
    print "Kernel execution time: ", kernelTime
    print "Time taken to get image data from GPU and convert it: ", backDataTime
    print "Time taken to save the image: ", storeImageTime
    print "Total execution time: ", totalTime
    print
Пример #25
0
def _diameter_kernel():
    """Returns the CUDA kernel to estimate 3D diameter of structures in a 3D local window.
    """
    diameter_kernel_src = """

    texture<float, cudaTextureType3D, cudaReadModeElementType> tex_data;

    __global__ void diameter3d (unsigned int width,
                                unsigned int height,
                                unsigned int depth,
                                const int n_points,
                                const float norm_factor,
                                const int max_iters,
                                const int n_scan_angles,
                                const int *X,
                                const int *Y,
                                const int *Z,
                                const float *scan_angl_arr,
                                const float *azth_data,
                                const float *lat_data,
                                float *radius_arr)
    {
        unsigned long blockId, idx;
        blockId = blockIdx.x + blockIdx.y * gridDim.x;
        idx = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;

        if (idx > n_points) {
            return;
        }

        float _x, _y, _z;
        _x = (float)X[idx] + 0.5;
        _y = (float)Y[idx] + 0.5;
        _z = (float)Z[idx] + 0.5;

        // -----------------------------------------
        // Find the diameter

        float azth = azth_data[idx];
        azth += M_PI_2;
        float lat = lat_data[idx];

        float cy = cosf(lat), sy = sinf(lat);
        float cz = cosf(azth), sz = sinf(azth);
        //float cz = -sinf(azth), sz = cosf(azth); // taking into account azth + pi/2

        //vector along fiber
        //float dx = -sy * sz;
        //float dy = cy* sz;
        //float dz = cy;
        float uvec[3] = {0.0, 0.0, 1.0};
        float fiber_vector_x[3] = {0.0,
                                   uvec[2]*cy + uvec[3]*sy,
                                  -uvec[2]*sy + uvec[3]*cy};

        float fiber_vector_z[3] = {fiber_vector_x[0]*cz - fiber_vector_x[1]*sz,
                                   fiber_vector_x[0]*sz + fiber_vector_x[1]*cz,
                                   fiber_vector_x[2]};

        float dx = fiber_vector_z[0], dy = fiber_vector_z[1], dz = fiber_vector_z[2];

        //scan vector perpendicular to a fiber vector (rotation X -> Z)
        float scan_vec[3] = {0, 1, 0}; // unit vector perpendicular to default (0,0,1) direction
        float rot_scan_vec_x[3] = {0.0,
                                   scan_vec[1]*cy + scan_vec[2]*sy,
                                  -scan_vec[1]*sy + scan_vec[2]*cy};


        float rot_scan_vec_z[3] = {rot_scan_vec_x[0]*cz - rot_scan_vec_x[1]*sz,
                                   rot_scan_vec_x[0]*sz + rot_scan_vec_x[1]*cz,
                                   rot_scan_vec_x[2]};

        float out_radius = 0;

        for (int scan_angl_idx = 0; scan_angl_idx < n_scan_angles; scan_angl_idx++) {
            float theta = scan_angl_arr[scan_angl_idx];

            float ct = cosf(theta), st = sinf(theta);
            float x = rot_scan_vec_z[0], y = rot_scan_vec_z[1], z = rot_scan_vec_z[2];
            float u = dx, v = dy, w = dz;

            //rotation of point (x,y,z) around axis (u,v,w)
            float scan_vec_coords[3] =
                        {u*(u*x + v*y + w*z)*(1.0f - ct) + x*ct + (-w*y + v*z)*st,
                         v*(u*x + v*y + w*z)*(1.0f - ct) + y*ct + (w*x - u*z)*st,
                         w*(u*x + v*y + w*z)*(1.0f - ct) + z*ct + (-v*x + u*y)*st};

            float nc[3] = {_x, _y, _z};
            float p[3];

            for (int i = 0; i < max_iters; i++) {
                nc[0] += scan_vec_coords[0];
                nc[1] += scan_vec_coords[1];
                nc[2] += scan_vec_coords[2];

                if (tex3D(tex_data, nc[0], nc[1], nc[2]) == 0) {
                        p[0] = nc[0];
                        p[1] = nc[1];
                        p[2] = nc[2];
                        break;
                }
            }

            out_radius += norm3df(p[0] - _x, p[1] - _y, p[2] - _z);
        }

        radius_arr[idx] = out_radius * norm_factor;
    }
    """

    dm_program = SourceModule(diameter_kernel_src)
    diameter3d = dm_program.get_function("diameter3d")

    return dm_program, diameter3d
def CudaNegative(inPath, outPath):

    totalT0 = time.clock()

    im = Image.open(inPath)
    px = numpy.array(im)
    px = px.astype(numpy.float32)

    getAndConvertT1 = time.clock()

    allocT0 = time.clock()
    d_px = cuda.mem_alloc(px.nbytes)
    cuda.memcpy_htod(d_px, px)

    allocT1 = time.clock()

    #Kernel declaration
    kernelT0 = time.clock()

    #Kernel grid and block size
    BLOCK_SIZE = 1024
    block = (1024, 1, 1)
    checkSize = numpy.int32(im.size[0] * im.size[1])
    grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1)

    #Kernel text
    kernel = """
 
    __global__ void ng( float *inIm, int check ){
 
        int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ;
 
        if(idx *3 < check*3)
        { 
        	inIm[idx*3]= 255-inIm[idx*3];
        	inIm[idx*3+1]= 255-inIm[idx*3+1];
        	inIm[idx*3+2]= 255-inIm[idx*3+2];
        }
    }
    """

    #Compile and get kernel function
    mod = SourceModule(kernel)
    func = mod.get_function("ng")
    func(d_px, checkSize, block=block, grid=grid)

    kernelT1 = time.clock()

    #Get back data from gpu
    backDataT0 = time.clock()

    ngPx = numpy.empty_like(px)
    cuda.memcpy_dtoh(ngPx, d_px)
    ngPx = (numpy.uint8(ngPx))

    backDataT1 = time.clock()

    #Save image
    storeImageT0 = time.clock()
    pil_im = Image.fromarray(ngPx, mode="RGB")

    pil_im.save(outPath)

    totalT1 = time.clock()

    getAndConvertTime = getAndConvertT1 - totalT0
    allocTime = allocT1 - allocT0
    kernelTime = kernelT1 - kernelT0
    backDataTime = backDataT1 - backDataT0
    storeImageTime = totalT1 - storeImageT0
    totalTime = totalT1 - totalT0

    print "Negative image"
    print "Image size: ", im.size
    print "Time taken to get and convert image data: ", getAndConvertTime
    print "Time taken to allocate memory on the GPU: ", allocTime
    print "Kernel execution time: ", kernelTime
    print "Time taken to get image data from GPU and convert it: ", backDataTime
    print "Time taken to save the image: ", storeImageTime
    print "Total execution time: ", totalTime
    print
Пример #27
0
def get_kernel():
    kernel = SourceModule("""
        #include <cstdint>
        #include <cfloat>        

        __global__ void distance(
            uint32_t count,     // size of incoming vectors (1D) and outgoing rates (2D)
            float *lats,        // latitude of node N
            float *lons,        // longitude of node N
            float *masses,      // mass (population) of node N
            float p0,           // gravity parameter 0, p0 * (m1 ** p1) * (m2 ** p2) * (distance ** p3)
            float p1,           // gravity parameter 1
            float p2,           // gravity parameter 2
            float p3,           // gravity parameter 3
            float *distances,   // [row,col] rate from node row to node col
            float *rates        // [row,col] rate from node row to node col
        )
        {
            uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
            uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
            const double M_PI = 3.14;

            if ( (x < count) && (y < count) )
            {
                float latx = lats[x] * 3.14159265f / 180;
                float lonx = lons[x] * 3.14159265f / 180;
                float laty = lats[y] * 3.14159265f / 180;
                float lony = lons[y] * 3.14159265f / 180;

                if ( (laty != latx) || (lony != lonx) ) {

                    float popx = masses[x];
                    float popy = masses[y];

                    float a = 6378137;
                    float b = 6356752.3142;
                    float f = 1 / 298.257223563;

                    float L = lony - lonx;

                    float U1 = atan((1 - f) * tan(latx));
                    float U2 = atan((1 - f) * tan(laty));

                    float sinU1 = sin(U1);
                    float cosU1 = cos(U1);
                    float sinU2 = sin(U2);
                    float cosU2 = cos(U2);

                    float lambda = L;
                    float lambdaP = 2 * M_PI;

                    // Used in while iterations
                    float sinlambda = 0.0;
                    float coslambda = 0.0;
                    float sinsigma = 0.0;
                    float cossigma = 0.0;
                    float sinalpha = 0.0;
                    float cossqalpha = 0.0;
                    float cos2sigmam = 0.0;
                    float C = 0.0;
                    float sigma = 0.0;

                    uint32_t iterlimit = 20;

                    float result = FLT_MAX;
                    while (abs(lambda - lambdaP) > 1e-12 && --iterlimit > 0)
                    {
                        sinlambda = sin(lambda);
                        coslambda = cos(lambda);
                        sinsigma = sqrt((cosU2 * sinlambda) * (cosU2 * sinlambda) + (cosU1 * sinU2 - sinU1 * cosU2 * coslambda) * (cosU1 * sinU2 - sinU1 * cosU2 * coslambda));
                        if (sinsigma == 0) {
                            result = 0.0f;
                            break; //co-incident points
                        }
                        cossigma = sinU1 * sinU2 + cosU1 * cosU2 * coslambda;
                        sigma = atan2(sinsigma, cossigma);
                        sinalpha = cosU1 * cosU2 * sinlambda / sinsigma;
                        cossqalpha = 1 - sinalpha * sinalpha;
                        cos2sigmam = cossigma - 2 * sinU1 * sinU2 / cossqalpha;
                        if (isnan(cos2sigmam)) cos2sigmam = 0; //equatorial line: cossqalpha=0
                        C = f / 16 * cossqalpha * (4 + f * (4 - 3 * cossqalpha));
                        lambdaP = lambda;
                        lambda = L + (1 - C) * f * sinalpha * (sigma + C * sinsigma * (cos2sigmam + C * cossigma * (-1 + 2 * cos2sigmam * cos2sigmam)));
                    }

                    if (iterlimit > 0)
                    {
                        if (result != 0.0f)
                        {
                            float uSq = cossqalpha * (a * a - b * b) / (b * b);
                            float A = 1 + uSq / 16384 * (4096 + uSq * (-768 + uSq * (320 - 175 * uSq)));
                            float B = uSq / 1024 * (256 + uSq * (-128 + uSq * (74 - 47 * uSq)));
                            float deltasigma = B * sinsigma * (cos2sigmam + B / 4 * (cossigma * (-1 + 2 * cos2sigmam * cos2sigmam) - B / 6 * cos2sigmam * (-3 + 4 * sinsigma * sinsigma) * (-3 + 4 * cos2sigmam * cos2sigmam)));
                            float s = b * A * (sigma - deltasigma);

                            result = float(s / 1000);
                        }
                    }
                    else
                    {
                        result = float(nan(""));
                    }

                    float dist = result;
                    distances[y * count + x] = dist;
                    float rate = (dist != 0.0f) ? p0 * powf(popx, p1) * powf(popy, p2) * powf(dist, p3) : 0.0f;
                    rates[y * count + x] = rate;

                    // printf(\"Calculating %d (%f,%f:%d) -> %d (%f, %f:%d) = %f (%f)\\n\", x, latx, lonx, int(popx), y, laty, lony, int(popy), dist, rate);
                }
                else
                {
                    // printf(\"Nodes %d and %d have the same lat/long.\\n\", x, y);
                    distances[y * count + x] = 0.0f;
                    rates[y * count + x] = 0.0f;
                }
            }

            return;
        }
    """)

    kernel_fn = kernel.get_function("distance")

    return kernel_fn
def CudaBrightness(inPath, outPath):

    totalT0 = time.clock()

    im = Image.open(inPath)
    px = numpy.array(im)
    px = px.astype(numpy.float32)

    getAndConvertT1 = time.clock()

    allocT0 = time.clock()
    d_px = cuda.mem_alloc(px.nbytes)
    cuda.memcpy_htod(d_px, px)

    allocT1 = time.clock()

    #Kernel declaration
    kernelT0 = time.clock()

    #Kernel grid and block size
    BLOCK_SIZE = 1024
    block = (1024, 1, 1)
    checkSize = numpy.int32(im.size[0] * im.size[1])
    grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1)

    #Kernel text
    kernel = """
 
    __global__ void br( float *inIm, int check, int brightness ){
 
        int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ;
        if(idx *3 < check*3)
        { 
			if(inIm[idx*3]+brightness > 255)
				inIm[idx*3] = 255;
			else 
        		inIm[idx*3]= inIm[idx*3]+brightness;
        	
        	if(inIm[idx*3+1]+brightness > 255)
				inIm[idx*3+1] = 255;
			else 
        		inIm[idx*3+1]= inIm[idx*3+1]+brightness;
        	
        	if(inIm[idx*3+2]+brightness > 255)
				inIm[idx*3+2] = 255;
			else 
        		inIm[idx*3+2]= inIm[idx*3+2]+brightness;
        }
    }
    """

    brightness = int(
        raw_input("Enter the level of brightness (-255 to 255): "))
    print
    if brightness > 255:
        brightness = 255
    if brightness < -255:
        brightness = -255
    brightness = numpy.int32(brightness)
    #Compile and get kernel function
    mod = SourceModule(kernel)
    func = mod.get_function("br")
    func(d_px, checkSize, brightness, block=block, grid=grid)

    kernelT1 = time.clock()

    #Get back data from gpu
    backDataT0 = time.clock()

    brPx = numpy.empty_like(px)
    cuda.memcpy_dtoh(brPx, d_px)
    brPx = (numpy.uint8(brPx))

    backDataT1 = time.clock()

    #Save image
    storeImageT0 = time.clock()
    pil_im = Image.fromarray(brPx, mode="RGB")

    pil_im.save(outPath)

    totalT1 = time.clock()

    getAndConvertTime = getAndConvertT1 - totalT0
    allocTime = allocT1 - allocT0
    kernelTime = kernelT1 - kernelT0
    backDataTime = backDataT1 - backDataT0
    storeImageTime = totalT1 - storeImageT0
    totalTime = totalT1 - totalT0

    print "Brightness filter"
    print "Image size : ", im.size
    print "Time taken to get and convert image data: ", getAndConvertTime
    print "Time taken to allocate memory on the GPU: ", allocTime
    print "Kernel execution time: ", kernelTime
    print "Time taken to get image data from GPU and convert it: ", backDataTime
    print "Time taken to save the image: ", storeImageTime
    print "Total execution time: ", totalTime
    print
Пример #29
0
	__device__ int idint(){
		const int blkId=blockIdx.x+blockIdx.y*gridDim.x+blockIdx.z*gridDim.x*gridDim.y;
		return threadIdx.x+threadIdx.y*blockDim.x+threadIdx.z*blockDim.x*blockDim.y+blkId*blockDim.x*blockDim.y*blockDim.z;
	}
	__global__ void test(Gfloat* outs, const Gfloat* ins, const int* sizes)
	{
		int thdId=idint();
		if (thdId >= sizes[0]) {
			return;
		}
		outs[thdId] *= outs[thdId] * ins[thdId];
		__syncthreads();
		__shared__ int sharedata[128];
	}
"""
mod = SourceModule(source)
test = mod.get_function("test")
size = 100
a = np.arange(size, dtype=np.float64)
b = np.arange(size, dtype=np.float64)
outs = drv.InOut(a)
ins = drv.In(b)
sizes = drv.In(np.array([size], dtype=np.int32))
test(outs, ins, sizes, block=(size, 1, 1), grid=(1, 1))
print("resule:", a.shape)


def img2arr(img):
    if type(img) in [str, unicode]:
        import PIL.Image
        img = PIL.Image.open(img)
Пример #30
0
class TSDFVolume(object):
    def __init__(self, vol_bnds, voxel_size):

        # Define voxel volume parameters.
        self._vol_bnds = vol_bnds  # 3x2, rows: (x, y, z), columns: (min, max) in world coordinates in meters
        self._voxel_size = voxel_size  # in meters (determines volume discretization and resolution)
        self._trunc_margin = self._voxel_size * 5  # truncation on SDF

        # Adjust volume bounds.
        self._vol_dim = np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) /
                                self._voxel_size).copy(order='C').astype(
                                    int)  # ensure C-order contigous
        self._vol_bnds[:,
                       1] = self._vol_bnds[:,
                                           0] + self._vol_dim * self._voxel_size
        self._vol_origin = self._vol_bnds[:, 0].copy(order='C').astype(
            np.float32)  # ensure C-order contigous
        print("Voxel volume size: {:d} x {:d} x {:d}".format(
            self._vol_dim[0], self._vol_dim[1], self._vol_dim[2]))

        # Initialize pointers to voxel volume in CPU memory.
        self._tsdf_vol_cpu = np.ones(self._vol_dim).astype(np.float32)
        self._weight_vol_cpu = np.zeros(self._vol_dim).astype(
            np.float32
        )  # for computing the cumulative moving average of observations per voxel
        self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)

        # Copy voxel volumes to GPU.
        if TSDF_GPU_MODE:
            self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
            cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
            self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
            cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
            self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
            cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)

            # Cuda kernel function (C++)
            self._cuda_src_mod = SourceModule("""
              __global__ void integrate(float * tsdf_vol,
                                        float * weight_vol,
                                        float * color_vol,
                                        float * vol_dim,
                                        float * vol_origin,
                                        float * cam_intr,
                                        float * cam_pose,
                                        float * other_params,
                                        float * color_im,
                                        float * depth_im) {

                // Get voxel index.
                int gpu_loop_idx = (int) other_params[0];
                int max_threads_per_block = blockDim.x;
                int block_idx = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
                int voxel_idx = gpu_loop_idx * gridDim.x * gridDim.y * gridDim.z * max_threads_per_block + block_idx * max_threads_per_block + threadIdx.x;
                
                int vol_dim_x = (int)vol_dim[0];
                int vol_dim_y = (int)vol_dim[1];
                int vol_dim_z = (int)vol_dim[2];

                if (voxel_idx > vol_dim_x * vol_dim_y * vol_dim_z)
                    return;

                // Get voxel grid coordinates.
                float voxel_x = floorf(((float)voxel_idx) / ((float)(vol_dim_y * vol_dim_z)));
                float voxel_y = floorf(((float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z)) / ((float)vol_dim_z));
                float voxel_z = (float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z - ((int)voxel_y) * vol_dim_z);

                // Voxel grid coordinates to world coordinates.
                float voxel_size = other_params[1];
                float pt_x = vol_origin[0] + voxel_x * voxel_size;
                float pt_y = vol_origin[1] + voxel_y * voxel_size;
                float pt_z = vol_origin[2] + voxel_z * voxel_size;

                // World coordinates to camera coordinates.
                float tmp_pt_x = pt_x - cam_pose[0*4+3];
                float tmp_pt_y = pt_y - cam_pose[1*4+3];
                float tmp_pt_z = pt_z - cam_pose[2*4+3];
                float cam_pt_x = cam_pose[0*4+0] * tmp_pt_x + cam_pose[1*4+0] * tmp_pt_y + cam_pose[2*4+0] * tmp_pt_z;
                float cam_pt_y = cam_pose[0*4+1] * tmp_pt_x + cam_pose[1*4+1] * tmp_pt_y + cam_pose[2*4+1] * tmp_pt_z;
                float cam_pt_z = cam_pose[0*4+2] * tmp_pt_x + cam_pose[1*4+2] * tmp_pt_y + cam_pose[2*4+2] * tmp_pt_z;

                // Camera coordinates to image pixels.
                int pixel_x = (int) roundf(cam_intr[0*3+0] * (cam_pt_x / cam_pt_z) + cam_intr[0*3+2]);
                int pixel_y = (int) roundf(cam_intr[1*3+1] * (cam_pt_y / cam_pt_z) + cam_intr[1*3+2]);

                // Skip if outside view frustum.
                int im_h = (int) other_params[2];
                int im_w = (int) other_params[3];
                if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z < 0)
                    return;

                // Skip invalid depth.
                float depth_value = depth_im[pixel_y*im_w+pixel_x];
                if (depth_value == 0)
                    return;

                // Integrate TSDF.
                float trunc_margin = other_params[4];
                float depth_diff = depth_value-cam_pt_z;
                if (depth_diff < -trunc_margin)
                    return;
                float dist = fmin(1.0f, depth_diff / trunc_margin);
                float w_old = weight_vol[voxel_idx];
                float obs_weight = other_params[5];
                float w_new = w_old + obs_weight;
                weight_vol[voxel_idx] = w_new;
                tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx] * w_old + dist) / w_new;

                // Integrate color.
                float old_color = color_vol[voxel_idx];
                float old_b = floorf(old_color / (256 * 256));
                float old_g = floorf((old_color - old_b * 256 * 256) / 256);
                float old_r = old_color - old_b * 256 * 256 - old_g * 256;
                float new_color = color_im[pixel_y*im_w+pixel_x];
                float new_b = floorf(new_color / (256 * 256));
                float new_g = floorf((new_color - new_b * 256 * 256) / 256);
                float new_r = new_color - new_b * 256 * 256 - new_g * 256;
                new_b = fmin(roundf((old_b*w_old + new_b) / w_new), 255.0f);
                new_g = fmin(roundf((old_g*w_old + new_g) / w_new), 255.0f);
                new_r = fmin(roundf((old_r*w_old + new_r) / w_new), 255.0f);
                color_vol[voxel_idx] = new_b * 256 * 256 + new_g * 256 + new_r;

              }""")

            self._cuda_integrate = self._cuda_src_mod.get_function("integrate")

            # Determine block/grid size on GPU.
            gpu_dev = cuda.Device(0)
            self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
            n_blocks = int(
                np.ceil(
                    float(np.prod(self._vol_dim)) /
                    float(self._max_gpu_threads_per_block)))
            grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X,
                             int(np.floor(np.cbrt(n_blocks))))
            grid_dim_y = min(gpu_dev.MAX_GRID_DIM_Y,
                             int(np.floor(np.sqrt(n_blocks / grid_dim_x))))
            grid_dim_z = min(
                gpu_dev.MAX_GRID_DIM_Z,
                int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))))
            self._max_gpu_grid_dim = np.array(
                [grid_dim_x, grid_dim_y, grid_dim_z]).astype(int)
            self._n_gpu_loops = int(
                np.ceil(
                    float(np.prod(self._vol_dim)) / float(
                        np.prod(self._max_gpu_grid_dim) *
                        self._max_gpu_threads_per_block)))

    def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.):
        im_h = depth_im.shape[0]
        im_w = depth_im.shape[1]

        # Fold RGB color image into a single channel image.
        color_im = color_im.astype(np.float32)
        color_im = np.floor(color_im[:, :, 2] * 256 * 256 +
                            color_im[:, :, 1] * 256 + color_im[:, :, 0])

        # GPU mode: integrate voxel volume (calls CUDA kernel).
        if TSDF_GPU_MODE:
            for gpu_loop_idx in range(self._n_gpu_loops):
                self._cuda_integrate(
                    self._tsdf_vol_gpu,
                    self._weight_vol_gpu,
                    self._color_vol_gpu,
                    cuda.InOut(self._vol_dim.astype(np.float32)),
                    cuda.InOut(self._vol_origin.astype(np.float32)),
                    cuda.InOut(cam_intr.reshape(-1).astype(np.float32)),
                    cuda.InOut(cam_pose.reshape(-1).astype(np.float32)),
                    cuda.InOut(
                        np.asarray([
                            gpu_loop_idx, self._voxel_size, im_h, im_w,
                            self._trunc_margin, obs_weight
                        ], np.float32)),
                    cuda.InOut(color_im.reshape(-1).astype(np.float32)),
                    cuda.InOut(depth_im.reshape(-1).astype(np.float32)),
                    block=(self._max_gpu_threads_per_block, 1, 1),
                    grid=(int(self._max_gpu_grid_dim[0]),
                          int(self._max_gpu_grid_dim[1]),
                          int(self._max_gpu_grid_dim[2])))

        # CPU mode: integrate voxel volume (vectorized implementation).
        else:

            # Get voxel grid coordinates.
            xv, yv, zv = np.meshgrid(range(self._vol_dim[0]),
                                     range(self._vol_dim[1]),
                                     range(self._vol_dim[2]),
                                     indexing='ij')
            vox_coords = np.concatenate(
                (xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)),
                axis=0).astype(int)

            # Voxel coordinates to world coordinates.
            world_pts = self._vol_origin.reshape(
                -1, 1) + vox_coords.astype(float) * self._voxel_size

            # World coordinates to camera coordinates.
            world2cam = np.linalg.inv(cam_pose)
            cam_pts = np.dot(world2cam[:3, :3], world_pts) + np.tile(
                world2cam[:3, 3].reshape(3, 1), (1, world_pts.shape[1]))

            # Camera coordinates to image pixels.
            pix_x = np.round(cam_intr[0, 0] * (cam_pts[0, :] / cam_pts[2, :]) +
                             cam_intr[0, 2]).astype(int)
            pix_y = np.round(cam_intr[1, 1] * (cam_pts[1, :] / cam_pts[2, :]) +
                             cam_intr[1, 2]).astype(int)

            # Skip if outside view frustum.
            valid_pix = np.logical_and(
                pix_x >= 0,
                np.logical_and(
                    pix_x < im_w,
                    np.logical_and(
                        pix_y >= 0,
                        np.logical_and(pix_y < im_h, cam_pts[2, :] > 0))))

            depth_val = np.zeros(pix_x.shape)
            depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]]

            # Integrate TSDF.
            depth_diff = depth_val - cam_pts[2, :]
            valid_pts = np.logical_and(depth_val > 0,
                                       depth_diff >= -self._trunc_margin)
            dist = np.minimum(1., np.divide(depth_diff, self._trunc_margin))
            w_old = self._weight_vol_cpu[vox_coords[0, valid_pts],
                                         vox_coords[1, valid_pts],
                                         vox_coords[2, valid_pts]]
            w_new = w_old + obs_weight
            self._weight_vol_cpu[vox_coords[0, valid_pts],
                                 vox_coords[1, valid_pts],
                                 vox_coords[2, valid_pts]] = w_new
            tsdf_vals = self._tsdf_vol_cpu[vox_coords[0, valid_pts],
                                           vox_coords[1, valid_pts],
                                           vox_coords[2, valid_pts]]
            self._tsdf_vol_cpu[vox_coords[0, valid_pts], vox_coords[1,
                                                                    valid_pts],
                               vox_coords[2, valid_pts]] = np.divide(
                                   np.multiply(tsdf_vals, w_old) +
                                   dist[valid_pts], w_new)

            # Integrate color.
            old_color = self._color_vol_cpu[vox_coords[0, valid_pts],
                                            vox_coords[1, valid_pts],
                                            vox_coords[2, valid_pts]]
            old_b = np.floor(old_color / (256. * 256.))
            old_g = np.floor((old_color - old_b * 256. * 256.) / 256.)
            old_r = old_color - old_b * 256. * 256. - old_g * 256.
            new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]]
            new_b = np.floor(new_color / (256. * 256.))
            new_g = np.floor((new_color - new_b * 256. * 256.) / 256.)
            new_r = new_color - new_b * 256. * 256. - new_g * 256.
            new_b = np.minimum(
                np.round(np.divide(np.multiply(old_b, w_old) + new_b, w_new)),
                255.)
            new_g = np.minimum(
                np.round(np.divide(np.multiply(old_g, w_old) + new_g, w_new)),
                255.)
            new_r = np.minimum(
                np.round(np.divide(np.multiply(old_r, w_old) + new_r, w_new)),
                255.)
            self._color_vol_cpu[vox_coords[0, valid_pts], vox_coords[
                1, valid_pts], vox_coords[
                    2, valid_pts]] = new_b * 256. * 256. + new_g * 256. + new_r

    # Copy voxel volume to CPU.
    def get_volume(self):
        if TSDF_GPU_MODE:
            cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu)
            cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu)
        return self._tsdf_vol_cpu, self._color_vol_cpu

    # Get mesh of voxel volume via marching cubes.
    def get_mesh(self):
        tsdf_vol, color_vol = self.get_volume()

        # Marching cubes.
        verts, faces, norms, _ = measure.marching_cubes_lewiner(tsdf_vol,
                                                                level=0)
        verts_ind = np.round(verts).astype(int)
        verts = verts * self._voxel_size + self._vol_origin  # voxel grid coordinates to world coordinates

        # Get vertex colors.
        rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
        colors_b = np.floor(rgb_vals / (256 * 256))
        colors_g = np.floor((rgb_vals - colors_b * 256 * 256) / 256)
        colors_r = rgb_vals - colors_b * 256 * 256 - colors_g * 256
        colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
        colors = colors.astype(np.uint8)
        return verts, faces, norms, colors