Example #1
def testBatchMatColDotKernel():
    batch_size = 128
    num_rows = 1000
    num_cols = 200
    c = 0.05

    a_cpu = np.random.rand(num_rows, num_cols).astype('f')
    b_cpu = np.random.rand(num_rows, num_cols).astype('f')
    batch_i = np.random.choice(np.arange(num_rows, dtype=np.int32),
                               size=batch_size,
                               replace=False)
    batch_j = np.random.choice(np.arange(num_rows, dtype=np.int32),
                               size=batch_size,
                               replace=False)
    batch = zip(batch_i, batch_j)

    a_gpu = gpuarray.to_gpu(a_cpu)
    b_gpu = gpuarray.to_gpu(b_cpu)
    batch_i_gpu = gpuarray.to_gpu(batch_i)
    batch_j_gpu = gpuarray.to_gpu(batch_j)
    result = gpuarray.zeros(batch_size, dtype=np.float32)

    func = mod.get_function("BatchMatColDotKernel")
    func(np.int32(batch_size), np.int32(num_cols), a_gpu, b_gpu, batch_i_gpu, batch_j_gpu, result, \
      block=(num_threads_x, num_threads_y, 1), grid=(num_blocks_x, num_blocks_y))

    context.synchronize()

    gpu_result = result.get()
    actual_result = np.array([a_cpu[i].dot(b_cpu[j]) for i, j in batch])
    assertResultsClose(gpu_result, actual_result)
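The CUDA source for BatchMatColDotKernel is not part of this snippet; the sketch below shows one plausible kernel with the contract the test checks (result[k] = dot of row batch_i[k] of a with row batch_j[k] of b). The kernel body and the one-thread-per-batch-entry layout are assumptions, not the original implementation.

from pycuda.compiler import SourceModule

# Hypothetical kernel: one thread per batch entry, serial loop over columns.
mod_sketch = SourceModule("""
__global__ void BatchMatColDotKernel(int batch_size, int num_cols,
                                     const float *a, const float *b,
                                     const int *batch_i, const int *batch_j,
                                     float *result)
{
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= batch_size)
        return;
    const float *row_a = a + batch_i[k] * num_cols;
    const float *row_b = b + batch_j[k] * num_cols;
    float acc = 0.0f;
    for (int c = 0; c < num_cols; ++c)
        acc += row_a[c] * row_b[c];
    result[k] = acc;
}
""")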
Example #2
def testBatchCopyVectorKernel():
    batch_size = 128
    num_elems = 1000

    a_cpu = np.random.rand(num_elems).astype('f')
    b_cpu = np.random.rand(num_elems).astype('f')
    batch = np.random.choice(np.arange(num_elems, dtype=np.int32),
                             size=batch_size,
                             replace=False)

    a_gpu = gpuarray.to_gpu(a_cpu)
    b_gpu = gpuarray.to_gpu(b_cpu)
    batch_gpu = gpuarray.to_gpu(batch)

    func = mod.get_function("BatchCopyVectorKernel")
    func(np.int32(batch_size), a_gpu, b_gpu, batch_gpu, block=(num_threads_x * num_threads_y, 1, 1), \
      grid=(num_blocks_x * num_blocks_y, 1))

    context.synchronize()

    b_cpu[batch] = a_cpu[batch]

    gpu_result = b_gpu.get()
    actual_result = b_cpu
    assertResultsClose(gpu_result, actual_result)
Example #3
def testBatchMatVecRowMultKernel():
    batch_size = 128
    num_rows = 1000
    num_cols = 200

    a_cpu = np.random.rand(num_rows, num_cols).astype('f')
    b_cpu = np.random.rand(batch_size).astype('f')
    c_cpu = np.random.rand(num_rows, num_cols).astype('f')
    batch_i = np.random.choice(np.arange(num_rows, dtype=np.int32),
                               size=batch_size,
                               replace=False)
    batch_j = np.random.choice(np.arange(num_rows, dtype=np.int32),
                               size=batch_size,
                               replace=False)

    a_gpu = gpuarray.to_gpu(a_cpu)
    b_gpu = gpuarray.to_gpu(b_cpu)
    batch_i_gpu = gpuarray.to_gpu(batch_i)
    batch_j_gpu = gpuarray.to_gpu(batch_j)
    c_gpu = gpuarray.to_gpu(c_cpu)

    func = mod.get_function("BatchMatVecRowMultKernel")
    func(np.int32(batch_size), np.int32(num_cols), a_gpu, b_gpu, c_gpu, batch_i_gpu, batch_j_gpu, \
      block=(num_threads_x, num_threads_y, 1), grid=(num_blocks_x, num_blocks_y))

    context.synchronize()

    c_cpu[batch_j] = (a_cpu[batch_i].T * b_cpu).T

    gpu_result = c_gpu.get()
    actual_result = c_cpu
    assertResultsClose(gpu_result, actual_result)
Example #4
def testBatchVecSubtractInplaceKernel():
    batch_size = 128
    num_elems = 1000
    c = 0.05

    a_cpu = np.random.rand(num_elems).astype('f')
    b_cpu = np.random.rand(num_elems).astype('f')
    batch = np.random.choice(np.arange(num_elems, dtype=np.int32),
                             size=batch_size,
                             replace=False)

    a_gpu = gpuarray.to_gpu(a_cpu)
    b_gpu = gpuarray.to_gpu(b_cpu)
    batch_gpu = gpuarray.to_gpu(batch)

    func = mod.get_function("BatchVecSubtractInplaceKernel")
    func(np.int32(batch_size), np.float32(c), a_gpu, b_gpu, batch_gpu, \
      block=(num_threads_x, 1, 1), grid=(num_blocks_x, 1))

    context.synchronize()

    a_cpu[batch] -= c * b_cpu[batch]

    gpu_result = a_gpu.get()
    actual_result = a_cpu
    assertResultsClose(gpu_result, actual_result)
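BatchVecSubtractInplaceKernel is likewise not shown; a minimal sketch matching the CPU reference a_cpu[batch] -= c * b_cpu[batch] could look like the following (the kernel body and launch layout are assumptions).

from pycuda.compiler import SourceModule

# Hypothetical kernel: one thread per batch entry, in-place scaled subtraction.
mod_sketch = SourceModule("""
__global__ void BatchVecSubtractInplaceKernel(int batch_size, float c,
                                              float *a, const float *b,
                                              const int *batch)
{
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k < batch_size) {
        int idx = batch[k];
        a[idx] -= c * b[idx];
    }
}
""")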
Example #5
    def ring_allreduce(self, send, recv, op=None):
        op = self.get_op(op)
        send_buff = self.buff(send)
        recv_buff = self.buff(recv)
        accum = self.temp(send.shape)
        accum[:] = send[:]
        context.synchronize()

        left = ((self.rank - 1) + self.size) % self.size
        right = (self.rank + 1) % self.size

        for i in range(self.size - 1):
            if i % 2 == 0:
                # Send send_buff
                send_req = self.comm.Isend(send_buff, dest=right)
                self.comm.Recv(recv_buff, source=left)
                # accum[:] += recv[:]
                op(recv, accum)
            else:
                # Send recv_buff
                send_req = self.comm.Isend(recv_buff, dest=right)
                self.comm.Recv(send_buff, source=left)
                # accum[:] += send[:]
                op(send, accum)
            send_req.Wait()
        context.synchronize()
        recv[:] = accum[:]
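ring_allreduce passes a circulating buffer around the ring for size - 1 steps, adding each received chunk into an accumulator. The single-process sketch below (hypothetical variables, no MPI or GPU involved) illustrates why every rank ends up with the full sum after size - 1 additions.

import numpy as np

# Stand-in for the ring: buffers[r] is what rank r is currently circulating.
size = 4
data = [np.full(3, float(r)) for r in range(size)]    # each rank's `send`
accum = [d.copy() for d in data]                      # each rank's `accum`
buffers = [d.copy() for d in data]
for step in range(size - 1):
    # every rank receives its left neighbour's circulating buffer ...
    received = [buffers[(r - 1) % size] for r in range(size)]
    for r in range(size):
        accum[r] += received[r]                       # ... and accumulates it
    buffers = received                                # then passes it on
assert all(np.allclose(a, sum(data)) for a in accum)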
Example #6
    def generateSignals(self):
        starttime = time.time()
        parameterCombinations = []
        ffVector = np.linspace(0, 1, self.NFF)
        for ffInd in range(len(ffVector)):
            for t2Ind in range(len(self.T2Points)):
                for b1Ind in range(len(self.B1Points)):
                    parameterCombinations.append(
                        (self.T2Points[t2Ind], self.B1Points[b1Ind],
                         ffVector[ffInd]))

        parameterCombinations = np.array(parameterCombinations,
                                         dtype=np.float32)

        nParams = parameterCombinations.shape[0]

        signalsOut_gpu = ga.zeros((nParams * self.NEchoes), np.float32)

        print("Compiling/loading CUDA module...")
        cuda_cpmg = getCudaFunction(self.NEchoes, self.EchoSpacing, self.T1f,
                                    self.T1w, self.MagPreparePulse)
        print("Generating signals...")

        params_gpu = ga.to_gpu(parameterCombinations.ravel())
        sp90_gpu = ga.to_gpu(self.sliceProf90.squeeze().astype(np.float32))
        sp180_gpu = ga.to_gpu(self.sliceProf180.squeeze().astype(np.float32))

        nBlocks = int(np.ceil(float(nParams) / self.CudaBlockSize))

        cuda_cpmg(np.uint32(nParams),
                  np.uint32(self.sliceProf90.shape[0]),
                  np.float32(self.fatT2),
                  sp90_gpu,
                  sp180_gpu,
                  params_gpu,
                  signalsOut_gpu,
                  block=(self.CudaBlockSize, 1, 1),
                  grid=(nBlocks, 1))

        signalsOut_gpu = signalsOut_gpu.reshape((nParams, self.NEchoes))

        context.synchronize()

        signalsOut = signalsOut_gpu.get()

        print("Done")
        print("Time taken:", time.time() - starttime)

        params_gpu.gpudata.free()
        sp90_gpu.gpudata.free()
        sp180_gpu.gpudata.free()

        self.allSignals = signalsOut
        self.parameterCombinations = parameterCombinations
        self.signalsReady = True
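The three nested loops at the top of generateSignals build the Cartesian product of fat-fraction, T2, and B1 values. An equivalent, shorter construction with itertools.product (assuming the same self.T2Points / self.B1Points attributes, the same iteration order with ff outermost, and the same (T2, B1, ff) column order) is:

import itertools
import numpy as np

parameterCombinations = np.array(
    [(t2, b1, ff)
     for ff, t2, b1 in itertools.product(ffVector, self.T2Points, self.B1Points)],
    dtype=np.float32)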
Example #7
    def insert(self, x):
        context.synchronize()
        if TESTGPU:
            [t1, t2] = self.forwardGPU(x)
            [self.out, self.deriv] = self.forwardCPU(x)
            for i in range(len(self.out)):
                assert np.fabs(t1[i] - self.out[i]) < TOL
                assert np.fabs(t2[i] - self.deriv[i]) < TOL
        elif GPU:
            [self.out, self.deriv] = self.forwardGPU(x)
        else:
            [self.out, self.deriv] = self.forwardCPU(x)
        context.synchronize()
        return self.out
Example #8
def compute(volume, offset):
    bsize = (32, 32, 1)
    gsize = (int(volume[0] / bsize[0]), int(volume[1] / bsize[1]),
             int(volume[2]))

    DEFINES = '\n#define SCALE ' + str(1) + \
     '\n#define WIDTH ' + str(volume[0]) + \
     '\n#define HEIGHT ' + str(volume[1]) + \
     '\n#define bwidth ' + str(bsize[0]) + \
     '\n#define bheight ' + str(bsize[1]) + \
     '\n#define offx ' + str(-offset[0]) + \
     '\n#define offy ' + str(-offset[1]) + \
     '\n#define offz ' + str(-offset[2]) + \
     '\n#define DEPTH ' + str(volume[2]) + \
     '\n#define bdepth ' + str(1) + '\n' # unused

    # Not an optimal method: read the .cu sources and splice them by hand
    path = os.path.split(__file__)[0] + '/cuda/'

    # Load kernel.cu
    with open(path + 'kernel.cu', 'r') as kernel_cu:
        kernel_buf = kernel_cu.read()

    # Load complex2.cu
    with open(path + 'complex2.cu', 'r') as complex2_cu:
        complex2_buf = complex2_cu.read()

    # Load vectors.cu
    with open(path + 'vectors.cu', 'r') as vectors_cu:
        vectors_buf = vectors_cu.read()

    # Import cu files inside the kernel and copy the defines
    cu_buffer = vectors_buf + '\n' + complex2_buf
    kernel_buf = kernel_buf.replace('%DEFINES%',
                                    DEFINES).replace('%CUFILES%', cu_buffer)

    mod = SourceModule(kernel_buf,
                       "nvcc",
                       include_dirs=["/usr/local/cuda/include"],
                       no_extern_c=True)
    compute = mod.get_function("compute")

    # Output array
    dest = np.zeros(volume[0] * volume[1] * volume[2]).astype(np.float32)

    compute(drv.Out(dest), block=bsize, grid=gsize)
    context.synchronize()

    return dest
Example #9
def findmax_gpu(corrMatrix_gpu, ffValues_gpu, ffParams_gpu):
    nVoxels = ffValues_gpu.shape[0]
    nParams = ffParams_gpu.shape[0]
    indexOut_gpu = ga.zeros((nVoxels), np.int32)
    nBlocks = int(np.ceil(float(nVoxels) / CUDA_BLOCK_SIZE))
    findmax_gpu_fn(corrMatrix_gpu,
                   ffValues_gpu,
                   np.uint32(nVoxels),
                   ffParams_gpu,
                   np.uint32(nParams),
                   indexOut_gpu,
                   block=(CUDA_BLOCK_SIZE, 1, 1),
                   grid=(nBlocks, 1))
    context.synchronize()
    indexOut_host = indexOut_gpu.get()
    indexOut_gpu.gpudata.free()
    return indexOut_host
Example #10
def bfs_sa(root, g):
    v = g.vertex_count
    e = g.edge_count
    # numpy.long and numpy.bool no longer exist in NumPy; use explicit dtypes
    beg_pos = numpy.asarray(g.beg_pos, dtype=numpy.int64)

    csr = numpy.asarray(g.csr, dtype=numpy.int64)

    weight = numpy.asarray(g.weight, dtype=numpy.float64)

    sa = numpy.zeros(v, dtype=numpy.float64)

    flag_traverse = numpy.ones(1, dtype=numpy.bool_)

    print("v=" + str(v) + ", e=" + str(e))
    traverse_one = mod.get_function("traverse_one")
    traverse_one(drv.In(beg_pos), drv.In(csr), drv.In(weight), drv.Out(sa), drv.In(flag_traverse), block=(v, 1, 1))
    context.synchronize()
    print(sa)
    printresult(sa)
Example #11
    if (i < numElements)
    {
        res[i] = 23.0 * a[i] + b[i];
    }
}
""")

func = mod.get_function("zaxpy")

# Warmup
func(a_gpu, b_gpu, res_gpu, np.int64(size), block=(16, 16, 1))
func(a_gpu, b_gpu, res_gpu, np.int64(size), block=(16, 16, 1))
func(a_gpu, b_gpu, res_gpu, np.int64(size), block=(16, 16, 1))

context.synchronize()

print(time.time())
start = time.time()
for i in range(10):
    func(a_gpu,
         b_gpu,
         res_gpu,
         np.int64(size),
         block=(16, 16, 1),
         grid=(16, 16, 1))
    context.synchronize()
end = time.time()

print("elapsed:", end - start)
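The block above brackets synchronized launches with host-side time.time(). A sketch of the same measurement with CUDA events (pycuda.driver.Event), which report GPU time in milliseconds and need only one synchronization at the end, assuming the same func, arrays, and launch configuration:

import pycuda.driver as drv

start_ev, end_ev = drv.Event(), drv.Event()
start_ev.record()
for _ in range(10):
    func(a_gpu, b_gpu, res_gpu, np.int64(size),
         block=(16, 16, 1), grid=(16, 16, 1))
end_ev.record()
end_ev.synchronize()
print("elapsed: %.3f ms" % start_ev.time_till(end_ev))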
Example #12
                                    devB2[:] = devB1

                                devC2    = ng.empty(dimC, dtype=np.float32)
                                # devC2    = devC2s.share(dimC, dtype=np.float32)
                                devC2[:] = devC1

                                if op[0] == 't': devA1, devA2 = devA1.T, devA2.T
                                if op[1] == 't': devB1, devB2 = devB1.T, devB2.T

                                for tile in (32,64,128):
                                    if op == 'nt' and tile != 128:
                                        continue
                                    try: 

                                        ng.dot(devA1, devB1, devC1, alpha=alpha, beta=beta, size=tile)
                                        context.synchronize()

                                        cublas_dot(devA2, devB2, devC2, alpha=alpha, beta=beta)

                                        partial1 = ng.empty((devC1.shape[0],1), dtype=np.float32)
                                        partial2 = partial1[0:1,0:1]

                                        if ng.min(ng.finite(devC1), partial=partial1, out=partial2).get()[0,0] == 0.0:
                                            print("Error: NaN KCN: (%d,%d,%d) ab: (%f,%f) dtype: %d" %
                                                  (K,C,N, alpha,beta, itemsize))
                                            exit()

                                        diff = ng.max(abs(devC2 - devC1), partial=partial1, out=partial2).get()[0,0]
                                        mean = ng.mean(abs(devC2), partial=partial1, out=partial2).get()[0,0]
                                        pctErr = 100 * diff / mean
Example #13
    def _project(
        self,
        camera_projection: geo.CameraProjection,
    ) -> np.ndarray:
        """Perform the projection over just one image.

        Args:
            camera_projection (geo.CameraProjection): a camera projection transform.

        Raises:
            RuntimeError: if the projector has not been initialized.

        Returns:
            np.ndarray: the output projection for each material.
        """
        if not self.initialized:
            raise RuntimeError("Projector has not been initialized.")

        # initialize projection-specific arguments
        camera_center_in_volume = np.array(camera_projection.get_center_in_volume(self.volume)).astype(np.float32)
        logger.debug(f'camera_center_ijk (source point): {camera_center_in_volume}')

        ijk_from_index = camera_projection.get_ray_transform(self.volume)
        ijk_from_index = np.array(ijk_from_index).astype(np.float32)

        # spacing
        spacing = self.volume.spacing

        # copy the projection matrix to CUDA (output array initialized to zero by the kernel)
        cuda.memcpy_htod(self.rt_kinv_gpu, ijk_from_index)

        # Make the arguments to the CUDA "projectKernel".
        args = [
            np.int32(self.camera_intrinsics.sensor_width),          # out_width
            np.int32(self.camera_intrinsics.sensor_height),          # out_height
            np.float32(self.step),                  # step
            np.float32(-0.5),                       # gVolumeEdgeMinPointX
            np.float32(-0.5),                       # gVolumeEdgeMinPointY
            np.float32(-0.5),                       # gVolumeEdgeMinPointZ
            np.float32(self.volume.shape[0] - 0.5), # gVolumeEdgeMaxPointX
            np.float32(self.volume.shape[1] - 0.5), # gVolumeEdgeMaxPointY
            np.float32(self.volume.shape[2] - 0.5), # gVolumeEdgeMaxPointZ
            np.float32(spacing[0]),         # gVoxelElementSizeX
            np.float32(spacing[1]),         # gVoxelElementSizeY
            np.float32(spacing[2]),         # gVoxelElementSizeZ
            camera_center_in_volume[0],                        # sx
            camera_center_in_volume[1],                        # sy
            camera_center_in_volume[2],                        # sz
            self.rt_kinv_gpu,                       # RT_Kinv
            self.output_gpu,                        # output
        ]

        # Calculate required blocks
        blocks_w = int(np.ceil(self.sensor_size[0] / self.threads))
        blocks_h = int(np.ceil(self.sensor_size[1] / self.threads))
        block = (8, 8, 1)
        # lfkj("running:", blocks_w, "x", blocks_h, "blocks with ", self.threads, "x", self.threads, "threads")

        if blocks_w <= self.max_block_index and blocks_h <= self.max_block_index:
            offset_w = np.int32(0)
            offset_h = np.int32(0)
            self.project_kernel(*args, offset_w, offset_h, block=block, grid=(blocks_w, blocks_h))
        else:
            # lfkj("running kernel patchwise")
            for w in range((blocks_w - 1) // self.max_block_index + 1):
                for h in range((blocks_h - 1) // self.max_block_index + 1):
                    offset_w = np.int32(w * self.max_block_index)
                    offset_h = np.int32(h * self.max_block_index)
                    self.project_kernel(*args, offset_w, offset_h, block=block, grid=(self.max_block_index, self.max_block_index))
                    context.synchronize()
                
        # copy the output to CPU
        output = np.empty(self.output_shape, np.float32)
        cuda.memcpy_dtoh(output, self.output_gpu)

        # transpose the axes, which previously have width on the slow dimension
        output = np.swapaxes(output, 0, 1).copy()

        # normalize to centimeters
        output /= 10

        return output
Example #14
    def minimize_batch(self, batch_size, eta, opt_method):
        #using a batch_gradient descent method
        cost = 0.0
        eps = 1e-8  # for use in adagrad
        for lower_index in range(0, len(self.nonzeros), batch_size):
            upper_index = min(lower_index + batch_size, len(self.nonzeros))  # index of first element after the end of this batch
            batch = [self.nonzeros[k] for k in range(lower_index, upper_index)]
            batch_i = [index[0] for index in batch]
            batch_j = [index[1] for index in batch]
            cur_batch_len = np.int32(upper_index - lower_index)
            
            batch_i_gpu = gpuarray.to_gpu(np.array(batch_i, dtype=np.int32))
            batch_j_gpu = gpuarray.to_gpu(np.array(batch_j, dtype=np.int32))
            cost_inner = gpuarray.zeros(batch_size, dtype=np.float32)
            weighted_cost_inner = gpuarray.zeros_like(cost_inner)
            
            # calculate intermediate values
            # cost_inner = rowwise_dot(W[batch_i], W_tilde[batch_j]) + self.b[batch_i] + \
            #     self.b_tilde[batch_j] - np.log(np.array([self.cooccurrence_mat[k] for k in range(lower_index, upper_index)]))
            batchMatColDot(cur_batch_len, self.v_dim, self.W, self.W_tilde, batch_i_gpu, batch_j_gpu, cost_inner, \
                block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y))
            context.synchronize()
            if lower_index == 0:
                print(cost_inner.get())
            batchCostInner(np.int32(lower_index), np.int32(upper_index), cost_inner, self.b, self.b_tilde, \
                self.cooccurrence_mat, batch_i_gpu, batch_j_gpu, block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1))
            if lower_index == 0:
                print(cost_inner.get())
            context.synchronize()
            # weighted_cost_inner = np.array([self.f_x[k] for k in range(lower_index, upper_index)]) * cost_inner
            batchWeightedInnerCost(np.int32(lower_index), np.int32(upper_index), self.f_x, cost_inner, weighted_cost_inner, \
                block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1))
            if lower_index == 0:
                print(weighted_cost_inner.get())
            context.synchronize()
            
            # calculate the gradients of each parameter
            # self.gradW[batch_i] = (self.W_tilde[batch_j].T * weighted_cost_inner).T
            batchMatVecRowMult(cur_batch_len, self.v_dim, self.W_tilde, weighted_cost_inner, self.gradW, batch_j_gpu, batch_i_gpu, \
                block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y))
            # self.gradW_tilde[batch_j] = (self.W[batch_i].T * weighted_cost_inner).T
            batchMatVecRowMult(cur_batch_len, self.v_dim, self.W, weighted_cost_inner, self.gradW_tilde, batch_i_gpu, batch_j_gpu, \
                block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y))
                        
            # self.gradb[batch_i] = self.gradb_tilde[batch_j] = weighted_cost_inner
            batchCopyVector(cur_batch_len, weighted_cost_inner, self.gradb, batch_i_gpu, \
                block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1))
            batchCopyVector(cur_batch_len, weighted_cost_inner, self.gradb_tilde, batch_j_gpu, \
                block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1))
            context.synchronize()

            # perform the main parameter updates
            # self.W[batch_i] -= eta * self.gradW[batch_i]
            batchMatSubtractInplace(cur_batch_len, self.v_dim, eta, self.W, self.gradW, batch_i_gpu, \
                block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y))
            # self.W_tilde[batch_j] -= eta * self.gradW_tilde[batch_j]
            batchMatSubtractInplace(cur_batch_len, self.v_dim, eta, self.W_tilde, self.gradW_tilde, batch_j_gpu, \
                block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y))
            # self.b[batch_i] -= eta * self.gradb[batch_i]
            batchVecSubtractInplace(cur_batch_len, eta, self.b, self.gradb, batch_i_gpu, \
                block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1))
            # self.b_tilde[batch_j] -= eta * self.gradb_tilde[batch_j]      
            batchVecSubtractInplace(cur_batch_len, eta, self.b_tilde, self.gradb_tilde, batch_j_gpu, \
                block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1))      
            context.synchronize()

            cost += gpuarray.dot(weighted_cost_inner, cost_inner).get()
        return cost
Example #15
    def project(self, proj_mat, threads=8, max_blockind=1024):
        if not self.initialized:
            print("Projector is not initialized")
            return

        inv_ar_mat, source_point = proj_mat.get_conanical_proj_matrix(
            voxel_size=self.voxelsize,
            volume_size=self.volumesize,
            origin_shift=self.origin)

        can_proj_matrix = inv_ar_mat.astype(np.float32)
        pixel_array = np.zeros(
            (self.proj_width, self.proj_height)).astype(np.float32)
        sourcex = source_point[0]
        sourcey = source_point[1]
        sourcez = source_point[2]
        g_volume_edge_min_point_x = np.float32(-0.5)
        g_volume_edge_min_point_y = np.float32(-0.5)
        g_volume_edge_min_point_z = np.float32(-0.5)
        g_volume_edge_max_point_x = np.float32(self.volumesize[0] - 0.5)
        g_volume_edge_max_point_y = np.float32(self.volumesize[1] - 0.5)
        g_volume_edge_max_point_z = np.float32(self.volumesize[2] - 0.5)
        g_voxel_element_size_x = self.voxelsize[0]
        g_voxel_element_size_y = self.voxelsize[1]
        g_voxel_element_size_z = self.voxelsize[2]

        #copy to gpu
        proj_matrix_gpu = cuda.mem_alloc(can_proj_matrix.nbytes)
        cuda.memcpy_htod(proj_matrix_gpu, can_proj_matrix)
        pixel_array_gpu = cuda.mem_alloc(pixel_array.nbytes)
        cuda.memcpy_htod(pixel_array_gpu, pixel_array)

        #calculate required blocks
        #threads = 8
        blocks_w = int(np.ceil(self.proj_width / threads))
        blocks_h = int(np.ceil(self.proj_height / threads))
        print("running:", blocks_w, "x", blocks_h, "blocks with ", threads,
              "x", threads, "threads")

        if blocks_w <= max_blockind and blocks_h <= max_blockind:
            #run kernel
            offset_w = np.int32(0)
            offset_h = np.int32(0)
            self.projKernel(self.proj_width,
                            self.proj_height,
                            self.stepsize,
                            g_volume_edge_min_point_x,
                            g_volume_edge_min_point_y,
                            g_volume_edge_min_point_z,
                            g_volume_edge_max_point_x,
                            g_volume_edge_max_point_y,
                            g_volume_edge_max_point_z,
                            g_voxel_element_size_x,
                            g_voxel_element_size_y,
                            g_voxel_element_size_z,
                            sourcex,
                            sourcey,
                            sourcez,
                            proj_matrix_gpu,
                            pixel_array_gpu,
                            offset_w,
                            offset_h,
                            block=(8, 8, 1),
                            grid=(blocks_w, blocks_h))
        else:
            print("running kernel patchwise")
            for w in range(0, (blocks_w - 1) // max_blockind + 1):
                for h in range(0, (blocks_h - 1) // max_blockind + 1):
                    offset_w = np.int32(w * max_blockind)
                    offset_h = np.int32(h * max_blockind)
                    # print(offset_w, offset_h)
                    self.projKernel(self.proj_width,
                                    self.proj_height,
                                    self.stepsize,
                                    g_volume_edge_min_point_x,
                                    g_volume_edge_min_point_y,
                                    g_volume_edge_min_point_z,
                                    g_volume_edge_max_point_x,
                                    g_volume_edge_max_point_y,
                                    g_volume_edge_max_point_z,
                                    g_voxel_element_size_x,
                                    g_voxel_element_size_y,
                                    g_voxel_element_size_z,
                                    sourcex,
                                    sourcey,
                                    sourcez,
                                    proj_matrix_gpu,
                                    pixel_array_gpu,
                                    offset_w,
                                    offset_h,
                                    block=(8, 8, 1),
                                    grid=(max_blockind, max_blockind))
                    context.synchronize()

        #context.synchronize()
        cuda.memcpy_dtoh(pixel_array, pixel_array_gpu)

        pixel_array = np.swapaxes(pixel_array, 0, 1)
        #normalize to cm
        return pixel_array / 10