示例#1
0
文件: myCudaModule.py 项目: adwaye/MS
def grayfication(image):
    """Run the ``rgb2gray`` CUDA kernel on an image and return its output.

    The planes ``image[:, :, 0]``, ``image[:, :, 1]`` and ``image[:, :, 2]``
    are flattened in column-major (Fortran) order and passed to the kernel
    together with ``image.shape[1]``. The kernel's flat float32 output is
    reshaped back, again in column-major order, to ``image.shape[0:2]``.

    Args:
        image: array that can be indexed as ``image[:, :, c]`` for c in 0..2.

    Returns:
        float32 array of shape ``image.shape[0:2]`` as written by the kernel.
    """
    forme = image.shape
    n_rows, n_cols = forme[0], forme[1]
    aSize = n_rows * n_cols
    xdim = np.int32(n_rows)
    ydim = np.int32(n_cols)

    # Column-major flattening of each colour plane.
    planes = [image[:, :, channel].reshape(aSize, order='F')
              for channel in range(3)]
    dest = np.zeros(aSize).astype(np.float32)

    # Threads per block = image.shape[0]; grid = ceil(aSize / block).
    # NOTE(review): CUDA devices commonly cap a block at 1024 threads, so
    # this presumably fails for images with more rows -- confirm.
    threads_per_block = int(xdim)
    blocks_exact = aSize / float(threads_per_block)
    n_blocks = int(blocks_exact)
    if blocks_exact > n_blocks:
        n_blocks = int(blocks_exact + 1)

    rgb2gray(drv.Out(dest),
             drv.InOut(planes[0]),
             drv.InOut(planes[1]),
             drv.InOut(planes[2]),
             ydim,
             block=(threads_per_block, 1, 1),
             grid=(n_blocks, 1, 1))

    return np.reshape(dest, forme[0:2], order='F')
示例#2
0
  def _draw(self, pts, colors):
    if not pts:
      return False
    imsize = self.imsize

    dt0 = time()

    ind_count = zeros(self.imsize2, npint)
    colors = row_stack(colors).astype(npfloat)

    xy = vstack(pts).astype(npfloat)
    inds = zeros(xy.shape[0], npint)

    self.cuda_agg(npint(inds.shape[0]),
                  npint(imsize),
                  cuda.In(xy),
                  cuda.InOut(inds),
                  cuda.InOut(ind_count),
                  block=(THREADS, 1, 1),
                  grid=(int(inds.shape[0]//THREADS) + 1, 1))

    mask = inds > -1

    if not mask.any():
      print('-- no dots to draw. time: {:0.4f}'.format(time()-dt0))
      return False

    # xy = xy[mask, :]
    inds = inds[mask]
    colors = colors[mask]

    ind_count_map = _build_ind_count(ind_count)
    _ind_count_map = cuda.mem_alloc(ind_count_map.nbytes)
    cuda.memcpy_htod(_ind_count_map, ind_count_map)

    sort_colors = zeros((inds.shape[0], 4), npfloat)
    _sort_colors = cuda.mem_alloc(sort_colors.nbytes)
    cuda.memcpy_htod(_sort_colors, sort_colors)

    self.cuda_agg_bin(npint(inds.shape[0]),
                      _ind_count_map,
                      cuda.In(colors),
                      cuda.In(inds),
                      _sort_colors,
                      block=(THREADS, 1, 1),
                      grid=(int(inds.shape[0]//THREADS) + 1, 1))

    dotn, _ = ind_count_map.shape
    self.cuda_dot(npint(dotn),
                  self._img,
                  _ind_count_map,
                  _sort_colors,
                  block=(THREADS, 1, 1),
                  grid=(int(dotn//THREADS) + 1, 1))

    if self.verbose is not None:
      print('-- drew dots: {:d}. time: {:0.4f}'.format(colors.shape[0],
                                                       time()-dt0))
    self._updated = True
    return True
    def _do_cuda_calculation(self, pos0, vel0, sim_time, kernel_sim_time):
        time = 0
        counter = 1
        iterations = sim_time / kernel_sim_time

        while (time < sim_time):
            print "  Kernel execution step %s/%s..." % (counter, iterations),
            self._initalize_cuda()

            mod = SourceModule(self.gpu_source)
            do_basins = mod.get_function("basins")

            do_basins(cuda.InOut(pos0[0]),
                      cuda.InOut(pos0[1]),
                      cuda.InOut(vel0[0]),
                      cuda.InOut(vel0[1]),
                      cuda.InOut(self.track_length),
                      cuda.Out(self.result_data),
                      numpy.float32(kernel_sim_time),
                      block=(self.THREADS_PER_BLOCK, self.THREADS_PER_BLOCK,
                             1),
                      grid=(self.resolution / self.THREADS_PER_BLOCK,
                            self.resolution / self.THREADS_PER_BLOCK))

            self._deactivate_cuda()

            time = time + kernel_sim_time
            counter = counter + 1

            print "done"

        self._save_data()
示例#4
0
def f_gpu_observables_func(func, seed, num_trials, aS1, aS2, aEnergy,
                           photonYield, chargeYield, excitonToIonRatio,
                           g1Value, extractionEfficiency, gasGainValue,
                           gasGainWidth, speRes, intrinsicResS1,
                           intrinsicResS2):
    """Launch ``func`` with every argument wrapped as a PyCUDA buffer.

    ``aS1`` and ``aS2`` are passed as in/out buffers, so the kernel's writes
    are copied back into them; all other arguments are input-only. The
    launch uses a fixed grid of (2048, 1) and blocks of (256, 1, 1).
    """
    input_only_tail = (aEnergy, photonYield, chargeYield, excitonToIonRatio,
                       g1Value, extractionEfficiency, gasGainValue,
                       gasGainWidth, speRes, intrinsicResS1, intrinsicResS2)

    # Argument order must match the kernel signature.
    tArgs = [drv.In(seed), drv.In(num_trials), drv.InOut(aS1), drv.InOut(aS2)]
    for host_array in input_only_tail:
        tArgs.append(drv.In(host_array))

    func(*tArgs, grid=(2048, 1), block=(256, 1, 1))
示例#5
0
    def crt_multi_aug(self, Xt_to_t1_t, Phi_t1, Theta_t1, dtype='dense'):
        """Run the ``Crt_Multi_Sampler`` CUDA kernel on a dense count matrix.

        Args:
            Xt_to_t1_t: 2-D array-like of shape (K_t, J); converted to int32.
            Phi_t1: array-like converted to float32 and passed to the kernel.
            Theta_t1: array-like converted to float32; its first dimension
                defines K_t1.
            dtype: storage layout of ``Xt_to_t1_t``. Only ``'dense'`` is
                implemented.

        Returns:
            Tuple ``(Xt_to_t1_t1, WSZS_t1)`` of float32 arrays with shapes
            (K_t1, J) and (K_t, K_t1). Both stay all-zero when the input has
            no elements, because the kernel is then not launched.

        Raises:
            ValueError: if ``dtype`` is not ``'dense'``. Previously this
                surfaced later as an unbound-variable error.
        """
        if dtype != 'dense':
            raise ValueError(
                "unsupported dtype %r; only 'dense' is implemented" % (dtype,))

        # Convert first so plain nested lists are accepted as well.
        Xt_to_t1_t = np.array(Xt_to_t1_t, dtype=np.int32, order='C')
        Phi_t1 = np.array(Phi_t1, dtype=np.float32, order='C')
        Theta_t1 = np.array(Theta_t1, dtype=np.float32, order='C')

        K_t, J = Xt_to_t1_t.shape
        K_t1 = Theta_t1.shape[0]
        N = K_t * J
        Para = np.array([K_t, K_t1, J, N], dtype=np.int32)

        Xt_to_t1_t1 = np.zeros([K_t1, J], dtype=np.float32, order='C')
        WSZS_t1 = np.zeros([K_t, K_t1], dtype=np.float32, order='C')

        if N != 0:
            block_x = 400
            # One thread per matrix element; always one extra block.
            grid_x = N // block_x + 1

            # One uniform random number per element, passed as the seed array.
            randomseed = np.array(np.random.rand(N), dtype=np.float32,
                                  order='C')

            func = mod.get_function('Crt_Multi_Sampler')
            func(drv.In(randomseed),
                 drv.In(Para),
                 drv.In(Xt_to_t1_t),
                 drv.In(Phi_t1),
                 drv.In(Theta_t1),
                 drv.InOut(WSZS_t1),
                 drv.InOut(Xt_to_t1_t1),
                 grid=(grid_x, 1),
                 block=(block_x, 1, 1))

        return Xt_to_t1_t1, WSZS_t1
示例#6
0
def main():
    """Launch both matrix-multiplication kernels once on a random matrix.

    A random ``SIZE_OF_MATRIX`` x ``SIZE_OF_MATRIX`` float32 matrix is
    uploaded twice (as both operands). ``multiply_matrices`` and
    ``multiply_matrices_shared_blocks`` are then launched one after the
    other with the same launch configuration, synchronising the context
    after each launch. The product is written into the host array ``c``
    through ``cuda.InOut`` and is not used further.
    """
    a = numpy.random.rand(SIZE_OF_MATRIX, SIZE_OF_MATRIX).astype(numpy.float32)
    b = a  # both operands are the same matrix
    c = numpy.zeros((SIZE_OF_MATRIX, SIZE_OF_MATRIX), dtype=numpy.float32)
    lda = numpy.int32(SIZE_OF_MATRIX)

    # Only the operands need explicit device buffers; ``c`` is transferred by
    # cuda.InOut, so the former unused ``d_c`` allocation has been dropped.
    d_a = cuda.mem_alloc(a.nbytes)
    d_b = cuda.mem_alloc(b.nbytes)
    cuda.memcpy_htod(d_a, a)
    cuda.memcpy_htod(d_b, b)

    # Same text as the former Python 2 print statement, valid on 2 and 3.
    print("threads: %s blocks:  %s" % (number_of_threads, number_of_blocks))

    multiply_matrices = multiply_source.get_function("multiply_matrices")
    multiply_matrices_shared_blocks = multiply_source.get_function(
        "multiply_matrices_shared_blocks")

    for kernel in (multiply_matrices, multiply_matrices_shared_blocks):
        kernel(d_a, d_b, cuda.InOut(c), lda,
               block=(number_of_threads, number_of_threads, 1),
               grid=(number_of_blocks, number_of_blocks))
        pycuda.driver.Context.synchronize()
示例#7
0
def do_KMP(text, pattern, pm_table):
    """Launch the ``KMP`` CUDA kernel and return its counter and output.

    Args:
        text: sequence converted with ``np.array`` before upload.
        pattern: array; its ``size`` is used for the grid and passed on.
        pm_table: array passed to the kernel unchanged.

    Returns:
        Tuple ``(count, result)``: ``count`` is the value of the kernel's
        in/out counter, ``result`` a uint8 buffer of ``2 * text.size + 1``
        entries that starts filled with 35 (ASCII '#').
    """
    started = cuda.Event()
    finished = cuda.Event()
    kernel = mod.get_function("KMP")

    text = np.array(text)
    text_size = text.size
    pattern_size = pattern.size

    result = np.zeros(text_size * 2 + 1, dtype=np.uint8)
    result.fill(35)  # 35 == #
    result_counter = np.array(0, dtype=np.int32)

    n_blocks = int((text_size / pattern_size + THREADS - 1) / THREADS)

    # Sizes are passed as 0-d int32 arrays through cuda.In.
    n = np.array(pattern_size, dtype=np.int32)
    m = np.array(text_size, dtype=np.int32)

    started.record()
    kernel(cuda.In(pattern),
           cuda.In(text),
           cuda.In(pm_table),
           cuda.InOut(result),
           cuda.In(n),
           cuda.In(m),
           cuda.InOut(result_counter),
           block=(THREADS, 1, 1),
           grid=(n_blocks, 1))
    finished.record()
    finished.synchronize()

    return (result_counter.item(0), result)
示例#8
0
def process_output_gpu(source_package, dataset, index, item):
    """Compute SH coefficients of the LDR/HDR maps on the GPU and save them.

    The illumination maps are resampled onto a cubemap via the indices
    returned by ``get_cube_idx``, the LDR colours are passed through
    ``srgb_to_linear`` (its return value is ignored, so it presumably works
    in place -- confirm), both colour sets are weighted and handed to the
    ``make_sh_coefficients`` kernel. The 9x3 results are scaled by
    ``shc_norm``, flattened, and written as JSON lists to
    ``{OUTPUT_PATH}/{dataset}/{index}/shc_ldr.json`` and ``shc_hdr.json``.

    Args:
        source_package: mapping with keys ``'ill_map_ldr'`` and
            ``'ill_map_hdr'``.
        dataset: dataset name used in the output path.
        index: item index used in the output path.
        item: not used by this function.
    """
    ill_map_ldr = source_package['ill_map_ldr']
    ill_map_hdr = source_package['ill_map_hdr']

    # Make cubemap canvas for LDR and HDR
    cubemap_xyz_flt, idx, cubemap_idx, cubemap_weight, cubemap_basis, shc_norm = get_cube_idx(
        ill_map_ldr.shape[1], ill_map_ldr.shape[0])
    cubemap_len = cubemap_xyz_flt.shape[0]

    ill_map_ldr_2d = ill_map_ldr.reshape((-1, 3))
    ill_map_hdr_2d = ill_map_hdr.reshape((-1, 3))

    cubemap_color_ldr = np.empty((cubemap_len, 3), dtype=np.float32)
    cubemap_color_hdr = np.empty((cubemap_len, 3), dtype=np.float32)

    cubemap_color_ldr[idx, :] = ill_map_ldr_2d[cubemap_idx, :]
    cubemap_color_hdr[idx, :] = ill_map_hdr_2d[cubemap_idx, :]

    # LDR image needs to be converted to linear color space
    srgb_to_linear(cubemap_color_ldr)

    # Debug point for dumping cubemap to point cloud
    if DEBUG:
        cubemap_pc = np.concatenate((cubemap_xyz_flt, cubemap_color_ldr),
                                    axis=-1)
        np.save(f'{OUTPUT_PATH}/{dataset}/{index}/cubemap_gpu', cubemap_pc)

    # Calculate the SH coefficients
    cubemap_clr_ldr = (cubemap_color_ldr * cubemap_weight).astype(np.float32)
    cubemap_clr_hdr = (cubemap_color_hdr * cubemap_weight).astype(np.float32)

    len_pixels = cubemap_len // 6
    shc_hdr = np.zeros((9, 3), dtype=np.float64)
    shc_ldr = np.zeros((9, 3), dtype=np.float64)

    # Grid x = 6, grid y = ceil(len_pixels / 1024) blocks of 1024 threads.
    make_sh_coefficients(drv.InOut(shc_ldr),
                         drv.InOut(shc_hdr),
                         drv.In(cubemap_basis),
                         drv.In(cubemap_clr_ldr),
                         drv.In(cubemap_clr_hdr),
                         grid=(6, (len_pixels + 1024 - 1) // 1024, 1),
                         block=(1, 1024, 1))

    # normalize
    shc_ldr = (shc_ldr * shc_norm).reshape(-1).astype(np.float32)
    shc_hdr = (shc_hdr * shc_norm).reshape(-1).astype(np.float32)

    # ``with`` closes each file even if serialisation or the write fails.
    with open(f'{OUTPUT_PATH}/{dataset}/{index}/shc_ldr.json', 'w') as f:
        f.write(json.dumps(shc_ldr.tolist()))

    with open(f'{OUTPUT_PATH}/{dataset}/{index}/shc_hdr.json', 'w') as f:
        f.write(json.dumps(shc_hdr.tolist()))
示例#9
0
    def alg1(self):
        '''
            Implementation of the first Z-value algorithm. The CUDA kernel
            ``z_function`` is compiled, launched once, and the result of
            ``self.hessian()`` is returned.

            :return: Whatever ``self.hessian()`` returns. According to the
                original documentation this is the array of betahats for each
                reference spike train together with the confidence interval
                of each betahat value (not verifiable from this method).

            Main internal variables:

            * mod_z1: The compiled CUDA module of the first algorithm.
            * t1 (in kernel): Backward recurrence time.

            Kernel layout: ``threadIdx.x`` is the index ``m``, ``blockIdx.y``
            is ``i`` and ``blockIdx.x`` is ``j``. Only blocks with ``i >= j``
            compute and store a value, at
            ``z[gridDim.y*gridDim.y*m + gridDim.y*i + j]``.
        '''
        self.mod_z1 = SourceModule("""
                #include <stdio.h>
                #include <math.h>
                __global__ void z_function(float *tspamt, float *a, float *isiat, float *tspz,  float *z, long p , int maxi,float gm, float alphas, float alphar)
                {
                int m = threadIdx.x ;
                int i = blockIdx.y;
                int j = blockIdx.x;
                if (i>=j)
                    {
                    float t1;
                    int temp = a[m*gridDim.y+i];
                    int temp2 = a[m*gridDim.y+j];
                    int index = 0 ;
                    t1 = tspamt [m*gridDim.y+temp] - isiat [m*gridDim.y+temp] + isiat [m*gridDim.y+temp2] ;
                    for (int k = m; k < p*maxi ;k+=p)
                    {
                            if (tspz [k] < t1 && tspz [k] != -1 && index < k)
                        {
                           index= k ;
                         }
                    }
                    float bwt;
                    bwt = t1 - tspz [index];
                    z[gridDim.y*gridDim.y*m + gridDim.y*i + j] = (1/gm)*((exp(-bwt/alphas)-exp(-bwt/alphar))/(alphas-alphar));
              }
                }
                """)  # The CUDA kernel of the first algorithm.

        z1_func = self.mod_z1.get_function("z_function")
        # One block per (j, i) pair on a laf x laf grid, self.p threads each.
        # All five arrays are passed as in/out buffers, so they are copied to
        # the device before the launch and back to the host afterwards.
        # NOTE(review): the kernel declares ``int maxi`` but the value is
        # passed as ``int_``; if ``int_`` is a 64-bit type on the target
        # platform the scalar arguments after it would be misaligned --
        # confirm the widths match.
        z1_func(cuda.InOut(self.tspamt_d),
                cuda.InOut(self.a_d),
                cuda.InOut(self.isiat_d),
                cuda.InOut(self.tspz),
                cuda.InOut(self.z),
                int_(self.p),
                int_(self.maxi_d),
                float32(self.gm),
                float32(self.alphas),
                float32(self.alphar),
                block=(self.p, 1, 1),
                grid=(int_(self.laf), int_(self.laf)))
        return self.hessian()
示例#10
0
    def update_all_individuals(cls, dt):
        """Launch ``_update_individuals_fn`` for the first ``cls._next_id`` entries.

        The age and alive arrays are passed as in/out buffers, the death-age
        array as input only. The grid is sized so that ``BLOCK_SIZE`` threads
        per block cover ``cls._next_id`` entries.

        Args:
            dt: time step, passed to the kernel as float32.
        """
        count = cls._next_id
        arrays = cls.Cuda_Arrays
        # Ceiling division: enough blocks to cover `count` threads.
        n_blocks = (count + BLOCK_SIZE - 1) // BLOCK_SIZE

        _update_individuals_fn(
            numpy.uint32(count),  # unsigned int count
            numpy.float32(dt),  # float dt
            cuda.InOut(arrays._age),  # float* age
            cuda.InOut(arrays._alive),  # unsigned int* alive
            cuda.In(arrays._death_age),  # float* death_age
            block=(BLOCK_SIZE, 1, 1),
            grid=(n_blocks, 1),
            time_kernel=True)
示例#11
0
 def cudastep(self):
     """Launch ``self.cstep`` once on the position and velocity arrays.

     ``self.pos[0]``, ``self.pos[1]``, ``self.v[0]`` and ``self.v[1]`` are
     passed as in/out buffers, followed by ``N``, ``size``, ``epsilon``,
     ``width`` and ``height`` exactly as stored on the instance.
     """
     step = self.cstep
     buffers = [drv.InOut(component)
                for component in (self.pos[0], self.pos[1],
                                  self.v[0], self.v[1])]
     scalars = [self.N, self.size, self.epsilon, self.width, self.height]
     step(*(buffers + scalars),
          block=(self.blocksize, 1, 1),
          grid=(self.gridsize, 1))
    def compute_new_pendulum_states_rk4(self, currentStates,
                                        numTimeStepsTillFlipData,
                                        numTimeStepsAlreadyExecuted,
                                        maxTimeStepsToExecute,
                                        startFromDefaultState):
        """Launch the RK4 double-pendulum kernel and log its run time.

        ``currentStates`` and ``numTimeStepsTillFlipData`` are passed as
        in/out buffers, so the kernel's writes are copied back into them.
        The launch geometry is fixed at a 16x16 grid of 16x16-thread blocks.

        Args:
            currentStates: array passed as an in/out buffer.
            numTimeStepsTillFlipData: array passed as an in/out buffer.
            numTimeStepsAlreadyExecuted: passed to the kernel as int32.
            maxTimeStepsToExecute: passed to the kernel as int32.
            startFromDefaultState: passed to the kernel as int32.
        """
        dt = self.timeStep
        to_float = self.npFloatType

        logger.info(
            'Computing new pendulum states with Runge-Kutta 4th order method')
        logger.info('time step: %s seconds' % (dt,))
        logger.info('amount of time already computed: %s seconds' %
                    (numTimeStepsAlreadyExecuted * dt,))
        logger.info('max time to see if pendulum flips: %s seconds' %
                    (maxTimeStepsToExecute * dt,))
        logger.info('amount of time to simulate: %s seconds' %
                    ((maxTimeStepsToExecute - numTimeStepsAlreadyExecuted) *
                     dt,))

        # Compute the double pendulum fractal image.
        logger.info('Running pendulum simulation kernel...')
        started = time.time()

        # Argument order must match the kernel signature.
        kernel_args = [to_float(value) for value in (
            self.point1Mass, self.point2Mass,
            self.pendulum1Length, self.pendulum2Length,
            self.gravity,
            self.angle1Min, self.angle1Max,
            self.angle2Min, self.angle2Max)]
        kernel_args.append(cuda.InOut(currentStates))
        kernel_args.append(np.int32(startFromDefaultState))
        kernel_args.append(np.int32(numTimeStepsAlreadyExecuted))
        kernel_args.append(np.int32(self.numberOfAnglesToTestX))
        kernel_args.append(np.int32(self.numberOfAnglesToTestY))
        kernel_args.append(to_float(self.timeStep))
        kernel_args.append(np.int32(maxTimeStepsToExecute))
        kernel_args.append(cuda.InOut(numTimeStepsTillFlipData))

        self.computeDoublePendulumFractalFromInitialStatesRK4Function(
            *kernel_args,
            block=(16, 16, 1),
            grid=(16, 16))

        # Log the time it took to run the kernel.
        timeToExecuteLastKernel = time.time() - started
        logger.info('Completed pendulum simulation kernel in %s seconds' %
                    (timeToExecuteLastKernel,))
示例#13
0
def test():
    """Compare a CPU bounding-box search on ``cur_mask.jpg`` with a GPU pass.

    CPU part: every pixel of the first channel that equals 2 is printed and
    collected, then the min/max x and y of those pixels are printed. Both
    steps are timed through ``logger``. ``min``/``max`` raise ``ValueError``
    when no pixel equals 2.

    GPU part: the flattened mask is passed to ``func`` with two zeroed
    float64 output arrays ``a`` and ``b``; their maxima and ``a`` itself are
    printed afterwards.
    """
    mask = cv2.imread('cur_mask.jpg')
    mask = mask[:, :, 0]

    x = []
    y = []
    prev_time = time.time()
    # Deliberately a plain Python scan: its duration is what gets logged.
    for i, row in enumerate(mask):
        for j, value in enumerate(row):
            if value == 2:
                print('x = %d,y = %d' % (j, i))
                y.append(i)
                x.append(j)
    logger.info('process frame time:' + str(time.time() - prev_time))
    prev_time = time.time()
    x1 = min(x)
    y1 = min(y)
    x2 = max(x)
    y2 = max(y)
    logger.info('post process frame time:' + str(time.time() - prev_time))
    print(x1, y1, x2, y2)

    w = numpy.int64(len(mask[0]))
    h = numpy.int64(len(mask))
    mask_np = numpy.array(mask)
    mask_np = mask_np.reshape(-1).astype(float)
    N = len(mask_np)

    # numpy.float was only an alias of the builtin float and was removed in
    # NumPy 1.24; numpy.float64 is the identical dtype.
    a = numpy.zeros(N, dtype=numpy.float64)
    b = numpy.zeros(N, dtype=numpy.float64)
    nTheads = 1024
    nBlocks = (N + nTheads - 1) // nTheads  # ceil(N / nTheads)
    print("nBlocks:%d\n" % nBlocks)
    prev_time = time.time()
    func(drv.In(mask_np),
         drv.InOut(a),
         drv.InOut(b),
         w,
         h,
         block=(nTheads, 1, 1),
         grid=(nBlocks, ))
    logger.info('gpu process frame time:' + str(time.time() - prev_time))
    print(max(a))
    print(max(b))
    print(a)
示例#14
0
def simulate_positions(module, Nobs, N, bounds, radius, d, dN, pa, ps, seed=666, Nthreads=64):
    """Run the ``simulate_positions`` kernel for N random start positions.

    Start positions are drawn uniformly inside ``bounds``. One thread is
    launched per photon (``Nobs * N`` in total, limited to 1.1e8 by an
    assertion).

    Args:
        module: compiled module providing ``simulate_positions``; also
            handed to ``get_rng_states`` and ``get_doms``.
        Nobs: factor multiplied with ``N`` to get the photon count.
        N: number of start positions.
        bounds: ``bounds[axis] = (low, high)`` for axis 0, 1, 2.
        radius, d, dN: geometry parameters passed to ``get_doms``; the
            number of DOMs is ``(2 * (d // dN) + 1) ** 3 - 1``.
        pa, ps: passed to the kernel as float32.
        seed: seed handed to ``get_rng_states``.
        Nthreads: threads per block.

    Returns:
        Tuple ``(datahits, datahitsNum, datatimes, pInit)`` of float arrays
        with shapes (N, Ndoms), (N, Nobs), (N, Nobs) and (N, 3).
    """
    Nphotons = Nobs * N
    print("Total Threads: %s" % Nphotons)
    assert Nphotons <= 1.1e8

    d = np.uint32(d)
    dN = np.uint32(dN)
    radius = np.float32(radius)
    # Floor division keeps the Python 2 integer semantics on Python 3, where
    # ``d / dN`` would silently become a float.
    Ndoms = np.uint32(pow((d // dN) * 2 + 1, 3) - 1)

    rng_states = get_rng_states(module, Nphotons, seed=seed)
    d_list = get_doms(module, Ndoms, radius, d, dN)

    # x, y, z drawn in this order; stored as [x..., y..., z...].
    coords = [np.random.uniform(bounds[axis][0], bounds[axis][1], N)
              for axis in range(3)]
    pInit = np.concatenate(coords).astype(np.float32)

    start = time.time()
    datahits = np.zeros(Ndoms * N, dtype=np.int32)
    datahitsNum = -np.ones(Nobs * N, dtype=np.int32)
    datatimes = np.zeros(Nphotons, dtype=np.float32)
    datapositions = np.zeros(Nphotons * 3, dtype=np.float32)
    simulate = module.get_function('simulate_positions')

    simulate(np.uint64(Nphotons), np.uint64(Nobs), rng_states, d_list,
             cuda.InOut(datahits), cuda.InOut(datatimes),
             cuda.InOut(datahitsNum), cuda.InOut(datapositions),
             cuda.In(pInit), np.float32(pa), np.float32(ps), np.uint32(Ndoms),
             block=(Nthreads, 1, 1), grid=(Nphotons // Nthreads + 1, 1))

    print("end-start %s" % (time.time() - start))

    datahits = np.reshape(np.array(datahits, dtype=float), (N, Ndoms))
    datahitsNum = np.reshape(np.array(datahitsNum, dtype=float), (N, Nobs))
    datatimes = np.reshape(np.array(datatimes, dtype=float), (N, Nobs))
    pInit = np.reshape(pInit, (3, N)).T
    return datahits, datahitsNum, datatimes, pInit
    def compute_new_pendulum_states_time_till_flip_adaptive_step_size_method(
            self, currentStates, timeTillFlipData, timeAlreadyExecuted,
            maxTimeToExecute, startFromDefaultState):
        """Launch the adaptive-step "time till flip" kernel and log its run time.

        ``currentStates`` and ``timeTillFlipData`` are passed as in/out
        buffers, so the kernel's writes are copied back into them. The
        launch geometry is fixed at a 16x16 grid of 16x16-thread blocks.

        Args:
            currentStates: array passed as an in/out buffer.
            timeTillFlipData: array passed as an in/out buffer.
            timeAlreadyExecuted: passed to the kernel via ``npFloatType``.
            maxTimeToExecute: passed to the kernel via ``npFloatType``.
            startFromDefaultState: passed to the kernel as int32.
        """
        to_float = self.npFloatType

        logger.info('Computing new pendulum states with %s method' %
                    (str(self.algorithm.name),))
        logger.info('Using the "time till flip" kernel')
        logger.info('time step: %s seconds' % (self.timeStep,))
        logger.info('error tolerance: %s' % (self.errorTolerance,))
        logger.info('amount of time already computed: %s seconds' %
                    (timeAlreadyExecuted,))
        logger.info('max time to see if pendulum flips: %s seconds' %
                    (maxTimeToExecute,))
        logger.info('amount of time to simulate: %s seconds' %
                    (maxTimeToExecute - timeAlreadyExecuted,))

        # Compute the double pendulum fractal image.
        logger.info('Running pendulum simulation kernel...')
        started = time.time()

        # Argument order must match the kernel signature.
        kernel_args = [to_float(value) for value in (
            self.point1Mass, self.point2Mass,
            self.pendulum1Length, self.pendulum2Length,
            self.gravity,
            self.angle1Min, self.angle1Max,
            self.angle2Min, self.angle2Max)]
        kernel_args.append(cuda.InOut(currentStates))
        kernel_args.append(np.int32(startFromDefaultState))
        kernel_args.append(to_float(timeAlreadyExecuted))
        kernel_args.append(np.int32(self.numberOfAnglesToTestX))
        kernel_args.append(np.int32(self.numberOfAnglesToTestY))
        kernel_args.append(to_float(self.timeStep))
        kernel_args.append(to_float(self.errorTolerance))
        kernel_args.append(to_float(maxTimeToExecute))
        kernel_args.append(cuda.InOut(timeTillFlipData))

        self.computeDoublePendulumFractalWithTimeTillFlipMethodAndAdaptiveStepSize(
            *kernel_args,
            block=(16, 16, 1),
            grid=(16, 16))

        # Log the time it took to run the kernel.
        timeToExecuteLastKernel = time.time() - started
        logger.info('Completed pendulum simulation kernel in %s seconds' %
                    (timeToExecuteLastKernel,))
示例#16
0
def simulate_grid(module, Nobs, N, oversampling, datahits, datatimes, radius, d, dN, pa, ps, seed=666, Nthreads=64):
    """Run the ``simulate_grid`` kernel on an N x N grid of start positions.

    Start positions lie on a regular grid from -20 to 20 in x and y with
    z = 0. One thread is launched per photon
    (``Nobs * N * N * oversampling`` in total, limited to 1.1e8 by an
    assertion).

    Args:
        module: compiled module providing ``simulate_grid``; also handed to
            ``get_rng_states`` and ``get_doms``.
        Nobs: per-run factor of the photon count before oversampling.
        N: grid points per axis.
        oversampling: multiplier for ``Nobs``; hit and binned-time results
            are divided by it.
        datahits, datatimes: ignored -- both names are overwritten with
            fresh zero arrays before the launch (kept for compatibility).
        radius, d, dN: geometry parameters passed to ``get_doms``; the
            number of DOMs is ``(2 * (d // dN) + 1) ** 3 - 1``.
        pa, ps: passed to the kernel as float32.
        seed: seed handed to ``get_rng_states``.
        Nthreads: threads per block.

    Returns:
        Tuple ``(datahits, datatimes, datatimesbinned, datapositions)``;
        ``datahits`` and ``datatimesbinned`` have shape (N*N, Ndoms).
    """
    Nruns = N * N
    Nphotons = Nobs * Nruns * oversampling
    Nobs *= oversampling
    print("Total Threads: %s" % Nphotons)
    assert Nphotons <= 1.1e8

    d = np.uint32(d)
    dN = np.uint32(dN)
    radius = np.float32(radius)
    # Floor division keeps the Python 2 integer semantics on Python 3, where
    # ``d / dN`` would silently become a float.
    Ndoms = np.uint32(pow((d // dN) * 2 + 1, 3) - 1)

    t1 = time.time()
    rng_states = get_rng_states(module, Nphotons, seed=seed)
    t2 = time.time()

    d_list = get_doms(module, Ndoms, radius, d, dN)
    t3 = time.time()
    x = np.linspace(-20, 20, N)
    y = np.linspace(-20, 20, N)
    X, Y = np.meshgrid(x, y)
    Z = np.zeros(N * N)
    pInit = np.concatenate([X.flatten(), Y.flatten(), Z]).astype(np.float32)
    t4 = time.time()

    # Same text as the former Python 2 print statements (label, space, value).
    print("t2-t1:  %s" % (t2 - t1))
    print("t3-t2:  %s" % (t3 - t2))
    print("t4-t3:  %s" % (t4 - t3))

    start = time.time()
    datahits = np.zeros(Ndoms * Nruns, dtype=np.int32)
    datatimesbinned = np.zeros(Ndoms * Nruns, dtype=np.float32)
    datatimes = np.zeros(Nphotons, dtype=np.float32)
    datapositions = np.zeros(Nphotons * 3, dtype=np.float32)
    simulate = module.get_function('simulate_grid')

    simulate(np.uint64(Nphotons), np.uint64(Nobs), rng_states, d_list,
             cuda.InOut(datahits), cuda.InOut(datatimes),
             cuda.InOut(datatimesbinned), cuda.InOut(datapositions),
             cuda.In(pInit), np.float32(pa), np.float32(ps), np.uint32(Ndoms),
             block=(Nthreads, 1, 1), grid=(Nphotons // Nthreads + 1, 1))

    print("end-start %s" % (time.time() - start))
    print("sumHits:  %s" % (sum(datahits),))
    print("")

    datahits = np.reshape(np.array(datahits, dtype=float), (Nruns, Ndoms)) / oversampling
    datatimesbinned = np.reshape(np.array(datatimesbinned, dtype=float), (Nruns, Ndoms)) / oversampling

    return datahits, np.array(datatimes, dtype=float), datatimesbinned, datapositions
示例#17
0
def go(scale, block, test_cpu):
    """Run the four prefix-scan kernels on ``scale * block`` random bytes.

    Args:
        scale: number of blocks in the launch grid; together with ``block``
            it also determines the amount of random data.
        block: passed to every kernel as int32.
        test_cpu: when true, a per-value count is also computed on the CPU,
            printed with its run time, and compared against the first
            kernel's result.

    The ``shmem`` and ``shmem_lessconf`` results are compared and the
    outcome printed; the two ``popc`` kernels are only launched.
    """
    # frombuffer replaces the deprecated binary mode of np.fromstring; the
    # copy yields a writable array owning its data, as fromstring did.
    data = np.frombuffer(np.random.bytes(scale * block), dtype=np.uint8).copy()
    print('Done seeding')

    def run(kernel_name, n_bins, block_dims):
        # Launch one kernel and return the prefix array it filled.
        pfxs = np.zeros(n_bins, dtype=np.int32)
        launch(kernel_name,
               cuda.In(data),
               np.int32(block),
               cuda.InOut(pfxs),
               block=block_dims,
               grid=(scale, 1),
               l1=1)
        return pfxs

    if test_cpu:
        a = time.time()
        cpu_pfxs = np.array([np.sum(data == v) for v in range(256)])
        b = time.time()
        print(cpu_pfxs)
        print('took %g secs on CPU' % (b - a))

    shmem_pfxs = run('prefix_scan_8_0_shmem', 256, (32, 16, 1))
    if test_cpu:
        print('it worked? %s' % (np.all(shmem_pfxs == cpu_pfxs)))

    shmeml_pfxs = run('prefix_scan_8_0_shmem_lessconf', 256, (32, 32, 1))
    print('it worked? %s' % (np.all(shmeml_pfxs == shmem_pfxs)))

    popc_pfxs = run('prefix_scan_8_0_popc', 256, (32, 16, 1))

    popc5_pfxs = run('prefix_scan_5_0_popc', 32, (32, 16, 1))
示例#18
0
def testfloat3subequal():
    """Check that ``float3subequal`` computes ``a - b`` on every component.

    Works on a copy of the module-level ``a`` so the fixture is unchanged.
    """
    dest = np.copy(a)
    float3subequal(cuda.InOut(dest), cuda.In(b), **size)
    components_match = all(
        np.allclose(a[axis] - b[axis], dest[axis]) for axis in ('x', 'y', 'z'))
    if not components_match:
        assert False
示例#19
0
def testfloat3divfloatequal():
    """Check that ``float3divfloatequal`` computes ``a / c`` on every component.

    Works on a copy of the module-level ``a`` so the fixture is unchanged.
    """
    dest = np.copy(a)
    float3divfloatequal(cuda.InOut(dest), c, **size)
    components_match = all(
        np.allclose(a[axis] / c, dest[axis]) for axis in ('x', 'y', 'z'))
    if not components_match:
        assert False
示例#20
0
def Crt_Matrix_GPU(Xt_to_t1_t, p, dtype='dense'):
    """Run the ``Crt_Sampler`` CUDA kernel on a dense count matrix.

    Args:
        Xt_to_t1_t: 2-D array-like of shape (K_t, J); converted to int32.
        p: array-like converted to float32 and passed to the kernel.
        dtype: storage layout of ``Xt_to_t1_t``. Only ``'dense'`` is
            implemented.

    Returns:
        float32 array of shape (K_t, J) filled by the kernel. It stays
        all-zero when the input has no elements, because the kernel is then
        not launched.

    Raises:
        ValueError: if ``dtype`` is not ``'dense'``. Previously this
            surfaced later as an unbound-variable error.
    """
    if dtype != 'dense':
        raise ValueError(
            "unsupported dtype %r; only 'dense' is implemented" % (dtype,))

    # Convert first so plain nested lists are accepted as well.
    Xt_to_t1_t = np.array(Xt_to_t1_t, dtype=np.int32, order='C')
    p = np.array(p, dtype=np.float32, order='C')

    K_t, J = Xt_to_t1_t.shape
    n_elems = K_t * J
    X_t1 = np.zeros([K_t, J], dtype=np.float32, order='C')

    if n_elems != 0:
        # The element count is handed to the kernel as a 0-d int32 array.
        N = np.array(n_elems, dtype=np.int32, order='C')

        block_x = 400
        # One thread per matrix element; always one extra block.
        grid_x = n_elems // block_x + 1

        # One uniform random number per element, passed as the seed array.
        randomseed = np.array(np.random.rand(n_elems), dtype=np.float32,
                              order='C')
        func = mod.get_function('Crt_Sampler')
        func(drv.In(randomseed),
             drv.In(N),
             drv.In(Xt_to_t1_t),
             drv.In(p),
             drv.InOut(X_t1),
             grid=(grid_x, 1, 1),
             block=(block_x, 1, 1))

    return X_t1
示例#21
0
def test(N):
    """Time ``func`` on the GPU against the same formula evaluated with NumPy.

    Two random float32 vectors of length ``N`` are created. ``func``
    updates the first one through an in/out buffer; the CPU reference
    computes ``(a * 10 + 2) * ((b + 2) * 10 - 5) * 5`` on a copy. Both run
    times and the min/max of the element-wise difference are printed.

    Args:
        N: vector length.
    """
    print("N = %d" % N)

    N = np.int32(N)

    a = np.random.randn(N).astype(np.float32)
    b = np.random.randn(N).astype(np.float32)
    # Keep an untouched copy of ``a`` for the CPU reference.
    reference = a.copy()

    # GPU run
    threads_per_block = 256
    n_blocks = int((N + threads_per_block - 1) / threads_per_block)
    t0 = timer()
    func(drv.InOut(a), drv.In(b), N,
         block=(threads_per_block, 1, 1),
         grid=(n_blocks, 1))
    gpu_seconds = timer() - t0
    print("gpu run time %f seconds " % gpu_seconds)

    # CPU run
    t0 = timer()
    reference = (reference * 10 + 2) * ((b + 2) * 10 - 5) * 5
    cpu_seconds = timer() - t0
    print("cpu run time %f seconds " % cpu_seconds)

    # Check result
    diff = a - reference
    print(min(diff), max(diff))
示例#22
0
def matAdd(A, B, alpha, beta):
    """Run the ``matrixAddition`` CUDA kernel on two equally shaped matrices.

    Both matrices are flattened in column-major (Fortran) order and cast to
    float32. ``A`` is passed as an in/out buffer and ``B`` as input, along
    with ``alpha``, ``beta`` (float32) and the second dimension. The launch
    uses one block per ``A.shape[0]`` with ``A.shape[1]`` threads each.

    Args:
        A: first matrix; supplies the shape of the result.
        B: second matrix; must have the same shape as ``A``.
        alpha: scalar passed to the kernel as float32.
        beta: scalar passed to the kernel as float32.

    Returns:
        float32 array with the shape of ``A`` holding the kernel's output.

    Exits the interpreter via ``sys.exit`` when the shapes differ.
    """
    shape_a = A.shape
    if shape_a != B.shape:
        sys.exit('matrix dimensions differ')

    n_rows, n_cols = shape_a[0], shape_a[1]
    aSize = n_rows * n_cols
    xdim = np.int32(n_rows)
    ydim = np.int32(n_cols)

    flat_a = np.reshape(A, aSize, order='F').astype(np.float32)
    flat_b = np.reshape(B, aSize, order='F').astype(np.float32)

    matrixAddition(drv.InOut(flat_a),
                   drv.In(flat_b),
                   np.float32(alpha),
                   np.float32(beta),
                   ydim,
                   block=(int(ydim), 1, 1),
                   grid=(int(xdim), 1, 1))

    return np.reshape(flat_a, shape_a, order='F')
示例#23
0
def full_scan():
    """Time one single-threaded GPU scan of ``test images/crop2.png``.

    The kernel walks a hard-coded 1918 x 853 image with three bytes per
    pixel and counts consecutive pixels whose three channel values fall
    into fixed ranges. When 50 in a row are found it stores the current
    x and y in ``line`` and returns. The elapsed time and ``line`` are
    printed.
    """
    # TODO: testing how slow a single full scan is with no parallelism
    sequential = SourceModule("""
        #include <stdio.h>
        __global__ void full_scan(unsigned char *img, int line[2])
        {
            int counter = 0;
            for(int y=0; y<853; y++) {
                for(int x=0; x<1918; x++) {
                    if((img[x*3 + y*1918*3] <= 4) && (153 <= img[1 + x*3 + y*1918*3]) && (img[1 + x*3 + y*1918*3] <= 180)
                    && (196 <= img[2 + x*3 + y*1918*3]) && (img[2 + x*3 + y*1918*3] <= 210)) {
                        counter++;
                        if(counter == 50) {
                            line[0] = x;
                            line[1] = y;
                            return;
                        }
                    } else { counter = 0; }
                }
            }

        }
        """)

    image = cv.imread("test images/crop2.png")
    seq = sequential.get_function("full_scan")
    image_gpu = gpuarray.to_gpu_async(image)
    # The kernel writes two 32-bit ints. NumPy's default integer is 64-bit
    # on most platforms, which would pack x and y into one element.
    line = np.array([0, 0], dtype=np.int32)
    # time.clock() was removed in Python 3.8; perf_counter is the documented
    # replacement for measuring elapsed time.
    timer = time.perf_counter()
    seq(image_gpu, cuda.InOut(line), block=(1, 1, 1))
    print(time.perf_counter() - timer)
    print(line)
示例#24
0
def kmeans(matrix, k, maxIterations):
    """Cluster the rows of ``matrix`` with the ``getLabels`` CUDA kernel.

    Each iteration launches ``getLabels`` with the current centroids and
    then recomputes the centroids on the host with ``getCentroids``. The
    loop stops when the centroids no longer change or after
    ``maxIterations`` iterations. The iteration count is printed.

    Args:
        matrix: 2-D array uploaded to the device once.
        k: number of clusters.
        maxIterations: upper bound on the number of iterations (>= 1).

    Returns:
        int32 array of shape (matrix.shape[0], 2) from the last launch.

    Raises:
        ValueError: if ``maxIterations`` is smaller than 1. Previously no
            labels were produced and the return statement failed on an
            unbound variable.
    """
    if maxIterations < 1:
        raise ValueError('maxIterations must be at least 1')

    centroids = initCentroids(matrix, k)
    oldCentroids = None
    iterations = 0
    matrix_gpu = cuda.mem_alloc(matrix.nbytes)
    cuda.memcpy_htod(matrix_gpu, matrix)

    # The kernel handle and the dimensions do not change between iterations.
    get_labels = mod.get_function("getLabels")
    n_rows = matrix.shape[0]
    n_cols = matrix.shape[1]

    while (not numpy.array_equal(centroids,
                                 oldCentroids)) and iterations < maxIterations:
        oldCentroids = centroids
        centroids = numpy.ascontiguousarray(centroids, dtype=numpy.float32)
        # Allocate as int32 directly instead of casting uninitialised floats.
        labels = numpy.zeros((n_rows, 2), dtype=numpy.int32)
        get_labels(matrix_gpu,
                   cuda.In(centroids),
                   cuda.InOut(labels),
                   numpy.int32(n_cols),
                   numpy.int32(n_rows),
                   numpy.int32(k),
                   grid=(6, 10, 1),
                   block=(32, 32, 1))
        centroids = getCentroids(matrix, centroids, labels)
        iterations += 1
    print(iterations)
    return labels
示例#25
0
def gpu_process(image, histogram):
    """Launch the ``process`` kernel on ``image`` and return ``histogram``.

    ``image`` is passed as input together with ``image.size / 4`` as int32.
    ``histogram`` is an in/out buffer, so the kernel's writes are copied
    back into it. The launch uses a fixed grid of (10, 1).
    """
    quarter_size = np.int32(image.size / 4)
    launch_config = {'block': (THREADS_PER_BLOCK, 1, 1), 'grid': (10, 1)}
    process(cuda.In(image), quarter_size, cuda.InOut(histogram),
            **launch_config)
    return histogram
示例#26
0
文件: myCudaModule.py 项目: adwaye/MS
def threshold2(image1, image2, minimum, maximum):
    """Run the ``GPUthresholding2`` CUDA kernel on a gray image.

    ``image2`` is flattened in column-major (Fortran) order and passed to
    the kernel with the second image dimension and the two thresholds
    (float32). ``image1`` only supplies the shape of the result.

    Args:
        image1: 2-D array whose shape defines the output shape.
        image2: 2-D array with as many elements as ``image1``.
        minimum: lower threshold.
        maximum: upper threshold.

    Returns:
        float32 array of shape ``image1.shape[0:2]`` written by the kernel.

    Exits the interpreter via ``sys.exit`` when either image has more than
    two dimensions.
    """
    forme1 = image1.shape
    forme2 = image2.shape
    # The former test ``a > 2 & b > 2`` parsed as ``a > (2 & b) > 2`` and
    # could never be true, so colour images were not rejected.
    if len(forme1) > 2 or len(forme2) > 2:
        sys.exit('Only works on gray images')

    aSize = forme1[0] * forme1[1]
    ydim = np.int32(forme1[1])
    dest = np.zeros(aSize).astype(np.float32)
    image2 = image2.reshape(aSize, order='F')

    minval = np.float32(minimum)
    maxval = np.float32(maximum)

    # 1024 threads per block, ceil(aSize / 1024) blocks.
    blockX = 1024
    gridX = (aSize + blockX - 1) // blockX

    # Pass the converted thresholds; the names used before (``minima`` and
    # ``maxima``) are not defined in this function.
    GPUthresholding2(drv.InOut(dest),
                     drv.In(image2),
                     ydim,
                     minval,
                     maxval,
                     block=(blockX, 1, 1),
                     grid=(gridX, 1, 1))

    dest = np.reshape(dest, forme1[0:2], order='F')
    return dest
示例#27
0
def cuda_nms(modules, boxes, scores, yxhw=False):
    """Run the ``NMS_GPU`` kernel and return the indices of the kept boxes.

    Args:
        modules: Compiled module exposing ``get_function("NMS_GPU")``.
        boxes: 2-D array with one row per box.  Converted with ``to_yxhw``
            unless ``yxhw`` is true.
        scores: 1-D array with one score per box; appended to ``boxes`` as
            the last column before the launch.
        yxhw: Set to true when ``boxes`` needs no ``to_yxhw`` conversion.

    Returns:
        List of row indices whose flag is still true after the kernel ran
        (empty when there are no boxes).
    """
    if not yxhw:
        boxes = to_yxhw(boxes)

    count = boxes.shape[0]
    if count == 0:
        # A CUDA grid needs at least one block per dimension, so there is
        # nothing to launch for an empty input.
        return []

    # Prepare data for the kernel:
    #   boxes   -> [y1, x1, y2, x2, score] per row
    #   results -> one flag per box, all initially True
    boxes = np.hstack((boxes, np.expand_dims(scores, axis=1)))
    # Builtin bool: the `np.bool` alias was removed in NumPy 1.24.
    results = np.ones(count, dtype=bool)

    NMS_GPU = modules.get_function("NMS_GPU")

    # 32x32-thread blocks on a square grid.  Per the original notes, a
    # single block only works for count <= 1024 and one thread per block is
    # slower.  Integer ceiling division keeps the grid dimensions ints on
    # both Python 2 and Python 3.
    block_len = 32
    grid_len = (count + block_len - 1) // block_len
    grid_size = (grid_len, grid_len, 1)
    block_size = (block_len, block_len, 1)

    # drv.InOut rather than drv.Out so the initial flags reach the kernel.
    NMS_GPU(drv.In(boxes), drv.InOut(results),
            grid=grid_size, block=block_size)
    return list(np.where(results)[0])
示例#28
0
文件: cuda_nms.py 项目: junyuc1/YOLO
def cuda_nms(modules, boxes, scores, yxhw=False):
    """Run the ``NMS_GPU`` kernel and return the indices of the kept boxes.

    Args:
        modules: Compiled module exposing ``get_function("NMS_GPU")``.
        boxes: 2-D array with one row per box.  Converted with ``to_yxhw``
            unless ``yxhw`` is true.
        scores: 1-D array with one score per box; appended to ``boxes`` as
            the last column before the launch.
        yxhw: Set to true when ``boxes`` needs no ``to_yxhw`` conversion.

    Returns:
        List of row indices whose flag is still true after the kernel ran
        (empty when there are no boxes).
    """
    if not yxhw:
        boxes = to_yxhw(boxes)

    n_boxes = boxes.shape[0]
    if n_boxes == 0:
        # A CUDA grid needs at least one block per dimension, so there is
        # nothing to launch for an empty input.
        return []

    boxes = np.hstack((boxes, np.expand_dims(scores, axis=1)))
    # Builtin bool: the `np.bool` alias was removed in NumPy 1.24.
    results = np.ones(n_boxes, dtype=bool)

    NMS_GPU = modules.get_function("NMS_GPU")

    # 32x32-thread blocks on a square grid; per the original notes a single
    # block only works for n_boxes <= 1024.  Integer ceiling division keeps
    # the grid dimensions ints on both Python 2 and Python 3.
    thread_per_block_dim = 32
    grid_len = (n_boxes + thread_per_block_dim - 1) // thread_per_block_dim
    grid_size = (grid_len, grid_len, 1)
    block_size = (thread_per_block_dim, thread_per_block_dim, 1)

    # drv.InOut rather than drv.Out so the initial flags reach the kernel.
    NMS_GPU(drv.In(boxes),
            drv.InOut(results),
            grid=grid_size,
            block=block_size)
    return list(np.where(results)[0])
示例#29
0
def run(N_STATES=4):
    """Compare the ``transitioncounts<N_STATES>`` kernel with the reference.

    Fills float32 lattices of ``N1 + N2`` rows with seeded random data,
    launches the kernel with ``n_trajs = 1`` on a single 256-thread block,
    then prints the kernel output, the result of ``transitioncounts`` on the
    first ``N1`` rows, and the norm of their difference.

    Args:
        N_STATES: Number of states; selects the kernel by name and sets the
            second dimension of every array.
    """
    np.random.seed(42)
    n_frames = N1 + N2
    fwdlattice = np.random.rand(n_frames, N_STATES).astype(np.float32)
    bwdlattice = np.random.rand(n_frames, N_STATES).astype(np.float32)
    framelogprob = np.random.rand(n_frames, N_STATES).astype(np.float32)
    log_transmat = np.random.rand(N_STATES, N_STATES).astype(np.float32)
    sequence_lengths = np.array([N1, N2], dtype=np.int32)
    cum_sequence_lengths = np.array([0, N1], dtype=np.int32)
    transcounts = np.zeros((N_STATES, N_STATES), dtype=np.float32)
    n_trajs = 1

    f = mod.get_function('transitioncounts%d' % N_STATES)
    f(cuda.In(fwdlattice), cuda.In(bwdlattice), cuda.In(log_transmat),
      cuda.In(framelogprob), cuda.In(sequence_lengths),
      cuda.In(cum_sequence_lengths), np.int32(n_trajs),
      cuda.InOut(transcounts), grid=(1, 1), block=(256, 1, 1))

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('cuda transcounts')
    print(transcounts)

    # Only the first sequence is used as the reference, matching n_trajs = 1.
    ref = transitioncounts(fwdlattice[:N1], bwdlattice[:N1],
                           framelogprob[:N1], log_transmat)
    print('reference')
    print(ref)

    print('error N_STATES=%d: %f'
          % (N_STATES, np.linalg.norm(transcounts - ref)))
示例#30
0
def test():
    """Compare the ``log_diag_mvn_likelihood`` kernel with scikit-learn.

    Compiles ``source``, runs the kernel on seeded random float32 data
    (8 samples, 9 states, 33 features) with a single 64-thread block, and
    prints the kernel output, scikit-learn's result, and an element-wise
    mask of where the two agree to within 1e-4.
    """
    func = SourceModule(source).get_function('log_diag_mvn_likelihood')
    n_samples = 8
    n_states = 9
    n_features = 33
    np.random.seed(42)
    sequences = np.random.rand(n_samples, n_features).astype(np.float32)
    means = np.random.rand(n_states, n_features).astype(np.float32)
    variances = np.random.rand(n_states, n_features).astype(np.float32)
    loglikelihoods = np.zeros((n_samples, n_states), dtype=np.float32)

    func(cuda.In(sequences),
         cuda.In(means),
         cuda.In(variances),
         cuda.In(np.log(variances)),
         np.int32(n_samples),
         np.int32(n_states),
         np.int32(n_features),
         cuda.InOut(loglikelihoods),
         block=(64, 1, 1),
         grid=(1, 1))

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('loglikelihoods')
    print(loglikelihoods)

    print('sklearn')
    # NOTE(review): private helper from an old scikit-learn module layout;
    # confirm it exists in the installed version.
    from sklearn.mixture.gmm import _log_multivariate_normal_density_diag
    r = _log_multivariate_normal_density_diag(sequences, means, variances)
    print(r)
    print(np.abs(r - loglikelihoods) < 1e-4)