Example #1
def main():
    if len(sys.argv) < 3:
        print("INVALID USAGE: Missing arguments")
    elif len(sys.argv) == 3:
        # Gather information for GPU
        file = open(sys.argv[1]).readlines()
        for i in range(0, len(file)):
            file[i] = file[i].replace("\n", "").split(" ")
        trajectories = numpy.array([int(file[0][0])]).astype(numpy.float32)
        num_stops = numpy.array([int(file[0][1])]).astype(numpy.float32)
        stops = numpy.array(file[1:]).astype(numpy.float32)
        dest = numpy.zeros_like(stops)

        # Create callable python function
        distance = kernel.get_function("distance")
        distance(driver.In(trajectories), driver.In(num_stops), driver.In(stops),  driver.Out(dest), block=(1, 1, 1))

        with open(sys.argv[2], "w") as output_file:
            for line in dest.tolist():
                for element in line:
                    output_file.write(str(int(element)) + " ")
                output_file.write("\n")
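The manual readlines/split parsing above can be expressed more compactly; a minimal sketch of an equivalent loader (assuming the same file layout: a header line with the trajectory count and stop count, followed by rows of stop values):

import numpy

def load_stops(path):
    # header line: trajectory count and number of stops; remaining lines: stop rows
    with open(path) as fh:
        header = fh.readline().split()
    trajectories = numpy.array([header[0]], dtype=numpy.float32)
    num_stops = numpy.array([header[1]], dtype=numpy.float32)
    stops = numpy.loadtxt(path, dtype=numpy.float32, skiprows=1, ndmin=2)
    return trajectories, num_stops, stops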
Example #2
def test_posteriors4():
    n_trajs = 1
    n_observations = 10
    fwdlattice = np.random.rand(n_observations, 4).astype(np.float32)
    bwdlattice = np.random.rand(n_observations, 4).astype(np.float32)
    posteriors = np.zeros_like(fwdlattice)
    mod.get_function('posteriors4')(drv.In(fwdlattice),
                                    drv.In(bwdlattice),
                                    np.int32(n_trajs),
                                    drv.In(
                                        np.array([n_observations],
                                                 dtype=np.int32)),
                                    drv.In(np.array([0], dtype=np.int32)),
                                    drv.Out(posteriors),
                                    block=(32, 1, 1),
                                    grid=(1, 1))
    print('cuda')
    print(posteriors)

    print('reference')
    gamma = fwdlattice + bwdlattice
    print(np.exp(gamma.T - logsumexp(gamma, axis=1)).T)
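The reference computation uses logsumexp; if the usual SciPy import is unavailable, a numerically stable NumPy stand-in (an illustrative sketch, not part of the original test) is:

import numpy as np

def logsumexp(x, axis=None):
    # subtract the max before exponentiating to avoid overflow
    xmax = np.max(x, axis=axis, keepdims=True)
    out = np.log(np.sum(np.exp(x - xmax), axis=axis, keepdims=True)) + xmax
    return np.squeeze(out, axis=axis) if axis is not None else out.item()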
Example #3
def main():
    multiply_them = mod.get_function("multiply_them")
    a = numpy.random.randn(400).astype(numpy.float32)
    b = numpy.random.randn(400).astype(numpy.float32)
    dest = numpy.zeros_like(a)
    multiply_them(drv.Out(dest),
                  drv.In(a),
                  drv.In(b),
                  block=(400, 1, 1),
                  grid=(1, 1))
    c = dest - a * b
    print(c)

    # Get the kernel code and specify the matrix size
    kcpKernel = XcpKernel % {'XiSize': XiSize}

    # Compile the kernel code
    aoMod = SourceModule(kcpKernel)

    # Get the kernel function from the compiled module
    aoMultiplyKernel = aoMod.get_function("MMultiplyKernel")

    # Define the input and result matrices
    kfX = numpy.random.random_sample((XiSize, XiSize)).astype(numpy.float32)
    kfY = numpy.random.random_sample((XiSize, XiSize)).astype(numpy.float32)
    kfR = numpy.zeros_like(kfX)
    kfR2 = numpy.matmul(kfX, kfY)

    # Execute the kernel
    aoMultiplyKernel(drv.Out(kfR),
                     drv.In(kfX),
                     drv.In(kfY),
                     block=(int(XiSize), int(XiSize), 4),
                     grid=(1, 1))

    print(kfX)
    print(kfY)
    print(kfR)
    print(kfR2)
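A quick way to compare the kernel output kfR with the numpy.matmul reference kfR2 computed above (a sketch; the tolerances are only a guess for float32 accumulation error):

import numpy

def check_close(gpu_result, cpu_result, rtol=1e-3, atol=1e-5):
    # report whether the GPU product matches the CPU reference within float32 noise
    ok = numpy.allclose(gpu_result, cpu_result, rtol=rtol, atol=atol)
    print("match:", ok, "max abs diff:", float(numpy.abs(gpu_result - cpu_result).max()))
    return ok

For example, check_close(kfR, kfR2) could be called right after the kernel launch.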
Example #4
def image_filter(image, filter, nums_thread=100):
    global ft_mod
    global g_filter
    if ft_mod is None:
        ft_mod = SourceModule(filter_source)
        g_filter = ft_mod.get_function("g_filter")
    image = np.array(image).astype(np.float64)
    filter = np.array(filter).astype(np.float64)
    if len(image.shape) != 4:
        image = image.reshape([1] + list(image.shape))
    if len(filter.shape) != 4:
        filter = filter.reshape([1] + list(filter.shape))
    parameters, outshape = build_parameters(image, filter, nums_thread)
    #print("outshape:",outshape,"from shape", image.shape)
    outputs = np.zeros(outshape, dtype=np.float64)
    g_filter(drv.In(image),
             drv.In(filter),
             drv.In(parameters),
             drv.Out(outputs),
             block=(nums_thread, 1, 1),
             grid=(1, 1))
    return outputs
Example #5
    def predict(self, x):
        x = np.asarray(x, dtype=np.float32, order='C')
        y_pred = []
        distance = mod.get_function('distance')
        train_row, train_col = np.int32(self.x.shape)
        test_row, test_col = np.int32(x.shape)
        grid_x = int(math.ceil(train_col/self.thread_size))
        grid_y = int(math.ceil(test_row/self.thread_size))
        # print(grid_x, grid_y)
        dis_arr = np.zeros((test_row, train_col), dtype=np.float32)
        distance(
            cuda.In(x), cuda.In(self.x), cuda.Out(dis_arr), test_row, test_col, train_row, train_col, test_row, train_col,
            block=(self.thread_size, self.thread_size, 1), grid=(grid_x, grid_y)
        )
        # print(dis_arr)
        
        for i in range(len(dis_arr)):
            sorted_index = np.argsort(dis_arr[i])
            top_k_index = sorted_index[:self.k]
            y_pred.append(self._vote(ys=self.y[top_k_index]))

        return np.array(y_pred)
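The _vote helper is not shown in this snippet; a plausible majority-vote implementation (purely an assumption about its behaviour) could look like:

from collections import Counter

def _vote(ys):
    # return the most frequent label among the k nearest neighbours
    return Counter(list(ys)).most_common(1)[0][0]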
Example #6
    def _draw(self, pts, colors):
        if not pts:
            return
        imsize = self.imsize

        dt0 = time()

        ind_count = zeros(self.imsize2, npint)
        colors = row_stack(colors).astype(npfloat)

        inds = concatenate(pts).astype(npint)
        _inds = cuda.mem_alloc(inds.nbytes)
        cuda.memcpy_htod(_inds, inds)

        aggn = inds.shape[0]
        self.cuda_agg(npint(aggn),
                      npint(imsize),
                      _inds,
                      cuda.InOut(ind_count),
                      block=(THREADS, 1, 1),
                      grid=(int(aggn // THREADS) + 1, 1))

        ind_count_map = _build_ind_count(ind_count)
        _ind_count_map = cuda.mem_alloc(ind_count_map.nbytes)
        cuda.memcpy_htod(_ind_count_map, ind_count_map)

        sort_colors = zeros((aggn, 4), npfloat)
        _sort_colors = cuda.mem_alloc(sort_colors.nbytes)
        cuda.memcpy_htod(_sort_colors, sort_colors)

        self.cuda_agg_bin(npint(aggn),
                          _ind_count_map,
                          cuda.In(colors),
                          _inds,
                          _sort_colors,
                          block=(THREADS, 1, 1),
                          grid=(int(aggn // THREADS) + 1, 1))

        dotn, _ = ind_count_map.shape
        self.cuda_dot(npint(dotn),
                      self._img,
                      _ind_count_map,
                      _sort_colors,
                      block=(THREADS, 1, 1),
                      grid=(int(dotn // THREADS) + 1, 1))

        if self.verbose is not None:
            print('-- drew dots: {:d}. time: {:0.4f}'.format(
                colors.shape[0],
                time() - dt0))
        self._updated = True
Example #7
def calc_next_world_gpu(world, next_world, height, width):
    mod = SourceModule("""
    __global__ void life_game_gpu(const int* __restrict__ world, int *next_world, const int mat_size_y, const int mat_size_x){
        int mat_x = threadIdx.x + blockIdx.x * blockDim.x;
        int mat_y = threadIdx.y + blockIdx.y * blockDim.y;
        if (mat_x >= mat_size_x) {
            return;
        }
        if (mat_y >= mat_size_y) {
            return;
        }

        int current_value = world[(mat_y % mat_size_y) * mat_size_x + (mat_x % mat_size_x)];
        int next_value = current_value;
        int num_live = 0;
        // add the dimension before taking % so the wrap-around index stays non-negative in C
        num_live += world[((mat_y - 1 + mat_size_y) % mat_size_y) * mat_size_x + ((mat_x - 1 + mat_size_x) % mat_size_x)];
        num_live += world[((mat_y - 1 + mat_size_y) % mat_size_y) * mat_size_x + (mat_x % mat_size_x)];
        num_live += world[((mat_y - 1 + mat_size_y) % mat_size_y) * mat_size_x + ((mat_x + 1) % mat_size_x)];
        num_live += world[(mat_y % mat_size_y) * mat_size_x + ((mat_x - 1 + mat_size_x) % mat_size_x)];
        num_live += world[(mat_y % mat_size_y) * mat_size_x + ((mat_x + 1) % mat_size_x)];
        num_live += world[((mat_y + 1) % mat_size_y) * mat_size_x + ((mat_x - 1 + mat_size_x) % mat_size_x)];
        num_live += world[((mat_y + 1) % mat_size_y) * mat_size_x + (mat_x % mat_size_x)];
        num_live += world[((mat_y + 1) % mat_size_y) * mat_size_x + ((mat_x + 1) % mat_size_x)];
        
        if (current_value == 0 && num_live == 3)
            next_value = 1;
        else if (current_value == 1 && num_live >= 2 && num_live <= 3)
            next_value = 1;
        else
            next_value = 0;

        next_world[mat_y * mat_size_x + mat_x] = next_value;
        }
    """)

    life_game_gpu = mod.get_function("life_game_gpu")
    block = (BLOCKSIZE, BLOCKSIZE, 1)
    grid = ((width + block[0] - 1) // block[0],
            (height + block[1] - 1) // block[1])
    # print("Grid = ({0}, {1}), Block = ({2}, {3})".format(grid[0], grid[1], block[0], block[1]))

    # start = cuda.Event()
    # end = cuda.Event()

    # start.record()
    life_game_gpu(cuda.In(world),
                  cuda.Out(next_world),
                  numpy.int32(height),
                  numpy.int32(width),
                  block=block,
                  grid=grid)
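To validate the kernel, a toroidal CPU reference built on numpy.roll is convenient (a sketch assuming world is a 2-D array of 0/1 ints, matching the wrap-around indexing used above):

import numpy

def life_step_cpu(world):
    # count the eight toroidal neighbours of every cell
    neighbours = sum(numpy.roll(numpy.roll(world, dy, axis=0), dx, axis=1)
                     for dy in (-1, 0, 1) for dx in (-1, 0, 1)
                     if (dy, dx) != (0, 0))
    # Conway rules: birth on exactly 3 neighbours, survival on 2 or 3
    return ((neighbours == 3) | ((world == 1) & (neighbours == 2))).astype(world.dtype)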
Example #8
def test():
    mask = cv2.imread('cur_mask.jpg')
    mask = mask[:, :, 0]
    # mask_g = mask[:,:,1]
    # mask_b = mask[:,:,2]

    x = []
    y = []
    prev_time = time.time()
    for i in range(len(mask)):
        for j in range(len(mask[0])):
            if mask[i][j] == 2:
                print('x = %d,y = %d' % (j, i))
                y.append(i)
                x.append(j)
    logger.info('process frame time:' + str(time.time() - prev_time))
    prev_time = time.time()
    x1 = min(x)
    y1 = min(y)
    x2 = max(x)
    y2 = max(y)
    logger.info('post process frame time:' + str(time.time() - prev_time))
    print(x1, y1, x2, y2)

    # w = numpy.int32(len(mask[0]))
    # h = numpy.int32(len(mask))
    w = numpy.int64(len(mask[0]))
    h = numpy.int64(len(mask))
    mask_np = numpy.array(mask)
    mask_np = mask_np.reshape(-1).astype(float)
    N = len(mask_np)
    # print(N)
    # print(w)
    # print(h)
    a = numpy.zeros(N, dtype=numpy.float64)
    b = numpy.zeros(N, dtype=numpy.float64)
    nTheads = 1024
    nBlocks = int((N + nTheads - 1) / nTheads)
    print("nBlocks:%d\n" % nBlocks)
    prev_time = time.time()
    func(drv.In(mask_np),
         drv.InOut(a),
         drv.InOut(b),
         w,
         h,
         block=(nTheads, 1, 1),
         grid=(nBlocks, 1))
    logger.info('gpu process frame time:' + str(time.time() - prev_time))
    print(max(a))
    print(max(b))
    print(a)
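The nested pixel loop at the top of test() can be replaced with a vectorised lookup; a minimal sketch of the same bounding-box computation:

import numpy

def mask_bbox(mask, label=2):
    # coordinates of every pixel carrying the requested label
    ys, xs = numpy.nonzero(mask == label)
    return xs.min(), ys.min(), xs.max(), ys.max()  # x1, y1, x2, y2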
Example #9
def simulate_positions(module, Nobs, N, bounds, radius, d, dN, pa, ps, seed=666, Nthreads=64):
    
    Nphotons = Nobs*N
    print("Total Threads: %s" % Nphotons)
    assert(Nphotons <= 1.1e8)

    d = np.uint32(d)
    dN = np.uint32(dN)
    radius = np.float32(radius)
    Ndoms = np.uint32(pow((d/dN)*2+1, 3) - 1)

    t1 = time.time()
    rng_states = get_rng_states(module, Nphotons, seed=seed)
    t2 = time.time()

    d_list = get_doms(module, Ndoms, radius, d, dN)
    t3 = time.time()

    x = np.random.uniform(bounds[0][0], bounds[0][1], N)
    y = np.random.uniform(bounds[1][0], bounds[1][1], N)
    z = np.random.uniform(bounds[2][0], bounds[2][1], N)
    # print x
    # print y
    # print z
    pInit = np.concatenate([x, y, z]).astype(np.float32)
    t4 = time.time()
    
    # print "t2-t1: ", t2-t1
    # print "t3-t2: ", t3-t2
    # print "t4-t3: ", t4-t3

    start = time.time()
    datahits = np.zeros(Ndoms*N, dtype=np.int32)
    datahitsNum = -np.ones(Nobs*N, dtype=np.int32)
    datatimes = np.zeros(Nphotons, dtype=np.float32)
    datapositions = np.zeros(Nphotons*3, dtype=np.float32)
    simulate = module.get_function('simulate_positions')

    simulate(np.uint64(Nphotons), np.uint64(Nobs), rng_states, d_list, cuda.InOut(datahits), cuda.InOut(datatimes), cuda.InOut(datahitsNum), 
             cuda.InOut(datapositions), cuda.In(pInit), np.float32(pa), np.float32(ps),np.uint32(Ndoms), 
             block=(Nthreads, 1, 1), grid=(Nphotons//Nthreads + 1, 1))    

    print("end-start", time.time() - start)
    # print "sumHits: ", sum(datahits)
    # print 

    datahits = np.reshape(np.array(datahits, dtype=float), (N, Ndoms))
    datahitsNum = np.reshape(np.array(datahitsNum, dtype=float), (N, Nobs))
    datatimes = np.reshape(np.array(datatimes, dtype=float), (N, Nobs))
    pInit = np.reshape(pInit, (3, N)).T
    return datahits, datahitsNum, datatimes, pInit
Example #10
    def score_df(self, thread_count):
        """Scores each collision with a function that gives a weight of 2
        to each person killed and 1 to each person injured, divides the
        combined total by the normalisation constant used in the kernel
        below (8), then multiplies that fraction by 5 to give a severity
        score from 0 to 5."""

        mod = SourceModule("""
        __global__ void score_function(float *dest, float *killed, float *injured)
        {
            const int i = (blockIdx.x * blockDim.x) + threadIdx.x;
            dest[i] = (((killed[i] * 2.0) + injured[i]) / 8.0) * 5.0;
        }
        """)

        df = self.df[[
            'SCORE', 'LATITUDE', 'LONGITUDE', 'NUMBER OF PERSONS KILLED',
            'NUMBER OF PERSONS INJURED'
        ]].values.astype(np.float32)

        # Calculate kernel params
        n = len(df[:, 0])
        output = np.zeros_like(df[:, 0])
        thread_size = thread_count
        core_size = self.get_core_size(thread_count, n)

        # Run kernel
        score_function = mod.get_function("score_function")
        score_function(cuda.Out(output),
                       cuda.In(df[:, 3]),
                       cuda.In(df[:, 4]),
                       block=(thread_size, 1, 1),
                       grid=(core_size, 1))

        df[:, 0] = output

        # Only return score with lat/long
        return df[:, [0, 1, 2]]
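As a host-side cross-check of the kernel, the same severity score can be computed directly in NumPy (a sketch that simply mirrors the formula in the kernel source):

import numpy as np

def score_cpu(killed, injured):
    # ((2 * killed + injured) / 8) * 5, the same expression as in score_function
    return ((killed * 2.0 + injured) / 8.0) * 5.0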
Example #11
    def gpuFunc(iterator):
        iterator = iter(iterator)
        cpu_data = np.asarray(list(iterator), dtype=np.float32)
        datasize = len(cpu_data)
        # * 3 for data dimensions. /256 for block size.
        gridNum = int(np.ceil(datasize / 256.0))

        # +1 for overprovisioning in case there is dangling threads
        centroids = np.empty(datasize, gpuarray.vec.float2)

        cuda.init()
        dev = cuda.Device(dev_id)
        contx = dev.make_context()

        # The GPU kernel below takes centroids IDs and 1-D data points in
        # form of float2 (x,y). X is for the centroid ID whereas Y is
        # the actual point coordinate.
        try:
            mod = SourceModule(cudakernel)
            func = mod.get_function("assignToCentroid")
            func(cuda.In(cpu_data),
                 cuda.In(kPoints),
                 np.int32(datasize),
                 np.int32(len(kPoints)),
                 cuda.Out(centroids),
                 block=(16, 16, 1),
                 grid=(gridNum, 1),
                 shared=0)
            closest = [(val[0], (np.asarray(val[1]), 1)) for val in centroids]
        except Exception as err:
            raise Exception("Error {} in node {}".format(
                err, socket.gethostname()))
        contx.pop()
        del cpu_data
        del datasize
        del centroids
        del contx
        return iter(closest)
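The centroids buffer above is typed as pycuda.gpuarray.vec.float2, a structured NumPy dtype with 'x' and 'y' fields; a small sketch of how such an array can be packed on the host:

import numpy as np
from pycuda import gpuarray

def make_float2(ids, coords):
    # pack centroid ids (x) and point coordinates (y) into a float2 record array
    out = np.empty(len(coords), dtype=gpuarray.vec.float2)
    out['x'] = ids
    out['y'] = coords
    return out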
Example #12
def gpuinv3x3(inp, n):
    # internal constants not to be modified
    hpat = (0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473,
            0x07061, 0x03140)
    # Convert parameters into numpy array
    # *** change next line between float32 and float64 to match float or double
    inpd = np.array(inp, dtype=np.float64)
    hpatd = np.array(hpat, dtype=np.uint32)
    # *** change next line between float32 and float64 to match float or double
    output = np.empty((n * 9), dtype=np.float64)
    # Get kernel function
    matinv3x3 = kernel_3x3.get_function("inv3x3")
    # Define block, grid and compute
    blockDim = (288, 1, 1)  # do not change
    gridDim = ((n // 32) + 1, 1, 1)
    # Kernel function
    matinv3x3(cuda.In(inpd),
              cuda.Out(output),
              np.uint64(n),
              cuda.In(hpatd),
              block=blockDim,
              grid=gridDim)
    return output
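A hypothetical usage sketch (the kernel_3x3 module itself is not shown here): invert a batch of n identity matrices and reshape the flat result back to (n, 3, 3):

import numpy as np

n = 4
batch = np.tile(np.eye(3, dtype=np.float64).ravel(), n)  # n flattened 3x3 inputs
inverses = gpuinv3x3(batch, n).reshape(n, 3, 3)
print(inverses[0])  # the identity should come back unchanged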
Example #13
    def cal_cv(self, inputs):
        """
            calculate the constraint values for each individual
        """

        rows, cols = inputs.shape

        # prepare data
        k_layouts = np.float32(inputs).flatten()
        k_cvs = np.float32(np.zeros(rows))
        k_sx = np.float32(self.sx)
        k_sy = np.float32(self.sy)

        # pick out the function
        func = self.kernels.get_function("cal_cv_turb")
        func(drv.In(k_layouts),
             drv.Out(k_cvs),
             drv.In(k_sx),
             drv.In(k_sy),
             grid=(int(rows // 10 + 1), 1, 1),
             block=(10, 1, 1))

        return k_cvs
Example #14
 def magnetic_field_at(magnet: Magnet, point: P3) -> P3:
     """
     Magnetic field produced by `magnet` at `point`.
     This method has to transfer data back and forth repeatedly, so it is slower than the CPU version.
     """
     GPU_ACCELERETE.__compile()
     if isinstance(magnet, CCT):
         # convert point to local coordinates and turn it into a numpy vector
         p = magnet.local_coordinate_system.point_to_local_coordinate(
             point).to_numpy_ndarry3_float32()
         length = int(magnet.dispersed_path3.shape[0])
         winding = magnet.dispersed_path3.flatten().astype(numpy.float32)
         ret = numpy.zeros((3, ), dtype=numpy.float32)
         GPU_ACCELERETE.CUDA_MAGNETIC_FIELD_AT_CCT(
             drv.In(winding),
             drv.In(p),
             drv.In(numpy.array([length]).astype(numpy.int32)),
             drv.Out(ret),
             block=(512, 1, 1),
             grid=(256, 1),
         )
         print(p)
         return P3.from_numpy_ndarry(ret * magnet.current * 1e-7)
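For a slow CPU cross-check of the CCT field, the discrete Biot-Savart sum can be evaluated directly (a sketch under the assumption that the kernel accumulates dl x r / |r|^3 over consecutive winding points, which is what the current * 1e-7 factor applied to the result suggests):

import numpy

def magnetic_field_cpu(winding, point, current):
    # winding: (N, 3) points along the conductor (reshape the flattened array first), point: (3,)
    dl = winding[1:] - winding[:-1]            # segment vectors
    mid = 0.5 * (winding[1:] + winding[:-1])   # segment midpoints
    r = point - mid
    r3 = numpy.linalg.norm(r, axis=1, keepdims=True) ** 3
    return 1e-7 * current * numpy.sum(numpy.cross(dl, r) / r3, axis=0)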
Example #15
def gpu():
    oc = numpy.empty_like(ia, dtype=ia.dtype)
    blocks_per_grid = int((num + threads_per_block - 1) / threads_per_block)
    mod = SourceModule("""
    __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;

        if (i < numElements)
        {
            C[i] = sinf(A[i]) + sinf(B[i]);
        }
    }
    """)
    vec_add = mod.get_function("vectorAdd")
    vec_add(cuda.In(ia),
            cuda.In(ib),
            cuda.Out(oc),
            numpy.int32(num),
            block=(threads_per_block, 1, 1),
            grid=(blocks_per_grid, 1, 1),
            shared=0)
    return oc
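gpu() relies on module-level ia, ib, num and threads_per_block; a matching CPU check could look like this (assuming ia and ib are float32 vectors of length num):

import numpy

def check_gpu(oc, ia, ib, atol=1e-5):
    # the kernel computes sinf(A[i]) + sinf(B[i]) for every element
    expected = numpy.sin(ia) + numpy.sin(ib)
    return numpy.allclose(oc, expected, atol=atol)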
Example #16
    def compute_new_pendulum_states_amount_of_chaos_adaptive_step_size_method(
            self, currentStates, amountOfChaos, timeAlreadyExecuted,
            maxTimeToExecute, startFromDefaultState):
        logger.info('Computing new pendulum states with ' +
                    str(self.algorithm.name) + ' method')
        logger.info('Using the "amount of chaos" kernel')
        logger.info('time step: ' + str(self.timeStep) + ' seconds')
        logger.info('error tolerance: ' + str(self.errorTolerance))
        logger.info('amount of time already computed: ' +
                    str(timeAlreadyExecuted) + ' seconds')
        logger.info('max time to simulate: ' + str(maxTimeToExecute) +
                    ' seconds')
        logger.info('amount of time to simulate: ' +
                    str(maxTimeToExecute - timeAlreadyExecuted) + ' seconds')

        # Compute the double pendulum fractal image.
        logger.info('Running pendulum simulation kernel...')
        kernelStart = time.time()

        self.computeDoublePendulumFractalWithAmountOfChaosMethod(
            self.npFloatType(self.point1Mass),
            self.npFloatType(self.point2Mass),
            self.npFloatType(self.pendulum1Length),
            self.npFloatType(self.pendulum2Length),
            self.npFloatType(self.gravity),
            self.npFloatType(self.angle1Min),
            self.npFloatType(self.angle1Max),
            self.npFloatType(self.angle2Min),
            self.npFloatType(self.angle2Max),
            cuda.InOut(currentStates),
            cuda.In(amountOfChaos),
            np.int32(startFromDefaultState),
            self.npFloatType(timeAlreadyExecuted),
            np.int32(self.numberOfAnglesToTestX),
            np.int32(self.numberOfAnglesToTestY),
            self.npFloatType(self.timeStep),
            self.npFloatType(self.errorTolerance),
            self.npFloatType(maxTimeToExecute),
            # block=(1, 1, 1), grid=(1, 1))
            # block=(2, 2, 1), grid=(1, 1))
            # block=(4, 4, 1), grid=(4, 4))
            # block=(8, 8, 1), grid=(8, 8))
            block=(16, 16, 1),
            grid=(16, 16))
        # block=(32, 32, 1), grid=(32, 32))

        # Print the time it took to run the kernel.
        timeToExecuteLastKernel = time.time() - kernelStart
        logger.info('Completed pendulum simulation kernel in ' +
                    str(timeToExecuteLastKernel) + ' seconds')
Example #17
def test_pycuda_only():
    """Run pycuda only example to test that pycuda works."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = np.random.randn(100).astype(np.float32)
    b = np.random.randn(100).astype(np.float32)
    dest = np.zeros_like(a)
    multiply_them(drv.Out(dest),
                  drv.In(a),
                  drv.In(b),
                  block=(100, 1, 1),
                  grid=(1, 1))
    assert (dest == a * b).all()
Example #18
def run(path):
    # im = Image.open(path)
    # im.show()
    img1 = np.array(Image.open(path).convert('RGB'))  # open the image and convert it to a numeric matrix
    # print(type(img1))
    img2 = np.ones_like(img1) * 255
    # call the GPU kernels through CUDA
    operatepic = mod.get_function('operatepic')
    test = mod.get_function('test')
    print(img1.shape)
    h, w, c = img1.shape
    k = 21  # convolution kernel size
    b = Count * 10  # number of blocks
    t = 128  # threads per block
    info = [h, w, k, b, t]
    info = np.int32(info)
    print(info)
    start = time.time()
    # choose block and grid sizes that fit your GPU; exceeding its limits raises an error
    for i in range(10):
        operatepic(drv.In(info), drv.In(img1), drv.Out(img2), block=(t, 1, 1), grid=(b, 1))
    stop = time.time()
    print((stop - start) / 10)
Example #19
def avg_vote_gpu(nnf, img, patch_size):
    mod = SourceModule(open(
        os.path.join(package_directory, "GeneralizedPatchMatch.cu")).read(),
                       no_extern_c=True)
    avg_vote = mod.get_function("avg_vote")

    output = np.zeros([img.shape[-1], *nnf.shape[:2]], dtype=np.float32)
    threads = 20

    avg_vote(drv.In(xy_to_int(nnf)),
             drv.In(np.ascontiguousarray(img.transpose(2, 0, 1))),
             drv.InOut(output),
             np.int32(img.shape[-1]),
             np.int32(nnf.shape[0]),
             np.int32(nnf.shape[1]),
             np.int32(img.shape[0]),
             np.int32(img.shape[1]),
             np.int32(patch_size),
             block=(threads, threads, 1),
             grid=(get_blocks_for_dim(nnf.shape[1], threads),
                   get_blocks_for_dim(nnf.shape[0], threads)))

    return np.ascontiguousarray(output.transpose(1, 2, 0))
Example #20
def remove_empty_anchor(view, anchors, limit):
    # input:
    # anchors: (N, 4) 4->(y1, x1, y2, x2) (x > y)
    # view: (W, H, C) 

    mod = cuda.module_from_buffer(module_buff)
    func = mod.get_function('_Z12remove_emptyPfPiS_S0_S0_')

    anchors_shape = np.array(anchors.shape).astype(np.int32)
    view_shape = np.array(view.shape).astype(np.int32)
    index = np.zeros((anchors.shape[0], view_shape[2])).astype(np.float32)
    func(
        cuda.InOut(index), 
        cuda.In(anchors), 
        cuda.In(view), 
        cuda.In(anchors_shape), 
        cuda.In(view_shape), 
        block=(int(view_shape[2]), 1, 1),  # a thread <-> a value in a specific 2d pos(need to sum the channel)
        grid=(int(anchors_shape[0]), 50, 1)  # a grid <-> an anchor and a line(x)
        # 50 must > anchors width
    )
    index = np.sum(index, axis=1)
    return np.where(index > limit)[0]
Example #21
 def interpolate(self, x):
     r = R.from_euler('xyz', x, degrees=True)
     for pos, i in zip(self.sensor_pos, range(self.number_of_sensors)):
         self.input[i] = r.apply(pos)
     self.interpol(np.int32(self.number_of_sensors),
                   drv.In(self.input),
                   drv.Out(self.output),
                   texrefs=[self.texref],
                   block=self.bdim,
                   grid=self.gdim)
     output_rot = np.zeros((self.number_of_sensors, 3))
     for i in range(self.number_of_sensors):
         output_rot[i] = r.inv().apply(self.output[i])
     return self.input, self.output, output_rot
Example #22
def speed(n_samples, n_states, n_features, W1, W2):
    module = SourceModule(tpl.render(W1=W1, W2=W2))
    func = module.get_function('log_diag_mvn_likelihood')

    sequences = np.zeros((n_samples, n_features), dtype=np.float32)
    means = np.zeros((n_states, n_features), dtype=np.float32)
    variances = np.ones((n_states, n_features), dtype=np.float32)
    logvariances = np.ones((n_states, n_features), dtype=np.float32)
    loglikelihoods = np.zeros((n_samples, n_states), dtype=np.float32)

    N_THREADS = 4096

    start = cuda.Event()
    end = cuda.Event()

    start.record()
    func(cuda.In(sequences),
         cuda.In(means),
         cuda.In(variances),
         cuda.In(np.log(variances)),
         np.int32(n_samples),
         np.int32(n_states),
         np.int32(n_features),
         cuda.InOut(loglikelihoods),
         block=(W1 * W2, 1, 1),
         grid=(N_THREADS // (W1 * W2), 1))
    end.record()
    end.synchronize()

    return {
        'n_samples': n_samples,
        'n_states': n_states,
        'n_features': n_features,
        'W1': W1,
        'W2': W2,
        'time': np.around(start.time_till(end), decimals=3)
    }
Example #23
def calc_next_world_gpu(world, next_world):
    height, width = world.shape
    ## define the CUDA kernel
    mod = SourceModule("""
    __global__ void get_next_world(int *world, int *nextWorld, int height, int width){
        int x = threadIdx.x + blockIdx.x * blockDim.x;
        int y = threadIdx.y + blockIdx.y * blockDim.y;
        
        const int index = y * width + x;
        int current_value;
        int next_value;
        if (x >= width) {
            return;
        }
        if (y >= height) {
            return;
        }
        current_value = world[index];

        int numlive = 0;    
        // add height/width before taking % so the wrap-around index stays non-negative in C
        numlive += world[((y - 1 + height) % height) * width + ((x - 1 + width) % width)];
        numlive += world[((y - 1 + height) % height) * width + (x % width)];
        numlive += world[((y - 1 + height) % height) * width + ((x + 1) % width)];
        numlive += world[(y % height) * width + ((x - 1 + width) % width)];
        numlive += world[(y % height) * width + ((x + 1) % width)];
        numlive += world[((y + 1) % height) * width + ((x - 1 + width) % width)];
        numlive += world[((y + 1) % height) * width + (x % width)];
        numlive += world[((y + 1) % height) * width + ((x + 1) % width)];

        if (current_value == 0 && numlive == 3){
            next_value = 1;
        }else if (current_value == 1 && numlive >= 2 && numlive <= 3){
            next_value = 1;
        }else{
            next_value = 0;
        }
        nextWorld[index] = next_value; 
    }
    """)
    set_next_cell_value_GPU = mod.get_function("get_next_world")
    block = (BLOCKSIZE, BLOCKSIZE, 1)
    grid = ((width + block[0] - 1) // block[0],
            (height + block[1] - 1) // block[1])
    set_next_cell_value_GPU(cuda.In(world),
                            cuda.Out(next_world),
                            numpy.int32(height),
                            numpy.int32(width),
                            block=block,
                            grid=grid)
Example #24
def simulate_grid(module, Nobs, N, oversampling, datahits, datatimes, radius, d, dN, pa, ps, seed=666, Nthreads=64):
    
    Nruns = N*N
    Nphotons = Nobs*Nruns*oversampling
    Nobs *= oversampling
    print("Total Threads: %s" % Nphotons)
    assert(Nphotons <= 1.1e8)

    d = np.uint32(d)
    dN = np.uint32(dN)
    radius = np.float32(radius)
    Ndoms = np.uint32(pow((d/dN)*2+1, 3) - 1)

    t1 = time.time()
    rng_states = get_rng_states(module, Nphotons, seed=seed)
    t2 = time.time()

    d_list = get_doms(module, Ndoms, radius, d, dN)
    t3 = time.time()
    x = np.linspace(-20, 20, N)
    y = np.linspace(-20, 20, N)
    X, Y = np.meshgrid(x, y)
    Z = np.zeros(N*N)
    pInit = np.concatenate([X.flatten(), Y.flatten(), Z]).astype(np.float32)
    t4 = time.time()
    
    print("t2-t1: ", t2-t1)
    print("t3-t2: ", t3-t2)
    print("t4-t3: ", t4-t3)

    start = time.time()
    datahits = np.zeros(Ndoms*Nruns, dtype=np.int32)
    datatimesbinned = np.zeros(Ndoms*Nruns, dtype=np.float32)
    datatimes = np.zeros(Nphotons, dtype=np.float32)
    datapositions = np.zeros(Nphotons*3, dtype=np.float32)
    simulate = module.get_function('simulate_grid')

    simulate(np.uint64(Nphotons), np.uint64(Nobs), rng_states, d_list, cuda.InOut(datahits), cuda.InOut(datatimes), cuda.InOut(datatimesbinned), 
             cuda.InOut(datapositions), cuda.In(pInit), np.float32(pa), np.float32(ps),np.uint32(Ndoms), 
             block=(Nthreads, 1, 1), grid=(Nphotons//Nthreads + 1, 1))    

    print("end-start", time.time() - start)
    print("sumHits: ", sum(datahits))
    print()

    datahits = np.reshape(np.array(datahits, dtype=float), (Nruns, Ndoms))/oversampling
    datatimesbinned = np.reshape(np.array(datatimesbinned, dtype=float), (Nruns, Ndoms))/oversampling

    return datahits, np.array(datatimes, dtype=float), datatimesbinned, datapositions
Example #25
def onestepIteration(dist, timestep, maxit):
    """
    Iterates the input image on a 2-D grid through an Euler anisotropic
    diffusion operator with the given timestep, repeated maxit times.
    """
    image = 1 * dist
    forme = image.shape
    if (np.size(forme) > 2):
        sys.exit('Only works on gray images')

    aSize = forme[0] * forme[1]
    xdim = np.int32(forme[0])
    ydim = np.int32(forme[1])

    image[0, :] = image[1, :]
    image[xdim - 1, :] = image[xdim - 2, :]
    image[:, ydim - 1] = image[:, ydim - 2]
    image[:, 0] = image[:, 1]

    image = image.reshape(aSize, order='C').astype(np.float32)
    final = np.zeros(aSize).astype(np.float32)

    #reshaping the image matrix

    # block size: B := dim1*dim2*dim3 = 1024
    # grid size : dim1*dim2*dim3 = ceiling(aSize/B)
    blockX = 1024
    gridX = (aSize + blockX - 1) // blockX

    for k in range(0, maxit):
        diffIteration(drv.In(image),
                      drv.Out(final),
                      ydim,
                      xdim,
                      np.float32(timestep),
                      block=(blockX, 1, 1),
                      grid=(gridX, 1, 1))
        final = final.reshape(forme, order='C')
        final[0, :] = final[1, :]
        final[xdim - 1, :] = final[xdim - 2, :]
        final[:, ydim - 1] = final[:, ydim - 2]
        final[:, 0] = final[:, 1]
        image = final.reshape(aSize, order='C').astype(np.float32)

    return final.reshape(forme, order='C')
Example #26
def hitungcuda(a5):
    # cuda.init()
    # device = cuda.Device(0)
    # ctx = device.make_context()
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void conv5(float *r5r, float *r5i, float *a5, float *f5r, float *f5i)
    {
        const int i = blockDim.x * blockIdx.x + threadIdx.x;
        const int j = blockDim.y * blockIdx.y + threadIdx.y;
        int Idx = i + j * blockDim.x * gridDim.x;
        r5r[Idx] = a5[Idx] * f5r[Idx];
        r5i[Idx] = a5[Idx] * f5i[Idx];
    }
    """)
    # ctx.pop()
    conv5 = mod.get_function("conv5")
    conv5(cuda.Out(r5r),
          cuda.Out(r5i),
          cuda.In(a5),
          cuda.In(f5r),
          cuda.In(f5i),
          block=(68, 4, 1),
          grid=(5, 5))
Example #27
    def run_gpu(self, list_one, list_two, dimension, window):
        """ run CUDA GPU computing """
        sys.stdout.flush()
        list_one_float = np.array(list_one).astype(np.float32)
        list_two_float = np.array(list_two).astype(np.float32)
        dest = np.zeros_like(list_one_float)

        xdim = self.MAX_THREADS * self.capability[0]
        ydim = 1
        zdim = 1
        array_len = np.asarray(np.int32(len(dest)))

        blocks_per_grid = int(math.ceil(len(dest)/float(xdim)))

        self.cuda_exec_func(
            drv.Out(dest),
            drv.In(list_one_float),
            drv.In(list_two_float),
            drv.In(array_len),
            block=(xdim, ydim, zdim),
            grid=(blocks_per_grid, 1)
            )

        return dest.tolist()
Example #28
    def cuda_hamming_dist(self, vec_a, vec_b):

        #dest = numpy.zeros_like(vec_b)
        dest = numpy.array(vec_b)
        length = numpy.array([vec_b.shape[0]]).astype(numpy.uint64)

        #for d in dest:
        #    print d

        custom_grid = (int(math.ceil(float(length[0]) / (50 * 256))), 1)
        print("custom grid: ", custom_grid)

        self.hamming_dist(drv.In(vec_a),
                          drv.InOut(dest),
                          drv.In(length),
                          block=self.block,
                          grid=custom_grid)

        print(dest)
        #for d in dest:
        #    print d
        print(dest.shape)

        return dest
Example #29
def compute_hist(values, bins):
    hist = np.zeros(bins).astype(np.int32)

    hist_func = mod.get_function("hist")
    block = (128, 1, 1)
    grid = (int((len(values) + block[0] - 1) / block[0]), 1, 1)

    hist_func(cuda.In(values),
              np.int32(len(values)),
              cuda.InOut(hist),
              np.int32(bins),
              grid=grid,
              block=block)

    return hist
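A host-side cross-check (assuming the kernel bins non-negative integer values directly by value, which the int32 values argument suggests) can use numpy.bincount:

import numpy as np

def compute_hist_cpu(values, bins):
    # counts of each integer value in [0, bins)
    return np.bincount(values.astype(np.int64), minlength=bins)[:bins].astype(np.int32)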
Example #30
    def __call__(self, Qdrift, Tdrift, dT, Pdrift, dP, **kwds):

        depos = numpy.vstack(
            (Qdrift.cpu().numpy(), Tdrift.cpu().numpy(), dT.cpu().numpy(),
             Pdrift.cpu().numpy(), dP.cpu().numpy())).T.copy(order='C')
        ndepos = depos.shape[0]

        tbeg = self.tb.bin_trunc(Tdrift - self.nsigma * dT).cpu().numpy()
        tend = self.tb.bin_trunc(Tdrift + self.nsigma * dT).cpu().numpy()
        pbeg = self.pb.bin_trunc(Pdrift - self.nsigma * dP).cpu().numpy()
        pend = self.pb.bin_trunc(Pdrift + self.nsigma * dP).cpu().numpy()

        tnum = tend - tbeg + 1
        pnum = pend - pbeg + 1

        Nticks = numpy.zeros(ndepos) + self.shape[1]

        offsets = numpy.vstack((tbeg, pbeg, Nticks))
        offsets = offsets.T.copy(order='C')

        out = numpy.zeros(self.shape, dtype=numpy.float32)

        t0 = time.time()
        for idepo in range(ndepos):
            depo = depos[idepo]
            offset = offsets[idepo]
            block = (int(tnum[idepo]), int(pnum[idepo]), 1)
            #print (idepo, block, offset)
            print(idepo, depo)
            self.meth(drv.InOut(out),
                      drv.In(self.bindesc),
                      drv.In(offset),
                      drv.In(depo),
                      block=block)
        t1 = time.time()
        print(t1 - t0)
        return out