Пример #1
0
    def allocate_buffered_data_arrays(self, **kwargs):
        """
        Create the page-aligned host buffers used for the input data and
        register each of them with ``cuda.register_host_memory``.

        The buffer length is taken from the ``n0`` keyword (default
        ``self.n0``). When ``self.buffered_transfer`` is truthy the
        ``n0_buffer`` keyword (default ``self.n0_buffer``) is used instead.

        Buffers created:
          * ``self.t`` (``self.real_type``) and ``self.y`` (``self.ytype``),
            always;
          * ``self.dy`` when ``self.weighted`` is truthy;
          * ``self.mag_bwf`` (length ``self.mag_bins``) when
            ``self.balanced_magbins`` is truthy;
          * ``self.mag_bin_fracs`` (length ``self.mag_bins``) when
            ``self.compute_log_prob`` is truthy.

        Returns ``self`` so calls can be chained. Raises ``AssertionError``
        when the resolved length is ``None``.
        """
        length = kwargs.get('n0', self.n0)
        if self.buffered_transfer:
            length = kwargs.get('n0_buffer', self.n0_buffer)
        assert length is not None

        def make_registered(attr, shape, **alloc_kw):
            # Store the aligned array first, then replace it with the
            # registered version of that same array.
            setattr(self, attr, cuda.aligned_zeros(shape=shape, **alloc_kw))
            setattr(self, attr,
                    cuda.register_host_memory(getattr(self, attr)))

        # Options shared by every buffer that stores ``real_type`` values.
        real_opts = dict(dtype=self.real_type,
                         alignment=resource.getpagesize())

        make_registered('t', (length, ), **real_opts)
        make_registered('y', (length, ),
                        dtype=self.ytype,
                        alignment=resource.getpagesize())

        if self.weighted:
            make_registered('dy', (length, ), **real_opts)

        if self.balanced_magbins:
            make_registered('mag_bwf', (self.mag_bins, ), **real_opts)

        if self.compute_log_prob:
            make_registered('mag_bin_fracs', (self.mag_bins, ), **real_opts)

        return self
Пример #2
0
    def allocate_buffered_data_arrays(self, **kwargs):
        """
        Allocate the reusable host-side lightcurve buffers ``self.t``,
        ``self.yw`` and ``self.w``.

        Each buffer is a page-aligned zero array of ``self.real_type`` that is
        then passed through ``cuda.register_host_memory``. The length comes
        from the ``n0`` keyword (default ``self.n0``), or from the
        ``n0_buffer`` keyword (default ``self.n0_buffer``) when
        ``self.buffered_transfer`` is truthy.

        Returns ``self``. Raises ``AssertionError`` when the resolved length
        is ``None``.
        """
        length = kwargs.get('n0', self.n0)
        if self.buffered_transfer:
            length = kwargs.get('n0_buffer', self.n0_buffer)
        assert length is not None

        # All three buffers share the same shape, dtype and alignment.
        for attr in ('t', 'yw', 'w'):
            setattr(self, attr,
                    cuda.aligned_zeros(shape=(length, ),
                                       dtype=self.real_type,
                                       alignment=resource.getpagesize()))
            setattr(self, attr,
                    cuda.register_host_memory(getattr(self, attr)))
        return self
Пример #3
0
def initialize(num_points):
    """
    Build the initial moment state array and the grid spacing.

    Relies on names defined outside this function: ``num_moments``,
    ``num_coords``, ``num_nodes``, ``indices``, ``jet_initialize_moments``
    and ``projection``.

    Parameters
    ----------
    num_points : int
        Number of columns of the state array. The spacing and the
        dislocation index are computed from ``num_points - 2``.

    Returns
    -------
    tuple
        ``(state, grid_spacing)`` where ``state`` is a float32 array of shape
        ``(num_moments, num_points)`` and ``grid_spacing`` is
        ``1 / (num_points - 2)``.
    """
    # Note: it is important to keep the parallelizable (largest) index on the
    # innermost dimension.
    state = cuda.aligned_zeros((num_moments, num_points), dtype=np.float32)

    wts_left, wts_right, xi_left, xi_right = jet_initialize_moments(
        num_coords, num_nodes)

    n_pt = num_points - 2
    grid_spacing = 1 / n_pt
    disc_loc = 0.125
    disc_idx = int(n_pt * disc_loc) - 2
    print('Dislocation index is ', disc_idx, ' out of ', n_pt, ' points')

    # Moment vectors of the left and right states, each wrapped as one row.
    left_row = np.asarray(
        [projection(wts_left, xi_left, indices, num_coords, num_nodes)])
    right_row = np.asarray(
        [projection(wts_right, xi_right, indices, num_coords, num_nodes)])

    # Only the first ``disc_idx`` and the last ``disc_idx`` columns are
    # written here; the columns in between keep their zero initial value.
    # NOTE(review): for small grids disc_idx is 0 or negative, which changes
    # what these slices select - confirm the supported range of num_points.
    state[:, :disc_idx] = left_row.T
    state[:, -disc_idx:] = right_row.T

    # The two end columns are overwritten with the opposite side's moments
    # (presumably boundary/ghost cells - TODO confirm).
    state[:, 0] = right_row
    state[:, -1] = left_row

    return state, grid_spacing
Пример #4
0
    def allocate(self, data):
        """
        Allocate host and device buffers for a batch of inputs.

        Parameters
        ----------
        data : sequence of ``(t, y, w, freqs)``
            Only the lengths of ``t`` and ``freqs`` and the values of
            ``freqs`` are used here.

        Returns
        -------
        tuple
            ``(gpu_data, pow_cpus)``. ``gpu_data`` holds one
            ``(t_g, y_g, w_g, freqs_g, pow_g)`` tuple per input, where
            ``t_g``/``y_g``/``w_g`` are ``None`` when ``t`` is empty.
            ``pow_cpus`` holds the matching page-aligned, registered float32
            host arrays of length ``len(freqs)``.
        """
        # Make sure there is at least one stream per input.
        missing_streams = len(data) - len(self.streams)
        if missing_streams > 0:
            self._create_streams(missing_streams)

        device_side = []
        host_side = []

        for t, _y, _w, freqs in data:
            host_power = cuda.register_host_memory(
                cuda.aligned_zeros(shape=(len(freqs), ),
                                   dtype=np.float32,
                                   alignment=resource.getpagesize()))

            if len(t) > 0:
                t_g, y_g, w_g = (gpuarray.zeros(len(t), dtype=np.float32)
                                 for _ in range(3))
            else:
                t_g = y_g = w_g = None

            pow_g = gpuarray.zeros(len(host_power), dtype=host_power.dtype)
            freqs_g = gpuarray.to_gpu(np.asarray(freqs).astype(np.float32))

            device_side.append((t_g, y_g, w_g, freqs_g, pow_g))
            host_side.append(host_power)

        return device_side, host_side
Пример #5
0
    def _init_memory(self) -> None:
        '''
        Initialize GPU memory 

        each GPU gets its own number of streams, and each stream gets its own
        memory allocation
        '''
        # initialize memory lists

        self.moments_device = [[]] * self.num_device
        self.moment_chunk_host = [[]] * self.num_device
        self.x_chunk_host = [[]] * self.num_device
        self.y_chunk_host = [[]] * self.num_device
        self.w_chunk_host = [[]] * self.num_device
        self.x_device = [[]] * self.num_device
        self.y_device = [[]] * self.num_device
        self.w_device = [[]] * self.num_device

        self.c_moments = [[]] * self.num_device
        self.mu = [[]] * self.num_device
        self.yf = [[]] * self.num_device

        self.m1 = [[]] * self.num_device
        self.x1 = [[]] * self.num_device
        self.w1 = [[]] * self.num_device
        self.x2 = [[]] * self.num_device
        self.w2 = [[]] * self.num_device

        # Host memory that stores the output
        self.w_out = cuda.aligned_zeros((4, self.in_size), dtype=np.float32)
        self.x_out = cuda.aligned_zeros((4, self.in_size), dtype=np.float32)
        self.y_out = cuda.aligned_zeros((4, self.in_size), dtype=np.float32)

        self.streams = [[]] * self.num_device
        # number of input allocated to each thread
        size_per_thread = np.ceil(self.in_size / self.num_device)

        mem_thread = []
        for i, ctx in enumerate(self.context_list):
            mem_thread.append(
                threading.Thread(target=self._init_thread_memory,
                                 args=(i, ctx, size_per_thread)))
            mem_thread[i].start()

        for t in mem_thread:
            t.join()
Пример #6
0
def init_moment_10(size: int):
    """
    Build a ``(10, size)`` float32 array in which every column holds the
    same fixed 10-entry moment vector.

    Parameters
    ----------
    size : int
        Number of columns (copies of the moment vector).

    Returns
    -------
    The array returned by ``cuda.aligned_zeros`` after it has been filled.
    """
    one_moment = np.asarray(
        [1, 1, 1, 1.01, 1, 1.01, 1.03, 1.03, 1.0603, 1.0603], dtype=np.float32)
    moments = cuda.aligned_zeros((10, size), dtype=np.float32)
    # A single broadcast assignment fills all columns at once; a Python-level
    # loop over `size` columns is very slow for the multi-million-column
    # inputs this helper is called with.
    moments[:] = one_moment[:, np.newaxis]

    return moments
Пример #7
0
def init_moment_6(size: int):
    '''
    Initialize a dummy input of specified size for Chyqmom4

    Returns a ``(6, size)`` float32 array in which every column holds the
    same fixed 6-entry moment vector.
    '''
    one_moment = np.asarray([1.0, 1.0, 1.0, 1.01, 1, 1.01], dtype=np.float32)
    moments = cuda.aligned_zeros((6, size), dtype=np.float32)
    # Fill every column in one broadcast assignment instead of looping over
    # `size` columns in Python.
    moments[:] = one_moment[:, np.newaxis]

    return moments
Пример #8
0
    def allocate_pinned_cpu(self, **kwargs):
        """
        Allocate the host result buffer ``self.ce_c``.

        The buffer is a page-aligned zero array of ``self.real_type`` with
        ``nf`` entries (keyword ``nf``, default ``self.nf``), registered via
        ``cuda.register_host_memory``.

        Returns ``self``. Raises ``AssertionError`` when ``nf`` resolves to
        ``None``.
        """
        n_freqs = kwargs.get('nf', self.nf)
        assert n_freqs is not None

        alloc_opts = dict(shape=(n_freqs, ),
                          dtype=self.real_type,
                          alignment=resource.getpagesize())
        self.ce_c = cuda.aligned_zeros(**alloc_opts)
        self.ce_c = cuda.register_host_memory(self.ce_c)

        return self
Пример #9
0
    def allocate_pinned_cpu(self, **kwargs):
        """
        Allocate the host buffer ``self.lsp_c`` used for asynchronous
        transfer of the result.

        ``nf`` (keyword ``nf``, default ``self.nf``) is the buffer length.
        The buffer is a page-aligned zero array of ``self.real_type`` that is
        registered via ``cuda.register_host_memory``.

        Returns ``self``. Raises ``AssertionError`` when ``nf`` resolves to
        ``None``.
        """
        length = kwargs.get('nf', self.nf)
        assert length is not None

        page_size = resource.getpagesize()
        self.lsp_c = cuda.aligned_zeros(shape=(length,),
                                        dtype=self.real_type,
                                        alignment=page_size)
        self.lsp_c = cuda.register_host_memory(self.lsp_c)

        return self
Пример #10
0
    def allocate_pinned_arrays(self, nfreqs=None, ndata=None):
        """
        Allocate the page-aligned, registered host arrays of this object.

        Parameters
        ----------
        nfreqs : int, optional
            Length of ``bls``, ``nbins0`` and ``nbinsf``; defaults to
            ``int(self.max_nfreqs)``.
        ndata : int, optional
            Length of ``t``, ``yw`` and ``w``; defaults to
            ``int(self.max_ndata)``.

        ``bls``, ``t``, ``yw`` and ``w`` use ``self.rtype``; ``nbins0`` and
        ``nbinsf`` use ``np.int32``.
        """
        nfreqs = int(self.max_nfreqs) if nfreqs is None else nfreqs
        ndata = int(self.max_ndata) if ndata is None else ndata

        # (attribute name, length, dtype) in allocation order.
        layout = (
            ('bls', nfreqs, self.rtype),
            ('nbins0', nfreqs, np.int32),
            ('nbinsf', nfreqs, np.int32),
            ('t', ndata, self.rtype),
            ('yw', ndata, self.rtype),
            ('w', ndata, self.rtype),
        )
        for attr, length, dtype in layout:
            setattr(self, attr,
                    cuda.aligned_zeros(shape=(length,),
                                       dtype=dtype,
                                       alignment=resource.getpagesize()))
            setattr(self, attr,
                    cuda.register_host_memory(getattr(self, attr)))
Пример #11
0
import pycuda.autoinit
import numpy as np

if __name__ == "__main__":
    # Benchmark driver: call chyqmom9_pycuda `trial` times for each of
    # `num_points` input sizes spaced logarithmically between 10**0 and
    # 10**max_input_size_mag, and save everything to a CSV file.

    res_file_name = 'result_pycuda.csv'
    max_input_size_mag = 6
    num_points = 200
    trial = 5

    # Row layout: column 0 is the input size, columns 1..trial hold the value
    # returned by each of the `trial` calls.
    result = np.zeros((num_points, trial + 1))

    for idx, in_size in enumerate(np.logspace(0, max_input_size_mag, num=num_points)):
        n_inputs = int(in_size)
        this_result = np.zeros(trial + 1)
        this_result[0] = n_inputs

        w = cuda.aligned_zeros((9, n_inputs), dtype=np.float32)
        x = cuda.aligned_zeros((9, n_inputs), dtype=np.float32)
        y = cuda.aligned_zeros((9, n_inputs), dtype=np.float32)

        this_moment = init_moment_10(n_inputs)
        # Fill all `trial` result columns. The previous range(1, trial)
        # stopped one short and always left the last column at zero.
        for i in range(1, trial + 1):
            this_result[i] = chyqmom9_pycuda(this_moment, n_inputs, w, x, y, 1)
        result[idx] = this_result
        print(n_inputs, ": ", this_result[1])
    np.savetxt(res_file_name, result, delimiter=',')




Пример #12
0
def chyqmom27(
    moments: np.ndarray,
    size: int):
    """
    Run the chyqmom27 kernel sequence on the GPU and time it.

    Parameters
    ----------
    moments : np.ndarray
        Host array copied to the device with ``cuda.memcpy_htod``. The device
        buffer that receives it holds ``16 * size`` float32 values.
    size : int
        Problem size. Every device buffer is sized as a multiple of it and it
        is forwarded to the kernels as ``np.int32``.

    Returns
    -------
    tuple
        ``(elapsed_time, w_dev, x_dev, y_dev, z_dev)``. ``elapsed_time`` is
        the value of ``Event.time_since`` between the events recorded around
        the kernel launches. The four ``*_dev`` entries are device
        allocations of ``27 * size`` float32 values each; they are NOT freed
        here, so the caller is responsible for releasing them.
    """
    float_bytes = np.dtype(np.float32).itemsize
    # Byte counts are computed with plain Python ints. Doing this arithmetic
    # on np.int32 scalars wraps around once 4 * size * 27 exceeds 2**31 - 1.
    count = int(size)
    size = np.int32(size)

    BlockSize = (256, 1, 1)
    # Ceiling division: enough blocks to cover `count` threads.
    GridSize = ((count + BlockSize[0] - 1) // BlockSize[0], 1, 1)
    launch = dict(block=BlockSize, grid=GridSize)

    def alloc_floats(rows):
        # Device buffer holding `rows * size` float32 values.
        return cuda.mem_alloc(float_bytes * count * rows)

    # compile kernel
    HYQ = SourceModule(HYQMOM)
    CHY27 = SourceModule(CHYQMOM27)
    hyqmom3 = HYQ.get_function('hyqmom3')

    c_kernel = CHY27.get_function('chyqmom27_cmoments')
    chyqmom27_rho_yf = CHY27.get_function('chyqmom27_rho_yf')
    chyqmom27_zf = CHY27.get_function('chyqmom27_zf')
    chyqmom27_mu = CHY27.get_function('chyqmom27_mu')
    float_value_set = CHY27.get_function('float_value_set')
    float_array_set = CHY27.get_function('float_array_set')
    chyqmom27_set_m = CHY27.get_function('chyqmom27_set_m')
    chyqmom27_wout = CHY27.get_function('chyqmom27_wout')
    chyqmom27_xout = CHY27.get_function('chyqmom27_xout')
    chyqmom27_yout = CHY27.get_function('chyqmom27_yout')
    chyqmom27_zout = CHY27.get_function('chyqmom27_zout')

    # Allocate memory
    moments_device = alloc_floats(16)
    c_moments = alloc_floats(12)

    m = alloc_floats(10)
    # Presumably sets m[0:size] to 1 and m[size:2*size] to 0 - confirm
    # against the float_value_set kernel.
    float_value_set(m, np.float32(1), size, np.int32(0), **launch)
    float_value_set(m, np.float32(0), size, size, **launch)

    w1 = alloc_floats(3)
    x1 = alloc_floats(3)

    w2 = alloc_floats(9)
    x2 = alloc_floats(9)
    y2 = alloc_floats(9)

    rho = alloc_floats(9)
    yf = alloc_floats(3)
    yp = alloc_floats(9)
    zf = alloc_floats(3)

    w3 = alloc_floats(3)
    x3 = alloc_floats(3)

    mu = alloc_floats(3)

    # Output buffers; ownership passes to the caller.
    w_dev = alloc_floats(27)
    x_dev = alloc_floats(27)
    y_dev = alloc_floats(27)
    z_dev = alloc_floats(27)

    cuda.memcpy_htod(moments_device, moments)

    time_before = cuda.Event()
    time_after = cuda.Event()

    time_before.record()

    c_kernel(moments_device, c_moments, size, **launch)
    float_array_set(m, c_moments, size, np.int32(2) * size, np.int32(0), **launch)
    float_array_set(m, c_moments, size, np.int32(3) * size, np.int32(6) * size, **launch)
    float_array_set(m, c_moments, size, np.int32(4) * size, np.int32(9) * size, **launch)

    hyqmom3(m, x1, w1, size, **launch)

    chyqmom27_set_m(m, c_moments, size, **launch)

    chyqmom9(m, size, w2, x2, y2)

    chyqmom27_rho_yf(c_moments, y2, w2, rho, yf, yp, size, **launch)
    chyqmom27_zf(c_moments, x1, zf, size, **launch)
    chyqmom27_mu(c_moments, rho, zf, mu, size, **launch)

    float_array_set(m, mu, size, np.int32(2) * size, np.int32(0), **launch)
    float_array_set(m, mu, size, np.int32(3) * size, np.int32(1) * size, **launch)
    float_array_set(m, mu, size, np.int32(4) * size, np.int32(2) * size, **launch)
    hyqmom3(m, x3, w3, size, **launch)

    chyqmom27_wout(moments_device, w1, rho, w3, w_dev, size, **launch)
    chyqmom27_xout(moments_device, x1, x_dev, size, **launch)
    chyqmom27_yout(moments_device, yf, yp, y_dev, size, **launch)
    # NOTE(review): `size` was missing from this launch although every
    # sibling *_out kernel receives it as the last argument - confirm against
    # the chyqmom27_zout kernel signature.
    chyqmom27_zout(moments_device, zf, x3, z_dev, size, **launch)

    time_after.record()
    time_after.synchronize()
    elapsed_time = time_after.time_since(time_before)

    # Release the intermediate buffers. The four output buffers are returned
    # to the caller and therefore stay allocated.
    scratch = (moments_device, c_moments,
               m, w1, x1,
               w2, x2, y2,
               rho, yf, yp, zf,
               w3, x3,
               mu)
    for buf in scratch:
        buf.free()

    return elapsed_time, w_dev, x_dev, y_dev, z_dev
Пример #13
0
def single_advance_gpu(state, num_points, grid_space):
    """
    Compute the right-hand side for one step on the GPU and time it.

    Relies on names defined outside this function: ``num_moments``,
    ``num_nodes``, ``indices``, ``sizeof_float``, ``QUAD`` and ``chyqmom27``.

    Parameters
    ----------
    state
        Host array passed to ``chyqmom27``; it is also the size template for
        the two flux buffers.
    num_points
        Number of grid points; forwarded to the kernels as ``np.int32``.
    grid_space
        Grid spacing; forwarded to the ``flux_3d`` kernel as ``np.float32``.

    Returns
    -------
    tuple
        ``(rhs, total_time, quad_time)``. ``rhs`` is a float32 host array of
        shape ``(num_moments, num_points)`` copied back from the second flux
        buffer. ``total_time`` is measured from before ``chyqmom27`` to after
        the last kernel.
    """
    rhs = cuda.aligned_zeros((num_moments, num_points), dtype=np.float32)
    time_before, time_1, time_after = cuda.Event(), cuda.Event(), cuda.Event()

    # Device-side inputs and scratch buffers.
    indices_device = cuda.mem_alloc_like(indices)
    cuda.memcpy_htod(indices_device, indices)

    f_min = cuda.mem_alloc(
        int(sizeof_float * num_moments * num_nodes * num_points))
    f_max = cuda.mem_alloc(
        int(sizeof_float * num_moments * num_nodes * num_points))

    flux_1 = cuda.mem_alloc_like(state)
    flux_2 = cuda.mem_alloc_like(state)

    # Launch configuration: enough 256-thread blocks to cover num_points.
    BlockSize = (256, 1, 1)
    GridSize = (int((num_points + BlockSize[0] - 1) / BlockSize[0]), 1, 1)
    launch = dict(block=BlockSize, grid=GridSize)

    domain_get_flux = QUAD.get_function('domain_get_flux_3d')
    fsum = QUAD.get_function('fsum_3d')
    flux_out = QUAD.get_function('flux_3d')

    time_before.record()
    # chyqmom27 returns device allocations for w, x, y, z (not host arrays).
    _, w, x, y, z = chyqmom27(state, num_points)
    time_1.record()

    domain_get_flux(w, x, y, z, indices_device,
                    f_min, f_max,
                    np.int32(num_moments),
                    np.int32(num_nodes),
                    np.int32(num_points),
                    **launch)
    fsum(flux_1, f_min, f_max,
         np.int32(num_moments),
         np.int32(num_nodes),
         np.int32(num_points),
         **launch)
    flux_out(flux_1, flux_2, np.float32(grid_space),
             np.int32(num_moments),
             np.int32(num_points),
             **launch)

    time_after.record()
    time_1.synchronize()
    time_after.synchronize()

    total_time = time_after.time_since(time_before)
    # NOTE(review): this interval runs from time_1 to time_after, i.e. it
    # covers the three flux kernels launched after chyqmom27 returned -
    # confirm that this is what "quad_time" is meant to report.
    quad_time = time_after.time_since(time_1)

    cuda.memcpy_dtoh(rhs, flux_2)
    for device_buf in (w, x, y, z):
        device_buf.free()
    return rhs, total_time, quad_time
Пример #14
0
    moments = cuda.aligned_zeros((10, size), dtype=np.float32)
    for i in range(size):
        moments[:, i] = one_moment

    return moments


if __name__ == '__main__':
    # Script entry point: build a (10, num_moments) input with
    # init_moment_10, allocate three (9, num_moments) float32 host output
    # arrays, and call chyqmom9_pycuda once on them.
    num_moments = 10000000
    batch_size = 4
    moments = init_moment_10(num_moments)
    # flatten to 1d array
    # moments = moments.flatten()

    # outputs
    w = cuda.aligned_zeros((9, num_moments), dtype=np.float32)
    x = cuda.aligned_zeros((9, num_moments), dtype=np.float32)
    y = cuda.aligned_zeros((9, num_moments), dtype=np.float32)

    # The return value is kept in time1 but not used in the visible code.
    time1 = chyqmom9_pycuda(moments, num_moments, w, x, y, batch_size)
    # time2 = chyqmom9_pycuda(moments, num_moments, w, x, y, batch_size)
    # time3 = chyqmom9_pycuda(moments, num_moments, w, x, y, batch_size)
    # time4 = chyqmom9_pycuda(moments, num_moments, w, x, y, batch_size)
    print("Done")

    # for j in range(num_moments):
    #     try:
    #         if np.abs(w[0, j] - 0.027791) > 1e-3: raise ValueError
    #         if np.abs(w[1, j] - 0.111124) > 1e-3: raise ValueError
    #         if np.abs(w[2, j] - 0.027791) > 1e-3: raise ValueError
    #         if np.abs(w[3, j] - 0.111124) > 1e-3: raise ValueError