Example no. 1
    def _concat(cls, objs):
        head = objs[0]
        for o in objs:
            if not o.is_type_equivalent(head):
                raise ValueError("All series must be of same type")

        newsize = sum(map(len, objs))
        # Concatenate data
        mem = cuda.device_array(shape=newsize, dtype=head.data.dtype)
        data = Buffer.from_empty(mem)
        for o in objs:
            data.extend(o.data.to_gpu_array())

        # Concatenate mask if present
        if all(o.has_null_mask for o in objs):
            # FIXME: Inefficient
            mem = cuda.device_array(shape=newsize, dtype=np.bool_)
            mask = Buffer.from_empty(mem)
            null_count = 0
            for o in objs:
                mask.extend(o._get_mask_as_series().to_gpu_array())
                null_count += o._null_count
            mask = Buffer(utils.boolmask_to_bitmask(mask.to_array()))
        else:
            mask = None
            null_count = 0

        col = head.replace(data=data, mask=mask, null_count=null_count)
        return col
Example no. 2
    def test_profiling(self):
        with cuda._profiling():
            a = cuda.device_array(10)
            del a

        with cuda._profiling():
            a = cuda.device_array(100)
            del a
Example no. 3
def getGraphFromEdges_gpu(dest, weight, fe, od, edges, n_edges = None,
                          MAX_TPB = 512, stream = None):
    """
    All inputs (except MAX_TPB and stream) are device arrays.
    edges       : array with the IDs of the edges that will be part of the new graph
    n_edges     : array of 1 element with the number of valid edges in the edges array;
                  if n_edges < size of edges, the last elements of the edges array are
                  not considered
    """

    # check if number of valid edges was received
    if n_edges is None:
        edges_size = edges.size
        n_edges = cuda.to_device(np.array([edges_size], dtype = np.int32))
    else:
        edges_size = int(n_edges.getitem(0))

    # check if a stream was received, if not create one
    if stream is None:
        myStream = cuda.stream()
    else:
        myStream = stream
    
    new_n_edges = edges_size * 2

    # allocate memory for new graph
    ndest = cuda.device_array(new_n_edges, dtype = dest.dtype,
                              stream = myStream)
    nweight = cuda.device_array(new_n_edges, dtype = weight.dtype,
                                stream = myStream)
    nfe = cuda.device_array_like(fe, stream = myStream)
    nod = cuda.device_array_like(od, stream = myStream)

    # fill new outdegree with zeros
    vertexGrid = compute_cuda_grid_dim(nod.size, MAX_TPB)
    memSet[vertexGrid, MAX_TPB, myStream](nod, 0)

    # count all edges of new array and who they belong to
    edgeGrid = compute_cuda_grid_dim(edges_size, MAX_TPB)
    countEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, fe, od, nod)

    # get new first_edge array from new outdegree
    nfe.copy_to_device(nod, stream=myStream)
    ex_prefix_sum_gpu(nfe, MAX_TPB = MAX_TPB, stream = myStream)


    # copy new first_edge to top_edge to serve as pointer in adding edges
    top_edge = cuda.device_array_like(nfe, stream = myStream)
    top_edge.copy_to_device(nfe, stream = myStream)

    addEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, weight, fe, od,
                                          top_edge, ndest, nweight)

    del top_edge
    #del dest, weight, fe, od
    return ndest, nweight, nfe, nod
Example no. 4
def arange(start, stop=None, step=1, dtype=np.int64):
    if stop is None:
        start, stop = 0, start
    size = (stop - start + (step - 1)) // step
    out = cuda.device_array(size, dtype=dtype)
    gpu_arange.forall(size)(start, size, step, out)
    return out
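The gpu_arange kernel itself is not shown in this snippet; below is a minimal sketch of a kernel matching the call above (only the name and signature come from the snippet, the body is an assumption):

from numba import cuda

@cuda.jit
def gpu_arange(start, size, step, out):
    # One thread per output element; forall() picks the launch configuration.
    i = cuda.grid(1)
    if i < size:
        out[i] = start + i * step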
Example no. 5
def gather(data, index, out=None):
    """Perform ``out = data[index]`` on the GPU
    """
    if out is None:
        out = cuda.device_array(shape=index.size, dtype=data.dtype)
    gpu_gather.forall(index.size)(data, index, out)
    return out
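gpu_gather is likewise not shown; a minimal sketch consistent with the forall(index.size) launch (the body is an assumption):

from numba import cuda

@cuda.jit
def gpu_gather(data, index, out):
    # out[i] = data[index[i]], one thread per index entry
    i = cuda.grid(1)
    if i < index.size:
        out[i] = data[index[i]]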
Example no. 6
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = PRNG(PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths

    stream.synchronize()
Example no. 7
    def test_gufunc_stream(self):
        #cuda.driver.flush_pending_free()
        matrix_ct = 1001 # an odd number to test thread/block division in CUDA
        A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,
                                                                   4)
        B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,
                                                                   5)

        ts = time()
        stream = cuda.stream()
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)

        dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)
        dC = gufunc(dA, dB, out=dC, stream=stream)
        C = dC.copy_to_host(stream=stream)
        stream.synchronize()

        tcuda = time() - ts

        ts = time()
        Gold = ut.matrix_multiply(A, B)
        tcpu = time() - ts

        stream_speedups.append(tcpu / tcuda)

        self.assertTrue(np.allclose(C, Gold))
Example no. 8
def prescan_test():

    a = np.arange(2048).astype(np.int32)
    reference = np.empty_like(a)

    ref_sum = scan.exprefixsumNumba(a, reference)

    a1 = np.arange(1024).astype(np.int32)
    a2 = np.arange(1024, 2048).astype(np.int32)

    ref1 = np.empty_like(a1)
    ref2 = np.empty_like(a2)

    ref_sum1 = scan.exprefixsumNumba(a1, ref1)
    ref_sum2 = scan.exprefixsumNumba(a2, ref2)

    dAux = cuda.device_array(2, dtype = np.int32)
    dA = cuda.to_device(a)

    sm_size = 1024 * a.dtype.itemsize

    scan.prescan[2, 512, 0, sm_size](dA, dAux)

    aux = dAux.copy_to_host()
    a_gpu = dA.copy_to_host()

    print "finish"
Example no. 9
def sum_parts(data):
    """
    Driver for ``gpu_single_block_sum`` kernel
    """
    arr = np.asarray(data, dtype=np.float32)
    out = cuda.device_array(1, dtype=np.float32)
    gpu_single_block_sum[1, gpu_block_sum_max_blockdim](arr, out)
    return out.copy_to_host()[0]
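Only the driver is shown; the kernel could be a standard single-block shared-memory reduction. A sketch, assuming gpu_block_sum_max_blockdim is a power-of-two block size such as 128 (the names come from the snippet, the body is an assumption):

from numba import cuda, float32

gpu_block_sum_max_blockdim = 128

@cuda.jit
def gpu_single_block_sum(arr, out):
    # Each thread sums a strided slice of the input, then the block
    # combines the partial sums with a shared-memory tree reduction.
    tmp = cuda.shared.array(gpu_block_sum_max_blockdim, dtype=float32)
    tid = cuda.threadIdx.x
    acc = 0.0
    for i in range(tid, arr.size, cuda.blockDim.x):
        acc += arr[i]
    tmp[tid] = acc
    cuda.syncthreads()
    n = cuda.blockDim.x // 2
    while n > 0:
        if tid < n:
            tmp[tid] += tmp[tid + n]
        cuda.syncthreads()
        n //= 2
    if tid == 0:
        out[0] = tmp[0]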
Example no. 10
def astype(ary, dtype):
    if ary.dtype == np.dtype(dtype):
        return ary
    else:
        out = cuda.device_array(shape=ary.shape, dtype=dtype)
        configured = gpu_copy.forall(out.size)
        configured(ary, out)
        return out
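gpu_copy only needs to be an element-wise copy; the dtype conversion happens implicitly when storing into the differently-typed output array. A sketch (the body is an assumption):

from numba import cuda

@cuda.jit
def gpu_copy(inp, out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = inp[i]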
Example no. 11
def apply_reduce(fn, inp):
    # allocate output+temp array
    outsz = libgdf.gdf_reduce_optimal_output_size()
    out = cuda.device_array(outsz, dtype=inp.dtype)
    # call reduction
    fn(inp.cffi_view, unwrap_devary(out), outsz)
    # return 1st element
    return out[0]
Example no. 12
 def __init__(self, shape, dtype, prealloc):
     self.device = cuda.get_current_device()
     self.freelist = deque()
     self.events = {}
     for i in range(prealloc):
         gpumem = cuda.device_array(shape=shape, dtype=dtype)
         self.freelist.append(gpumem)
         self.events[gpumem] = cuda.event(timing=False)
Example no. 13
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2
    
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [PRNG(PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_paths, d_last, strm, (s, e) in zip(d_pathslist, d_lastlist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            # free this partition's previous-step buffer, not just the last one
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Example no. 14
 def test_stream_bind(self):
     stream = cuda.stream()
     with stream.auto_synchronize():
         arr = cuda.device_array(
             (3, 3),
             dtype=np.float64,
             stream=stream)
         self.assertEqual(arr.bind(stream).stream, stream)
         self.assertEqual(arr.stream, stream)
Example no. 15
def mask_assign_slot(size, mask):
    # expand bits into bytes
    dtype = (np.int32 if size < 2 ** 31 else np.int64)
    expanded_mask = cuda.device_array(size, dtype=dtype)
    numtasks = min(64 * 128, expanded_mask.size)
    gpu_expand_mask_bits.forall(numtasks)(mask, expanded_mask)

    # compute prefixsum
    slots = prefixsum(expanded_mask)
    sz = int(slots[slots.size - 1])
    return slots, sz
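gpu_expand_mask_bits presumably unpacks a little-endian bitmask into one integer per element. Since forall(numtasks) may launch fewer threads than there are elements, a grid-stride loop is needed; a sketch under those assumptions:

from numba import cuda

@cuda.jit
def gpu_expand_mask_bits(bits, out):
    # Grid-stride loop: numtasks can be smaller than out.size.
    tid = cuda.grid(1)
    stride = cuda.gridsize(1)
    for i in range(tid, out.size, stride):
        out[i] = (bits[i // 8] >> (i % 8)) & 1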
Example no. 16
    def test_device_array_interface(self):
        dary = cuda.device_array(shape=100)
        devicearray.verify_cuda_ndarray_interface(dary)

        ary = np.empty(100)
        dary = cuda.to_device(ary)
        devicearray.verify_cuda_ndarray_interface(dary)

        ary = np.asarray(1.234)
        dary = cuda.to_device(ary)
        self.assertEqual(dary.ndim, 1)
        devicearray.verify_cuda_ndarray_interface(dary)
Example no. 17
    def test_event_elapsed(self):
        N = 32
        dary = cuda.device_array(N, dtype=np.double)
        evtstart = cuda.event()
        evtend = cuda.event()

        evtstart.record()
        cuda.to_device(np.arange(N), to=dary)
        evtend.record()
        evtend.wait()
        evtend.synchronize()
        print(evtstart.elapsed_time(evtend))
Example no. 18
def find_segments(arr):
    """Find beginning indices of runs of equal values.

    Returns
    -------
    starting_indices : device array
        The starting index of each segment.
        The total segment count equals the length of this array.
    """
    from . import _gdf

    # Compute diffs of consecutive elements
    markers = cuda.device_array(arr.size, dtype=np.int32)
    gpu_mark_segment_begins.forall(markers.size)(arr, markers)
    # Compute index of marked locations
    slots = prefixsum(markers)
    ct = slots[slots.size - 1]
    scanned = slots[:-1]
    # Compact segments
    begins = cuda.device_array(shape=int(ct), dtype=np.intp)
    gpu_scatter_segment_begins.forall(markers.size)(markers, scanned, begins)
    return begins
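A sketch of what gpu_mark_segment_begins could look like: write 1 wherever a new run of equal values starts (including index 0), 0 elsewhere (the body is an assumption):

from numba import cuda

@cuda.jit
def gpu_mark_segment_begins(arr, markers):
    i = cuda.grid(1)
    if i < arr.size:
        if i == 0:
            markers[0] = 1
        else:
            markers[i] = 1 if arr[i] != arr[i - 1] else 0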
Example no. 19
 def run_gather(self, arr, diffs):
     h_out_idx = np.zeros(1, dtype=np.intp)
     out_queue = cuda.device_array(shape=self._maxk, dtype=arr.dtype)
     gpu_insert_if_masked.forall(arr.size)(arr, diffs, h_out_idx, out_queue)
     qsz = h_out_idx[0]
     if self._maxk >= 0:
         if qsz > self._maxk:
             msg = 'too many unique values: unique values ({}) > k ({})'
             raise ValueError(msg.format(qsz, self._maxk))
         end = min(qsz, self._maxk)
     else:
         raise NotImplementedError('k is unbounded')
     vals = out_queue[:end]
     return vals
Example no. 20
 def append(self, other):
     """Append another column
     """
     if self.has_null_mask or other.has_null_mask:
         raise NotImplementedError("append masked column is not supported")
     newsize = len(self) + len(other)
     # allocate memory
     mem = cuda.device_array(shape=newsize, dtype=self.data.dtype)
     newbuf = Buffer.from_empty(mem)
     # copy into new memory
     for buf in [self.data, other.data]:
         newbuf.extend(buf.to_gpu_array())
     # return new column
     return self.replace(data=newbuf)
Example no. 21
    def test_event_elapsed_stream(self):
        N = 32
        stream = cuda.stream()
        dary = cuda.device_array(N, dtype=np.double)
        evtstart = cuda.event()
        evtend = cuda.event()

        evtstart.record(stream=stream)
        cuda.to_device(np.arange(N), to=dary, stream=stream)
        evtend.record(stream=stream)
        evtend.wait(stream=stream)
        evtend.synchronize()
        # Exercise the code path
        evtstart.elapsed_time(evtend)
Example no. 22
 def run(self, arr, k):
     if k >= MAX_FAST_UNIQUE_K:
         raise NotImplementedError('k >= {}'.format(MAX_FAST_UNIQUE_K))
     # setup mem
     outsz_ptr = cuda.device_array(shape=1, dtype=np.intp)
     out = cuda.device_array_like(arr)
     # kernel
     self._kernel[1, 64](arr, k, out, outsz_ptr)
     # copy to host
     unique_ct = outsz_ptr.copy_to_host()[0]
     if unique_ct < 0:
          raise ValueError('too many unique values (hint: increase k)')
     else:
         hout = out.copy_to_host()
         return hout[:unique_ct]
Example no. 23
def _getOccupancyCUDA(coords, centers, channelsigmas, trunc=5, device=0, resD=None, asnumpy=True, threadsperblock=256):
    #cuda.select_device(device)
    if resD is None:
        resD = cuda.device_array((centers.shape[0], channelsigmas.shape[1]), dtype=np.float32)
    _memsetArray(resD, val=0)

    natomblocks = int(np.ceil(coords.shape[0] / threadsperblock))
    blockspergrid = (centers.shape[0], natomblocks)

    centers = cuda.to_device(centers)
    coords = cuda.to_device(coords)
    channelsigmas = cuda.to_device(channelsigmas)
    _getOccupancyCUDAkernel[blockspergrid, threadsperblock](resD, coords, centers, channelsigmas, trunc * trunc)

    if asnumpy:
        return resD.copy_to_host()
    # otherwise hand back the device array so the caller can reuse it
    return resD
Example no. 24
    def test_ufunc_arg(self):
        @vectorize(['f8(f8, f8)'], target='cuda')
        def vadd(a, b):
            return a + b

        # Case 1: use custom array as argument
        h_arr = np.random.random(10)
        arr = MyArray(cuda.to_device(h_arr))
        val = 6
        out = vadd(arr, val)
        np.testing.assert_array_equal(out.copy_to_host(), h_arr + val)

        # Case 2: use custom array as return
        out = MyArray(cuda.device_array(h_arr.shape))
        returned = vadd(h_arr, val, out=out)
        np.testing.assert_array_equal(returned.copy_to_host(), h_arr + val)
Example no. 25
    def test_gufunc_arg(self):
        @guvectorize(['(f8, f8, f8[:])'], '(),()->()', target='cuda')
        def vadd(inp, val, out):
            out[0] = inp + val

        # Case 1: use custom array as argument
        h_arr = np.random.random(10)
        arr = MyArray(cuda.to_device(h_arr))
        val = np.float64(7)
        out = vadd(arr, val)
        np.testing.assert_array_equal(out.copy_to_host(), h_arr + val)

        # Case 2: use custom array as return
        out = MyArray(cuda.device_array(h_arr.shape))
        returned = vadd(h_arr, val, out=out)
        np.testing.assert_array_equal(returned.copy_to_host(), h_arr + val)
        self.assertEqual(returned.device_ctypes_pointer.value,
                         out._arr.device_ctypes_pointer.value)
Example no. 26
    def test_gufunc_stream(self):

        @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])],
                     '(m,n),(n,p)->(m,p)',
                     target='cuda')
        def matmulcore(A, B, C):
            m, n = A.shape
            n, p = B.shape
            for i in range(m):
                for j in range(p):
                    C[i, j] = 0
                    for k in range(n):
                        C[i, j] += A[i, k] * B[k, j]

        gufunc = matmulcore
        gufunc.max_blocksize = 512

        #cuda.driver.flush_pending_free()
        matrix_ct = 1001 # an odd number to test thread/block division in CUDA
        A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,
                                                                   4)
        B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,
                                                                   5)

        ts = time()
        stream = cuda.stream()
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)

        dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)
        dC = gufunc(dA, dB, out=dC, stream=stream)
        C = dC.copy_to_host(stream=stream)
        stream.synchronize()

        tcuda = time() - ts

        ts = time()
        Gold = ut.matrix_multiply(A, B)
        tcpu = time() - ts

        stream_speedups.append(tcpu / tcuda)

        self.assertTrue(np.allclose(C, Gold))
Example no. 27
def cuda_dot2(a, b):
    if a.shape[1] != b.shape[0]:
        raise ValueError('shape %s does not match to shape %s' %
                         (str(a.shape), str(b.shape)))
    try:
        TPB = 16
        threadsperblock = (TPB, TPB)
        blockspergrid_x = math.ceil(a.shape[0] / threadsperblock[0])
        blockspergrid_y = math.ceil(b.shape[1] / threadsperblock[1])
        blockspergrid = (blockspergrid_x, blockspergrid_y)
        da = cuda.to_device(a)
        db = cuda.to_device(b)
        dc = cuda.device_array((a.shape[0], b.shape[1]), dtype=a.dtype)
        cuda_dot_kernel2[blockspergrid, threadsperblock](da, db, dc)
        c = dc.copy_to_host()
        return c
    except:
        cuda.close()
        raise
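cuda_dot_kernel2 is not shown; a minimal naive matrix-multiply kernel consistent with the 2D launch above (the body is an assumption):

from numba import cuda

@cuda.jit
def cuda_dot_kernel2(a, b, c):
    # One thread per output element c[i, j].
    i, j = cuda.grid(2)
    if i < c.shape[0] and j < c.shape[1]:
        acc = 0.0
        for k in range(a.shape[1]):
            acc += a[i, k] * b[k, j]
        c[i, j] = acc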
Example no. 28
 def __init__(self, data, block_size, n_iter, seed=0):
     self.data = cuda.to_device(data)
     self.n_iter = n_iter
     
     # Parameters for the kernel launch 
     self.block_size = block_size
     self.n_samples = data.shape[0]
     self.n_blocks = self.n_samples // block_size
     
     # Allocate an output array on the GPU
     self.output = cuda.device_array((n_iter,self.n_blocks,2))
     
     # Create random number generators for each thread
     # NOTE: The threads within the same block should generate the same random numbers
     rng_states = np.empty(self.n_samples, dtype=xoroshiro128p_dtype)
     for i in range(self.n_samples):
         init_xoroshiro128p_state(rng_states, i, seed)  # Init to a fixed state
         for j in range(i//block_size):  # Jump forward block_index*2^64 steps
             xoroshiro128p_jump(rng_states, i)
     self.rng_states = cuda.to_device(rng_states)  # Copy it to the GPU
Example no. 29
def test_sum(dtype, nelem):
    data = gen_rand(dtype, nelem)
    d_data = cuda.to_device(data)
    d_result = cuda.device_array(libgdf.gdf_reduce_optimal_output_size(),
                                 dtype=d_data.dtype)

    col_data = new_column()
    gdf_dtype = get_dtype(dtype)

    libgdf.gdf_column_view(col_data, unwrap_devary(d_data), ffi.NULL, nelem,
                           gdf_dtype)

    libgdf.gdf_sum_generic(col_data, unwrap_devary(d_result), d_result.size)
    got = d_result.copy_to_host()[0]
    expect = data.sum()

    print('expect:', expect)
    print('got:', got)

    np.testing.assert_array_almost_equal(expect, got)
Example no. 30
 def neighbour_list(self):
     with cuda.gpus[self.gpu]:
         while True:
             cu_set_to_int[self.bpg, self.tpb](self.d_nc, 0)
             # reset situation while build nlist
             self.cu_nlist[self.bpg, self.tpb](
                 self.system.d_x, self.d_last_x, self.system.d_box,
                 self.r_cut2, self.clist.d_cell_map, self.clist.d_cell_list,
                 self.clist.d_cell_counts, self.clist.d_cells, self.d_nl,
                 self.d_nc, self.d_n_max, self.d_situation)
             self.d_n_max.copy_to_host(self.p_n_max)
             cuda.synchronize()
             # n_max = np.array([120])
             if self.p_n_max[0] > self.n_guess:
                 self.n_guess = self.p_n_max[0]
                 self.n_guess = self.n_guess + 8 - (self.n_guess & 7)
                 self.d_nl = cuda.device_array(
                     (self.system.N, self.n_guess), dtype=np.int32)
             else:
                 break
Example no. 31
def cuda_conv(x, K):
    B = x.shape[0]
    H = x.shape[1]
    W = x.shape[2]
    k = K.shape[0]

    Cout = K.shape[-1]
    A_global_mem = cuda.to_device(x)
    B_global_mem = cuda.to_device(K)
    C_global_mem = cuda.device_array((B, H - k + 1, W - k + 1, Cout))

    threadsperblock = (16, 16, 4)
    blockspergrid = (int(math.ceil((H - k + 1) / threadsperblock[0])),
                     int(math.ceil((W - k + 1) / threadsperblock[1])),
                     int(math.ceil(Cout / threadsperblock[2])))

    conv[blockspergrid, threadsperblock](A_global_mem, B_global_mem,
                                         C_global_mem)
    return C_global_mem.copy_to_host()
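The conv kernel is not shown. Judging from the shapes, x is (B, H, W, Cin) channels-last and K is (k, k, Cin, Cout); below is a sketch of a valid (no-padding) cross-correlation matching the 3D grid above, with the batch handled by a loop inside each thread (all of this is an assumption):

from numba import cuda

@cuda.jit
def conv(x, K, out):
    i, j, co = cuda.grid(3)
    if i < out.shape[1] and j < out.shape[2] and co < out.shape[3]:
        k = K.shape[0]
        for b in range(out.shape[0]):
            acc = 0.0
            for di in range(k):
                for dj in range(k):
                    for ci in range(x.shape[3]):
                        acc += x[b, i + di, j + dj, ci] * K[di, dj, ci, co]
            out[b, i, j, co] = acc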
Example no. 32
    def process(self, inputs):
        df = inputs['points_df_in']

        # DEBUGGING
        # try:
        #     from dask.distributed import get_worker
        #     worker = get_worker()
        #     print('worker{} process NODE "{}" worker: {}'.format(
        #         worker.name, self.uid, worker))
        # except (ValueError, ImportError):
        #     pass

        number_of_threads = 16
        number_of_blocks = ((len(df) - 1) // number_of_threads) + 1
        # Allocate an (uninitialized) device array for the results.
        darr = cuda.device_array(len(df))
        distance_kernel[(number_of_blocks, ),
                        (number_of_threads, )](df['x'], df['y'], darr, len(df))
        df['distance_numba'] = darr
        return {'distance_df': df}
Example no. 33
def cuda_operation():
    """Performs Vectorized Operations on GPU"""

    x = real_estate_array()
    y = real_estate_array()

    print("Moving calculations to GPU memory")
    x_device = cuda.to_device(x)
    y_device = cuda.to_device(y)
    out_device = cuda.device_array(
        shape=(x_device.shape[0],x_device.shape[1]), dtype=np.float32)
    print(x_device)
    print(x_device.shape)
    print(x_device.dtype)

    print("Calculating on GPU")
    add_ufunc(x_device,y_device, out=out_device)

    out_host = out_device.copy_to_host()
    print(f"Calculations from GPU {out_host}")
Example no. 34
 def update(self):
     with cuda.gpus[self.gpu]:
         while True:
             cu_set_to_int[self.bpg_cell, self.tpb](self.d_cell_counts, 0)
             cu_cell_list[self.bpg,
                          self.tpb](self.system.d_x, self.system.d_box,
                                    self.d_ibox, self.d_cell_list,
                                    self.d_cell_counts, self.d_cells,
                                    self.d_cell_max)
             self.d_cell_max.copy_to_host(self.p_cell_max)
             cuda.synchronize()
             if self.p_cell_max[0] > self.cell_guess:
                 self.cell_guess = self.p_cell_max[0]
                 self.cell_guess = self.cell_guess + 8 - (self.cell_guess
                                                          & 7)
                 self.d_cell_list = cuda.device_array(
                     (self.n_cell, self.cell_guess, self.system.n_dim + 1),
                     dtype=self.system.dtype)
             else:
                 break
Example no. 35
    def forward(self, input):
        batch_num, _, height, width = input.shape

        H_out = int((height + 2 * self.padding - self.weight.shape[2]) /
                    self.stride) + 1
        W_out = int((width + 2 * self.padding - self.weight.shape[3]) /
                    self.stride) + 1
        output_shape = (batch_num, self.out_channels, H_out, W_out)

        d_output = cuda.device_array(output_shape, dtype=np.float32)

        blockdim = (10, 10, 10)
        griddim_0 = ceil(batch_num * self.out_channels / blockdim[0])
        griddim_1 = ceil(H_out / blockdim[1])
        griddim_2 = ceil(W_out / blockdim[2])
        griddim = (griddim_0, griddim_1, griddim_2)
        conv_step_gpu[griddim,
                      blockdim](input, d_output, self.d_weight, self.stride,
                                self.mod, self.filter_size, self.padding)
        return d_output
Example no. 36
def prefixsum(vals):
    """Compute the full prefixsum.

    Given an input of length N, the output has N + 1 elements.
    The first value is always 0.  The last value is the sum of *vals*.
    """
    from . import _gdf

    # Allocate output
    slots = cuda.device_array(shape=vals.size + 1,
                              dtype=vals.dtype)
    # Fill 0 to slot[0]
    gpu_fill_value[1, 1](slots[:1], 0)

    # Compute prefixsum on the mask
    _gdf.apply_prefixsum(_gdf.columnview_from_devary(vals),
                         _gdf.columnview_from_devary(slots[1:]),
                         inclusive=True)

    return slots
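gpu_fill_value only has to write a constant; a sketch matching the [1, 1] launch on a one-element slice (the body is an assumption):

from numba import cuda

@cuda.jit
def gpu_fill_value(arr, value):
    i = cuda.grid(1)
    if i < arr.size:
        arr[i] = value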
Example no. 37
def query_execute(df, expr, callenv):
    """Compile & execute the query expression

    Note: the expression is compiled and cached for future reuse.

    Parameters
    ----------
    df : DataFrame
    expr : str
        boolean expression
    callenv : dict
        Contains keys 'locals' and 'globals' which are both dict.
        They represent the local and global dictionaries of the caller.
    """
    # compile
    compiled = query_compile(expr)
    kernel = compiled['kernel']
    # process env args
    envargs = []
    envdict = callenv['globals'].copy()
    envdict.update(callenv['locals'])
    for name in compiled['refnames']:
        name = name[len(ENVREF_PREFIX):]
        try:
            val = envdict[name]
            if isinstance(val, dt.datetime):
                val = np.datetime64(val)
        except KeyError:
            msg = '{!r} not defined in the calling environment'
            raise NameError(msg.format(name))
        else:
            envargs.append(val)
    # prepare col args
    colarrays = [df[col].to_gpu_array() for col in compiled['colnames']]
    # allocate output buffer
    nrows = len(df)
    out = cuda.device_array(nrows, dtype=np.bool_)
    # run kernel
    args = [out] + colarrays + envargs
    kernel.forall(nrows)(*args)
    return out
Example no. 38
def test_sum_masked(nelem):
    dtype = np.float64
    data = gen_rand(dtype, nelem)
    mask = gen_rand(np.int8, (nelem + 8 - 1) // 8)

    d_data = cuda.to_device(data)
    d_mask = cuda.to_device(mask)
    d_result = cuda.device_array(libgdf.gdf_reduce_optimal_output_size(),
                                 dtype=d_data.dtype)

    col_data = new_column()
    gdf_dtype = get_dtype(dtype)
    libgdf.gdf_column_view(col_data, unwrap_devary(d_data),
                           unwrap_devary(d_mask), nelem, gdf_dtype)
    libgdf.gdf_sum_generic(col_data, unwrap_devary(d_result), d_result.size)

    got = d_result.copy_to_host()[0]
    boolmask = buffer_as_bits(mask)[:nelem]
    expect = data[boolmask].sum()

    np.testing.assert_almost_equal(expect, got)
Example no. 39
def main(image1, image2):
    data_image1 = np.array(image1)
    data_image2 = np.array(image2)
    print(data_image1.shape, data_image2.shape)

    threadsperblock = (16, 16, 4)
    blocksper_x = int(math.ceil(data_image1.shape[0] / threadsperblock[0]))
    blocksper_y = int(math.ceil(data_image1.shape[1] / threadsperblock[1]))
    blocksper_z = int(math.ceil(data_image1.shape[2] / threadsperblock[2]))
    blockspergrid = (blocksper_x, blocksper_y, blocksper_z)

    input1 = cuda.to_device(data_image1)
    input2 = cuda.to_device(data_image2)
    output = cuda.device_array(data_image1.shape)

    sumImages[blockspergrid, threadsperblock](input1, input2, output)

    out = output.copy_to_host()
    out = out.astype('uint8')
    out = Image.fromarray(out)
    out.save("out.png")
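sumImages is presumably an element-wise add over the three image axes; a sketch matching the 3D launch (the body is an assumption):

from numba import cuda

@cuda.jit
def sumImages(a, b, out):
    i, j, k = cuda.grid(3)
    if i < out.shape[0] and j < out.shape[1] and k < out.shape[2]:
        out[i, j, k] = a[i, j, k] + b[i, j, k]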
Example no. 40
def distance_cal(x, y):

    Nx = x.size
    Ny = y.size

    points = [[1, 0], [-1, 0], [0, 1]]

    d_x = cuda.to_device(x)
    d_y = cuda.to_device(y)
    d_points = cuda.to_device(np.array(points))
    d_out = cuda.device_array((Nx, Ny))

    TPBX = 8
    TPBY = 8

    gridDims = ((Nx + TPBX - 1) // TPBX, (Ny + TPBY - 1) // TPBY)
    blockDims = (TPBX, TPBY)

    distance_kernel[gridDims, blockDims](d_x, d_y, d_points, d_out)

    return d_out.copy_to_host()
Example no. 41
def cu_mat_2d_to_4d(A, dim, from_host=False, to_host=False):
    """
    :param A:
    :param from_host:
    :param to_host:
    :return: A:2d matrix, shape: A's shape
    """
    if from_host:
        A = cuda.to_device(A.astype(np.float32))

    assert len(A.shape) == 2 and A.shape[0] == dim[0]

    res = cuda.device_array(shape=(dim[0], dim[1], dim[2], dim[3]),
                            dtype=np.float32)
    grid_dim, block_dim = auto_detect(A.shape)
    _cu_mat_2d_to_4d[grid_dim, block_dim](A, res)

    if to_host:
        return res.copy_to_host(), A.shape
    else:
        return res, A.shape
Example no. 42
def WT_as_f(x, k, c, L):
    # fund. frequency
    k1 = 2.0 * np.pi / L

    # Set up kernel
    blockSize = (TPB, TPB)
    numBlocksX = (x.shape[0] + blockSize[0] - 1) // blockSize[0]
    numBlocksK = (k.shape[0] + blockSize[1] - 1) // blockSize[1]
    numBlocks = (numBlocksX, numBlocksK)

    # output on device
    dW = cuda.device_array((x.shape[0], k.shape[0]), np.complex128)
    # input
    dC = cuda.to_device(np.ascontiguousarray(c))
    dx = cuda.to_device(x)
    dk = cuda.to_device(k)

    # call kernel
    wignerKernel[numBlocks, blockSize](dC, dx, dk, dW, k1)

    return 2.0 * dW.copy_to_host()
Example no. 43
def relu(input, device=None):
    # Note: `device is (None or 'gpu')` evaluated to `device is 'gpu'`,
    # which never matched None; membership and equality tests are intended.
    if device in (None, 'gpu'):
        if cuda.is_available():
            device = 'gpu'
        else:
            device = 'cpu'
    if device == 'gpu':
        batch_num, channels, height, width = input.shape

        d_output = cuda.device_array(input.shape, dtype=np.float32)

        blockdim = (10, 10, 10)
        griddim_0 = ceil(batch_num * channels / blockdim[0])
        griddim_1 = ceil(height / blockdim[1])
        griddim_2 = ceil(width / blockdim[2])
        griddim = (griddim_0, griddim_1, griddim_2)
        relu_gpu[griddim, blockdim](input, d_output)

        return d_output
    else:
        return relu_cpu(input)
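relu_gpu is assumed to map the first grid axis over batch*channels, mirroring griddim_0 above; a sketch (the body is an assumption):

from numba import cuda

@cuda.jit
def relu_gpu(inp, out):
    bc, i, j = cuda.grid(3)
    channels = inp.shape[1]
    b, c = bc // channels, bc % channels
    if b < inp.shape[0] and i < inp.shape[2] and j < inp.shape[3]:
        v = inp[b, c, i, j]
        out[b, c, i, j] = v if v > 0 else 0.0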
Example no. 44
def NNGPU_class(X, W, config):
    maxNeuronas = max(config)
    config = np.append(X.shape[0], config[:])

    # Initialize the weight matrix
    maxPesos = max(config)
    threadsPerBlock = maxNeuronas  # each block computes one neuron
    blocksPerGrid = maxNeuronas
    # Move the required data to the GPU
    Wg = cuda.to_device(W)
    Xg = cuda.to_device(X)

    configG = cuda.to_device(config)
    output = cuda.device_array([config.shape[0], config.max()])

    NNCUDA_class[blocksPerGrid, threadsPerBlock](Xg, Wg, configG, output)

    ret = output.copy_to_host()
    #P = Wg.copy_to_host()
    #print (P)
    return ret[-1]
Example no. 45
def apply_segsort(col_keys, col_vals, segments, descending=False):
    """Inplace segemented sort

    Parameters
    ----------
    col_keys : Column
    col_vals : Column
    segments : device array
    """
    # prepare
    nelem = len(col_keys)
    seg_dtype = np.uint32

    d_fullsegs = cuda.device_array(segments.size + 1, dtype=seg_dtype)
    d_begins = d_fullsegs[:-1]
    d_ends = d_fullsegs[1:]

    # Note: .astype is required below because .copy_to_device
    #       is just a plain memcpy
    d_begins.copy_to_device(cudautils.astype(segments, dtype=seg_dtype))
    d_ends[-1:].copy_to_device(np.require([nelem], dtype=seg_dtype))

    begin_bit = 0
    end_bit = col_keys.dtype.itemsize * 8

    sizeof_key = col_keys.data.dtype.itemsize
    sizeof_val = col_vals.data.dtype.itemsize

    # sort
    plan = libgdf.gdf_segmented_radixsort_plan(nelem, descending, begin_bit,
                                               end_bit)
    try:
        libgdf.gdf_segmented_radixsort_plan_setup(plan, sizeof_key, sizeof_val)
        libgdf.gdf_segmented_radixsort_generic(plan, col_keys.cffi_view,
                                               col_vals.cffi_view,
                                               segments.size,
                                               unwrap_devary(d_begins),
                                               unwrap_devary(d_ends))
    finally:
        libgdf.gdf_segmented_radixsort_plan_free(plan)
Example no. 46
def condense(u, buffer):
    # u = input voxel model
    # buffer = number of layers of voxels around the boundaries that are left empty
    # Outputs a new matrix that is fitted to the input voxel model, removing
    # layers that don't store geometry.
    m, n, p = u.shape
    TPBX, TPBY, TPBZ = TPB, TPB, TPB
    minX, maxX, minY, maxY, minZ, maxZ = -1, -1, -1, -1, -1, -1
    i, j, k = 0, 0, 0
    while minX < 0:
        if np.amin(u[i, :, :]) < 0: minX = i
        else: i += 1
    while minY < 0:
        if np.amin(u[:, j, :]) < 0: minY = j
        else: j += 1
    while minZ < 0:
        if np.amin(u[:, :, k]) < 0: minZ = k
        else: k += 1
    i, j, k = 1, 1, 1
    while maxX < 0:
        if np.amin(u[m - i, :, :]) < 0: maxX = m - i
        else: i += 1
    while maxY < 0:
        if np.amin(u[:, n - j, :]) < 0: maxY = n - j
        else: j += 1
    while maxZ < 0:
        if np.amin(u[:, :, p - k]) < 0: maxZ = p - k
        else: k += 1
    xSize = (np.ceil((2 * buffer + maxX - minX) / TPB) * TPB).astype(int)
    ySize = (np.ceil((2 * buffer + maxY - minY) / TPB) * TPB).astype(int)
    zSize = (np.ceil((2 * buffer + maxZ - minZ) / TPB) * TPB).astype(int)
    d_u = cuda.to_device(u)
    d_uCondensed = cuda.device_array(shape=[xSize, ySize, zSize],
                                     dtype=np.float32)
    gridDims = (xSize + TPBX - 1) // TPBX, (ySize + TPBY - 1) // TPBY, (
        zSize + TPBZ - 1) // TPBZ
    blockDims = TPBX, TPBY, TPBZ
    condenseKernel[gridDims, blockDims](d_u, d_uCondensed, buffer, minX, minY,
                                        minZ)
    return d_uCondensed.copy_to_host()
Example no. 47
def AveragesOnShellsInnerLogicCCuda(retNowR_global_mem,
                                    retNowI_global_mem,
                                    n1ofR_global_mem,
                                    n2ofR_global_mem,
                                    NumAtROutPre_global_mem,
                                    End,
                                    Start,
                                    NumOnSurf,
                                    r):

    threadsperblock = (1024, 1, 1)
    blockspergrid_x = int(math.ceil(
        retNowR_global_mem[r][:NumOnSurf].shape[0] / threadsperblock[0]))
    blockspergrid = (blockspergrid_x, 1, 1)
    # set up stream
    stream = cuda.stream()
    device_array_start = time.time()
    reduced_global_mem = cuda.device_array((4, End - Start))
    print("Shape of NumAtROutPre_global_mem is ", NumAtROutPre_global_mem.shape)
    print("Shape of reduced_global_mem is ", reduced_global_mem.shape)
    #print("Time to create cuda.device_array is ", time.time() - device_array_start)
    filter_time = time.time()
    # Launch configuration is [grid, block]: the grid comes first.
    cuda_kernels.filter_and_sum[blockspergrid, threadsperblock](
        retNowR_global_mem,
        retNowI_global_mem,
        n1ofR_global_mem,
        n2ofR_global_mem,
        NumAtROutPre_global_mem,
        reduced_global_mem,
        End,
        Start,
        NumOnSurf,
        r)
    stream.synchronize()
    print("Time to complete filter_and_sum is ",time.time() - filter_time)
    reduced_start = time.time()
    reduced = reduced_global_mem.copy_to_host()
    print("Time to transfer reduced to host is ",time.time() - reduced_start)

    return reduced
Example no. 48
def cuda_deposition_arrays(Nz = None, Nr = None, fieldtype = None):
    """
    Create empty arrays on the GPU for the charge and
    current deposition in each of the 4 possible directions.

    ###########################################
    # Needs to be moved to the fields package!
    ###########################################

    Parameters
    ----------
    Nz : int
        Number of cells in z.
    Nr : int
        Number of cells in r.

    fieldtype : string
        Either 'rho' or 'J'.
    """
    # Create empty arrays to store the four different possible
    # cell directions a particle can deposit to.
    if fieldtype == 'rho':
        # Rho - third dimension represents 2 modes
        rho0 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128)
        rho1 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128)
        rho2 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128)
        rho3 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128)
        return rho0, rho1, rho2, rho3

    if fieldtype == 'J':
        # J - third dimension represents 2 modes
        # times 3 dimensions (r, t, z)
        J0 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128)
        J1 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128)
        J2 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128)
        J3 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128)
        return J0, J1, J2, J3
Example no. 49
def copy_to_dense(data, mask, out=None):
    """Copy *data* with validity bits in *mask* into *out*.

    The output array can be specified in `out`.

    Return a 2-tuple of:
    * number of non-null element
    * a dense gpu array given the data and mask gpu arrays.
    """
    slots, sz = mask_assign_slot(size=data.size, mask=mask)
    if out is None:
        # output buffer is not provided
        # allocate one
        alloc_shape = sz
        out = cuda.device_array(shape=alloc_shape, dtype=data.dtype)
    else:
        # output buffer is provided
        # check it
        if sz >= out.size:
            raise ValueError('output array too small')
    gpu_copy_to_dense.forall(data.size)(data, mask, slots, out)
    return (sz, out)
Example no. 50
def apply_label(arr, cats, dtype, na_sentinel):
    """
    Parameters
    ----------
    arr : device array
        data
    cats : device array
        Unique category value
    dtype : np.dtype
        output array dtype
    na_sentinel : int
        Value to indicate missing value
    Returns
    -------
    result : device array
    """
    encs = np.asarray(list(range(cats.size)))
    d_encs = to_device(encs)
    out = cuda.device_array(shape=arr.size, dtype=dtype)
    configured = gpu_label.forall(out.size)
    configured(arr, cats, d_encs, na_sentinel, out)
    return out
Example no. 51
    def test_gufunc_stream(self):
        @guvectorize(
            [void(float32[:, :], float32[:, :], float32[:, :])],
            "(m,n),(n,p)->(m,p)",
            target="cuda",
        )
        def matmulcore(A, B, C):
            m, n = A.shape
            n, p = B.shape
            for i in range(m):
                for j in range(p):
                    C[i, j] = 0
                    for k in range(n):
                        C[i, j] += A[i, k] * B[k, j]

        gufunc = matmulcore
        gufunc.max_blocksize = 512

        # cuda.driver.flush_pending_free()
        matrix_ct = 1001  # an odd number to test thread/block division in CUDA
        A = np.arange(matrix_ct * 2 * 4,
                      dtype=np.float32).reshape(matrix_ct, 2, 4)
        B = np.arange(matrix_ct * 4 * 5,
                      dtype=np.float32).reshape(matrix_ct, 4, 5)

        stream = cuda.stream()
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)

        dC = cuda.device_array(shape=(1001, 2, 5),
                               dtype=A.dtype,
                               stream=stream)
        dC = gufunc(dA, dB, out=dC, stream=stream)
        C = dC.copy_to_host(stream=stream)
        stream.synchronize()

        Gold = ut.matrix_multiply(A, B)

        self.assertTrue(np.allclose(C, Gold))
Example no. 52
def softmax_backprop_kernel_wrapper(d_d_L_d_out,
                                    d_weight,
                                    d_maxpoolOutput,
                                    d_postSoftmax,
                                    numImage,
                                    d_d_L_d_input,
                                    d_d_L_d_w,
                                    d_d_L_d_b,
                                    blockSize=(32, 32)):
    # Computing d_d_L_d_preSoftmax also computes d_d_L_d_b, since they point
    # to the same memory
    d_d_L_d_preSoftmax = d_d_L_d_b
    gridSize = math.ceil(numImage / blockSize[1])
    softmax_backprop_kernel[gridSize,
                            blockSize[1]](d_d_L_d_out, d_postSoftmax, numImage,
                                          d_d_L_d_preSoftmax)

    # Compute d_d_L_d_w
    d_d_L_d_preSoftmaxReshape = d_d_L_d_preSoftmax[:numImage].reshape(
        numImage, d_d_L_d_preSoftmax.shape[1], 1)
    d_maxpoolOutputReshape = d_maxpoolOutput[:numImage].reshape(
        numImage, 1, d_weight.shape[1])
    gridSize = (math.ceil(d_maxpoolOutputReshape.shape[2] / blockSize[0]),
                math.ceil(d_d_L_d_preSoftmaxReshape.shape[1] / blockSize[1]),
                d_d_L_d_preSoftmaxReshape.shape[0])
    dot_3D_kernel[gridSize, blockSize](d_d_L_d_preSoftmaxReshape,
                                       d_maxpoolOutputReshape, d_d_L_d_w)

    # Compute d_d_L_d_input
    d_d_L_d_input_temp = cuda.device_array((numImage, 1, d_weight.shape[1]),
                                           dtype=float)
    d_d_L_d_preSoftmaxReshape = d_d_L_d_preSoftmax[:numImage].reshape(
        numImage, 1, d_d_L_d_preSoftmax.shape[1])
    gridSize = (math.ceil(d_d_L_d_input_temp.shape[2] / blockSize[0]),
                math.ceil(d_d_L_d_input_temp.shape[1] / blockSize[1]),
                d_d_L_d_input_temp.shape[0])
    dot_3D2D_kernel[gridSize, blockSize](d_d_L_d_preSoftmaxReshape, d_weight,
                                         d_d_L_d_input_temp)
    d_d_L_d_input[0, :numImage] = d_d_L_d_input_temp[:numImage].reshape(
        d_maxpoolOutput[:numImage].shape)
Example no. 53
def matmul(A, B, matmultype='forward'):
    global PARALLELIZE
    if not PARALLELIZE:
        # NORMAL
        gradient = np.zeros((A.shape[0], B.shape[1]))
        for i in range(A.shape[0]):
            for j in range(B.shape[1]):
                for k in range(A.shape[1]):
                    gradient[i][j] += A[i][k] * B[k][j]
        return gradient

    # PARALLELIZED
    global global_feat
    global global_feat_val
    global global_feat_transpose
    B_global_mem = cuda.to_device(B)
    C_global_mem = cuda.device_array((A.shape[0], B.shape[1]))

    # Configure the blocks
    threadsperblock = (TPB, TPB)
    blockspergrid_x = int(math.ceil(A.shape[0] / threadsperblock[1]))
    blockspergrid_y = int(math.ceil(B.shape[1] / threadsperblock[0]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    # Start the kernel
    if matmultype == 'forward':
        matmul_kernel[blockspergrid,
                      threadsperblock](global_feat, B_global_mem, C_global_mem)
    elif matmultype == 'validation':
        matmul_kernel[blockspergrid,
                      threadsperblock](global_feat_val, B_global_mem,
                                       C_global_mem)
    elif matmultype == 'backward':
        matmul_kernel[blockspergrid,
                      threadsperblock](global_feat_transpose, B_global_mem,
                                       C_global_mem)

    res = C_global_mem.copy_to_host()
    return res
Example no. 54
def main():
    n = 20000000
    x = np.arange(n).astype(np.int32)
    y = 2 * x

    x_device = cuda.to_device(x)
    y_device = cuda.to_device(y)
    out_device = cuda.device_array(n)

    threads_per_block = 1024
    blocks_per_grid = math.ceil(n / threads_per_block)
    start = time()
    gpu_add[blocks_per_grid, threads_per_block](x_device, y_device, out_device,
                                                n)
    #gpu_result = out_device.copy_to_host()
    cuda.synchronize()
    print("gpu vector add time " + str(time() - start))
    start = time()
    gpu_add_stride[80, threads_per_block](x_device, y_device, out_device, n)
    #gpu_result = out_device.copy_to_host()
    cuda.synchronize()
    print("gpu stride vector add time " + str(time() - start))
Example no. 55
def query_execute(df, expr, callenv):
    """Compile & execute the query expression

    Note: the expression is compiled and cached for future reuse.

    Parameters
    ----------
    df : DataFrame
    expr : str
        boolean expression
    callenv : dict
        Contains keys 'locals' and 'globals' which are both dict.
        They represent the local and global dictionaries of the caller.
    """
    # compile
    compiled = query_compile(expr)
    kernel = compiled['kernel']
    # process env args
    envargs = []
    envdict = callenv['globals'].copy()
    envdict.update(callenv['locals'])
    for name in compiled['refnames']:
        name = name[len(ENVREF_PREFIX):]
        try:
            val = envdict[name]
        except KeyError:
            msg = '{!r} not defined in the calling environment'
            raise NameError(msg.format(name))
        else:
            envargs.append(val)
    # prepare col args
    colarrays = [df[col].to_gpu_array() for col in compiled['colnames']]
    # allocate output buffer
    nrows = len(df)
    out = cuda.device_array(nrows, dtype=np.bool_)
    # run kernel
    args = [out] + colarrays + envargs
    kernel.forall(nrows)(*args)
    return out
Example no. 56
def copy_to_dense(data, mask, out=None):
    """Copy *data* with validity bits in *mask* into *out*.

    The output array can be specified in `out`.

    Return a 2-tuple of:
    * number of non-null element
    * a dense gpu array given the data and mask gpu arrays.
    """
    slots, sz = mask_assign_slot(size=data.size, mask=mask)
    if out is None:
        # output buffer is not provided
        # allocate one
        alloc_shape = sz
        out = cuda.device_array(shape=alloc_shape, dtype=data.dtype)
    else:
        # output buffer is provided
        # check it
        if sz >= out.size:
            raise ValueError('output array too small')
    gpu_copy_to_dense.forall(data.size)(data, mask, slots, out)
    return (sz, out)
Example no. 57
def matrix_mult(m1, m2):
    A_global_mem = cuda.to_device(m1)
    B_global_mem = cuda.to_device(m2)

    # Allocate memory on the device for the result
    C_global_mem = cuda.device_array(
        (A_global_mem.shape[0], B_global_mem.shape[1]))

    # Configure the blocks
    threadsperblock = (32, 32)
    blockspergrid_x = int(math.ceil(m1.shape[0] / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(m2.shape[1] / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    # # Start the kernel
    matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem,
                                           C_global_mem)
    # fast_matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)
    #out = matmul_cuda(A_global_mem, B_global_mem)

    # Copy the result back to the host
    return C_global_mem.copy_to_host()
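The commented-out fast_matmul call likely refers to the shared-memory tiled multiply from the Numba documentation; a sketch of that variant, assuming TPB matches the 32x32 block above:

from numba import cuda, float32

TPB = 32

@cuda.jit
def fast_matmul(A, B, C):
    # Each block computes one TPB x TPB tile of C via shared-memory staging.
    sA = cuda.shared.array((TPB, TPB), dtype=float32)
    sB = cuda.shared.array((TPB, TPB), dtype=float32)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    acc = 0.0
    for m in range((A.shape[1] + TPB - 1) // TPB):
        # Stage one tile of A and B, zero-padding out-of-range reads.
        if x < A.shape[0] and m * TPB + ty < A.shape[1]:
            sA[tx, ty] = A[x, m * TPB + ty]
        else:
            sA[tx, ty] = 0.0
        if y < B.shape[1] and m * TPB + tx < B.shape[0]:
            sB[tx, ty] = B[m * TPB + tx, y]
        else:
            sB[tx, ty] = 0.0
        cuda.syncthreads()
        for k in range(TPB):
            acc += sA[tx, k] * sB[k, ty]
        cuda.syncthreads()
    if x < C.shape[0] and y < C.shape[1]:
        C[x, y] = acc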
Example no. 58
def host_naive(A, B):
    '''host code for calling the naive kernel
    '''
    A = np.array(A)
    B = np.array(B)

    m = A.shape[0]
    n = B.shape[1]
    C = np.full((m, n), 0, dtype=np.float64)

    d_A = cuda.to_device(A)  # d_ --> device
    d_B = cuda.to_device(B)
    d_C = cuda.device_array(C.shape, np.float64)

    threadsperblock = (TPB, TPB)
    blockspergrid_x = math.ceil(A.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(B.shape[1] / threadsperblock[1])
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    mat_mul_naive_kernal[blockspergrid, threadsperblock](d_A, d_B, d_C)

    return d_C.copy_to_host()
Example no. 59
    def __init__(self, shape, blocks, coupling=None, temperature=0, field=0):
        """

        :param shape: number of spins along each axis
        :param blocks: size of subdivisions along each axis, for parallel
            computation
        :param coupling: coupling array, must be (2*r+1)x...x(2*r+1) with
            (r,...,r) denoting the center.
        :param temperature: unitless temperature
        :param field: unitless applied field
        """
        self.num_dim = len(shape)
        self.shape = np.array(shape, dtype=np.int32)
        self.num_spins = np.prod(self.shape)
        self.blocks = np.array(blocks, dtype=np.int32)

        if np.any(self.shape & (self.shape - 1)):
            raise ValueError("Shape must consist of powers of 2,")
        if self.shape[-1] % 8:
            raise ValueError("Last shape must be multiple of 8")
        if np.any(self.shape % self.blocks):
            raise ValueError("Blocks do not evenly divide shape")

        self.shape_shifts = cuda.to_device(ncil.log2(self.shape))
        self.block_shifts = cuda.to_device(ncil.log2(self.blocks))
        self.spins = cuda.device_array((self.num_spins // 8, ), np.uint8)
        self.spins[:] = np.random.randint(0,
                                          256,
                                          size=self.num_spins >> 3,
                                          dtype=np.uint8)

        if coupling is None:
            coupling = np.empty((0,) * self.num_dim, dtype=np.float64)
        self.coupling_indices = cuda.to_device(
            np.vstack(np.where(coupling != 0.0)).T - (coupling.shape[0] // 2))
        self.coupling_constants = cuda.to_device(
            coupling[np.where(coupling != 0.0)])
        self.temperature = temperature
        self.field = field
Example no. 60
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # Instantiate cuRAND PRNG
    prng = PRNG(PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double)
    
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_last = cuda.to_device(paths[:, 0])
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j])
        step(d_last, dt, c0, c1, d_normdist, out=d_paths)
        d_paths.copy_to_host(paths[:, j])
        d_last = d_paths