def _concat(cls, objs):
    head = objs[0]
    for o in objs:
        if not o.is_type_equivalent(head):
            raise ValueError("All series must be of same type")
    newsize = sum(map(len, objs))
    # Concatenate data
    mem = cuda.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem)
    for o in objs:
        data.extend(o.data.to_gpu_array())
    # Concatenate mask if present
    if all(o.has_null_mask for o in objs):
        # FIXME: Inefficient
        mem = cuda.device_array(shape=newsize, dtype=np.bool_)
        mask = Buffer.from_empty(mem)
        null_count = 0
        for o in objs:
            mask.extend(o._get_mask_as_series().to_gpu_array())
            null_count += o._null_count
        mask = Buffer(utils.boolmask_to_bitmask(mask.to_array()))
    else:
        mask = None
        null_count = 0
    col = head.replace(data=data, mask=mask, null_count=null_count)
    return col
def test_profiling(self):
    with cuda._profiling():
        a = cuda.device_array(10)
        del a
    with cuda._profiling():
        a = cuda.device_array(100)
        del a
def getGraphFromEdges_gpu(dest, weight, fe, od, edges, n_edges=None,
                          MAX_TPB=512, stream=None):
    """
    All input (except MAX_TPB and stream) are device arrays.
    edges : array with the IDs of the edges that will be part of the new graph
    n_edges : array of 1 element with the number of valid edges in the edges
              array; if n_edges < size of edges, the last elements of the
              edges array are not considered
    """
    # check if the number of valid edges was received
    if n_edges is None:
        edges_size = edges.size
        n_edges = cuda.to_device(np.array([edges_size], dtype=np.int32))
    else:
        edges_size = int(n_edges.getitem(0))

    # check if a stream was received; if not, create one
    if stream is None:
        myStream = cuda.stream()
    else:
        myStream = stream

    new_n_edges = edges_size * 2

    # allocate memory for the new graph
    ndest = cuda.device_array(new_n_edges, dtype=dest.dtype, stream=myStream)
    nweight = cuda.device_array(new_n_edges, dtype=weight.dtype, stream=myStream)
    nfe = cuda.device_array_like(fe, stream=myStream)
    nod = cuda.device_array_like(od, stream=myStream)

    # fill the new outdegree with zeros
    vertexGrid = compute_cuda_grid_dim(nod.size, MAX_TPB)
    memSet[vertexGrid, MAX_TPB, myStream](nod, 0)

    # count all edges of the new array and who they belong to
    edgeGrid = compute_cuda_grid_dim(edges_size, MAX_TPB)
    countEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, fe, od, nod)

    # get the new first_edge array from the new outdegree
    nfe.copy_to_device(nod, stream=myStream)
    ex_prefix_sum_gpu(nfe, MAX_TPB=MAX_TPB, stream=myStream)

    # copy new first_edge to top_edge to serve as a pointer when adding edges
    top_edge = cuda.device_array_like(nfe, stream=myStream)
    top_edge.copy_to_device(nfe, stream=myStream)

    addEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, weight, fe, od,
                                          top_edge, ndest, nweight)

    del top_edge
    # del dest, weight, fe, od

    return ndest, nweight, nfe, nod
def arange(start, stop=None, step=1, dtype=np.int64):
    if stop is None:
        start, stop = 0, start
    size = (stop - start + (step - 1)) // step
    out = cuda.device_array(size, dtype=dtype)
    gpu_arange.forall(size)(start, size, step, out)
    return out
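# A minimal sketch of the element-wise kernel that `arange` above assumes;
# the real `gpu_arange` is defined elsewhere in the module, so the name and
# body here are illustrative only.
from numba import cuda

@cuda.jit
def gpu_arange_sketch(start, size, step, out):
    i = cuda.grid(1)  # global thread index
    if i < size:
        out[i] = start + i * step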
def gather(data, index, out=None):
    """Perform ``out = data[index]`` on the GPU
    """
    if out is None:
        out = cuda.device_array(shape=index.size, dtype=data.dtype)
    gpu_gather.forall(index.size)(data, index, out)
    return out
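# Illustrative sketch of the gather kernel assumed by the driver above (the
# project's `gpu_gather` lives elsewhere); one thread copies one element.
from numba import cuda

@cuda.jit
def gpu_gather_sketch(data, index, out):
    i = cuda.grid(1)
    if i < index.size:
        out[i] = data[index[i]]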
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = PRNG(PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths
    stream.synchronize()
def test_gufunc_stream(self):
    #cuda.driver.flush_pending_free()
    matrix_ct = 1001  # an odd number to test thread/block division in CUDA
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, 4)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, 5)

    ts = time()
    stream = cuda.stream()
    dA = cuda.to_device(A, stream)
    dB = cuda.to_device(B, stream)

    dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)
    dC = gufunc(dA, dB, out=dC, stream=stream)
    C = dC.copy_to_host(stream=stream)
    stream.synchronize()

    tcuda = time() - ts

    ts = time()
    Gold = ut.matrix_multiply(A, B)
    tcpu = time() - ts

    stream_speedups.append(tcpu / tcuda)

    self.assertTrue(np.allclose(C, Gold))
def prescan_test():
    a = np.arange(2048).astype(np.int32)
    reference = np.empty_like(a)
    ref_sum = scan.exprefixsumNumba(a, reference)

    a1 = np.arange(1024).astype(np.int32)
    a2 = np.arange(1024, 2048).astype(np.int32)
    ref1 = np.empty_like(a1)
    ref2 = np.empty_like(a2)
    ref_sum1 = scan.exprefixsumNumba(a1, ref1)
    ref_sum2 = scan.exprefixsumNumba(a2, ref2)

    dAux = cuda.device_array(2, dtype=np.int32)
    dA = cuda.to_device(a)
    sm_size = 1024 * a.dtype.itemsize
    scan.prescan[2, 512, 0, sm_size](dA, dAux)
    aux = dAux.copy_to_host()
    a_gpu = dA.copy_to_host()
    print("finish")
def sum_parts(data):
    """
    Driver for ``gpu_single_block_sum`` kernel
    """
    arr = np.asarray(data, dtype=np.float32)
    out = cuda.device_array(1, dtype=np.float32)
    gpu_single_block_sum[1, gpu_block_sum_max_blockdim](arr, out)
    return out.copy_to_host()[0]
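# Hedged sketch of a single-block shared-memory reduction like the kernel the
# driver above assumes; the block size is taken to be a power of two held in
# a module-level constant (here 512).  The project's actual
# `gpu_single_block_sum` may differ.
from numba import cuda, float32

gpu_block_sum_max_blockdim = 512

@cuda.jit
def gpu_single_block_sum_sketch(arr, out):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(gpu_block_sum_max_blockdim, float32)
    # accumulate a strided slice of the input into shared memory
    acc = 0.0
    i = tid
    while i < arr.size:
        acc += arr[i]
        i += cuda.blockDim.x
    sm[tid] = acc
    cuda.syncthreads()
    # tree reduction within the single block
    s = cuda.blockDim.x // 2
    while s > 0:
        if tid < s:
            sm[tid] += sm[tid + s]
        cuda.syncthreads()
        s //= 2
    if tid == 0:
        out[0] = sm[0]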
def astype(ary, dtype):
    if ary.dtype == np.dtype(dtype):
        return ary
    else:
        out = cuda.device_array(shape=ary.shape, dtype=dtype)
        configured = gpu_copy.forall(out.size)
        configured(ary, out)
        return out
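# Hedged sketch of the element-wise copy/cast kernel that `astype` above
# assumes (the real `gpu_copy` is defined elsewhere); the cast happens
# implicitly on assignment into the differently-typed output array.
from numba import cuda

@cuda.jit
def gpu_copy_sketch(inp, out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = inp[i]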
def apply_reduce(fn, inp):
    # allocate output+temp array
    outsz = libgdf.gdf_reduce_optimal_output_size()
    out = cuda.device_array(outsz, dtype=inp.dtype)
    # call reduction
    fn(inp.cffi_view, unwrap_devary(out), outsz)
    # return 1st element
    return out[0]
def __init__(self, shape, dtype, prealloc):
    self.device = cuda.get_current_device()
    self.freelist = deque()
    self.events = {}
    for i in range(prealloc):
        gpumem = cuda.device_array(shape=shape, dtype=dtype)
        self.freelist.append(gpumem)
        self.events[gpumem] = cuda.event(timing=False)
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2

    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [PRNG(PRNG.MRG32K3A, stream=strm) for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist,
                              zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
def test_stream_bind(self):
    stream = cuda.stream()
    with stream.auto_synchronize():
        arr = cuda.device_array(
            (3, 3),
            dtype=np.float64,
            stream=stream)
        self.assertEqual(arr.bind(stream).stream, stream)
        self.assertEqual(arr.stream, stream)
def mask_assign_slot(size, mask):
    # expand bits into bytes
    dtype = (np.int32 if size < 2 ** 31 else np.int64)
    expanded_mask = cuda.device_array(size, dtype=dtype)
    numtasks = min(64 * 128, expanded_mask.size)
    gpu_expand_mask_bits.forall(numtasks)(mask, expanded_mask)

    # compute prefixsum
    slots = prefixsum(expanded_mask)
    sz = int(slots[slots.size - 1])
    return slots, sz
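# Hedged sketch of the bit-expansion kernel assumed above, written for a
# byte-packed, little-endian validity mask (an assumption; the real
# `gpu_expand_mask_bits` may use a different mask layout).  A grid-stride
# loop is used because the driver launches fewer threads than elements.
from numba import cuda

@cuda.jit
def gpu_expand_mask_bits_sketch(bits, out):
    i = cuda.grid(1)
    stride = cuda.gridsize(1)
    while i < out.size:
        out[i] = (bits[i // 8] >> (i % 8)) & 1
        i += stride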
def test_device_array_interface(self):
    dary = cuda.device_array(shape=100)
    devicearray.verify_cuda_ndarray_interface(dary)

    ary = np.empty(100)
    dary = cuda.to_device(ary)
    devicearray.verify_cuda_ndarray_interface(dary)

    ary = np.asarray(1.234)
    dary = cuda.to_device(ary)
    self.assertEquals(dary.ndim, 1)
    devicearray.verify_cuda_ndarray_interface(dary)
def test_event_elapsed(self):
    N = 32
    dary = cuda.device_array(N, dtype=np.double)
    evtstart = cuda.event()
    evtend = cuda.event()

    evtstart.record()
    cuda.to_device(np.arange(N), to=dary)
    evtend.record()
    evtend.wait()
    evtend.synchronize()
    print(evtstart.elapsed_time(evtend))
def find_segments(arr):
    """Find beginning indices of runs of equal values.

    Returns
    -------
    starting_indices : device array
        The starting indices of start of segments.
        Total segment count will be equal to the length of this.
    """
    from . import _gdf

    # Compute diffs of consecutive elements
    markers = cuda.device_array(arr.size, dtype=np.int32)
    gpu_mark_segment_begins.forall(markers.size)(arr, markers)
    # Compute index of marked locations
    slots = prefixsum(markers)
    ct = slots[slots.size - 1]
    scanned = slots[:-1]
    # Compact segments
    begins = cuda.device_array(shape=int(ct), dtype=np.intp)
    gpu_scatter_segment_begins.forall(markers.size)(markers, scanned, begins)
    return begins
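# Hedged sketch of the two helper kernels `find_segments` assumes (the real
# ones live in the same module): markers[i] is 1 where a new run of equal
# values starts, and the exclusive scan of the markers gives each segment
# its output slot.
from numba import cuda

@cuda.jit
def gpu_mark_segment_begins_sketch(arr, markers):
    i = cuda.grid(1)
    if i < arr.size:
        markers[i] = 1 if i == 0 or arr[i] != arr[i - 1] else 0

@cuda.jit
def gpu_scatter_segment_begins_sketch(markers, scanned, begins):
    i = cuda.grid(1)
    if i < markers.size and markers[i]:
        begins[scanned[i]] = i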
def run_gather(self, arr, diffs):
    h_out_idx = np.zeros(1, dtype=np.intp)
    out_queue = cuda.device_array(shape=self._maxk, dtype=arr.dtype)
    gpu_insert_if_masked.forall(arr.size)(arr, diffs, h_out_idx, out_queue)
    qsz = h_out_idx[0]
    if self._maxk >= 0:
        if qsz > self._maxk:
            msg = 'too many unique values: unique values ({}) > k ({})'
            raise ValueError(msg.format(qsz, self._maxk))
        end = min(qsz, self._maxk)
    else:
        raise NotImplementedError('k is unbounded')
    vals = out_queue[:end]
    return vals
def append(self, other):
    """Append another column
    """
    if self.has_null_mask or other.has_null_mask:
        raise NotImplementedError("append masked column is not supported")
    newsize = len(self) + len(other)
    # allocate memory
    mem = cuda.device_array(shape=newsize, dtype=self.data.dtype)
    newbuf = Buffer.from_empty(mem)
    # copy into new memory
    for buf in [self.data, other.data]:
        newbuf.extend(buf.to_gpu_array())
    # return new column
    return self.replace(data=newbuf)
def test_event_elapsed_stream(self):
    N = 32
    stream = cuda.stream()
    dary = cuda.device_array(N, dtype=np.double)
    evtstart = cuda.event()
    evtend = cuda.event()

    evtstart.record(stream=stream)
    cuda.to_device(np.arange(N), to=dary, stream=stream)
    evtend.record(stream=stream)
    evtend.wait(stream=stream)
    evtend.synchronize()
    # Exercise the code path
    evtstart.elapsed_time(evtend)
def run(self, arr, k):
    if k >= MAX_FAST_UNIQUE_K:
        raise NotImplementedError('k >= {}'.format(MAX_FAST_UNIQUE_K))
    # setup mem
    outsz_ptr = cuda.device_array(shape=1, dtype=np.intp)
    out = cuda.device_array_like(arr)
    # kernel
    self._kernel[1, 64](arr, k, out, outsz_ptr)
    # copy to host
    unique_ct = outsz_ptr.copy_to_host()[0]
    if unique_ct < 0:
        raise ValueError('too many unique values (hint: increase k)')
    else:
        hout = out.copy_to_host()
        return hout[:unique_ct]
def _getOccupancyCUDA(coords, centers, channelsigmas, trunc=5, device=0,
                      resD=None, asnumpy=True, threadsperblock=256):
    # cuda.select_device(device)
    if resD is None:
        resD = cuda.device_array((centers.shape[0], channelsigmas.shape[1]),
                                 dtype=np.float32)
    _memsetArray(resD, val=0)

    natomblocks = int(np.ceil(coords.shape[0] / threadsperblock))
    blockspergrid = (centers.shape[0], natomblocks)

    centers = cuda.to_device(centers)
    coords = cuda.to_device(coords)
    channelsigmas = cuda.to_device(channelsigmas)

    _getOccupancyCUDAkernel[blockspergrid, threadsperblock](
        resD, coords, centers, channelsigmas, trunc * trunc)
    if asnumpy:
        return resD.copy_to_host()
def test_ufunc_arg(self):
    @vectorize(['f8(f8, f8)'], target='cuda')
    def vadd(a, b):
        return a + b

    # Case 1: use custom array as argument
    h_arr = np.random.random(10)
    arr = MyArray(cuda.to_device(h_arr))
    val = 6
    out = vadd(arr, val)
    np.testing.assert_array_equal(out.copy_to_host(), h_arr + val)

    # Case 2: use custom array as return
    out = MyArray(cuda.device_array(h_arr.shape))
    returned = vadd(h_arr, val, out=out)
    np.testing.assert_array_equal(returned.copy_to_host(), h_arr + val)
def test_gufunc_arg(self):
    @guvectorize(['(f8, f8, f8[:])'], '(),()->()', target='cuda')
    def vadd(inp, val, out):
        out[0] = inp + val

    # Case 1: use custom array as argument
    h_arr = np.random.random(10)
    arr = MyArray(cuda.to_device(h_arr))
    val = np.float64(7)
    out = vadd(arr, val)
    np.testing.assert_array_equal(out.copy_to_host(), h_arr + val)

    # Case 2: use custom array as return
    out = MyArray(cuda.device_array(h_arr.shape))
    returned = vadd(h_arr, val, out=out)
    np.testing.assert_array_equal(returned.copy_to_host(), h_arr + val)
    self.assertEqual(returned.device_ctypes_pointer.value,
                     out._arr.device_ctypes_pointer.value)
def test_gufunc_stream(self):
    @guvectorize([void(float32[:, :], float32[:, :], float32[:, :])],
                 '(m,n),(n,p)->(m,p)', target='cuda')
    def matmulcore(A, B, C):
        m, n = A.shape
        n, p = B.shape
        for i in range(m):
            for j in range(p):
                C[i, j] = 0
                for k in range(n):
                    C[i, j] += A[i, k] * B[k, j]

    gufunc = matmulcore
    gufunc.max_blocksize = 512

    #cuda.driver.flush_pending_free()
    matrix_ct = 1001  # an odd number to test thread/block division in CUDA
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, 4)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, 5)

    ts = time()
    stream = cuda.stream()
    dA = cuda.to_device(A, stream)
    dB = cuda.to_device(B, stream)

    dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)
    dC = gufunc(dA, dB, out=dC, stream=stream)
    C = dC.copy_to_host(stream=stream)
    stream.synchronize()

    tcuda = time() - ts

    ts = time()
    Gold = ut.matrix_multiply(A, B)
    tcpu = time() - ts

    stream_speedups.append(tcpu / tcuda)

    self.assertTrue(np.allclose(C, Gold))
def cuda_dot2(a, b):
    if a.shape[1] != b.shape[0]:
        raise ValueError('shape %s does not match shape %s'
                         % (str(a.shape), str(b.shape)))
    try:
        c = np.ones((a.shape[0], b.shape[1]), dtype=a.dtype)
        TPB = 16
        threadsperblock = (TPB, TPB)
        blockspergrid_x = math.ceil(a.shape[0] / threadsperblock[0])
        blockspergrid_y = math.ceil(b.shape[1] / threadsperblock[1])
        blockspergrid = (blockspergrid_x, blockspergrid_y)

        da = cuda.to_device(a)
        db = cuda.to_device(b)
        dc = cuda.device_array((a.shape[0], b.shape[1]), dtype=a.dtype)

        cuda_dot_kernel2[blockspergrid, threadsperblock](da, db, dc)
        c = dc.copy_to_host()
        return c
    except:
        cuda.close()
        raise
def __init__(self, data, block_size, n_iter, seed=0):
    self.data = cuda.to_device(data)
    self.n_iter = n_iter

    # Parameters for the kernel launch
    self.block_size = block_size
    self.n_samples = data.shape[0]
    self.n_blocks = self.n_samples // block_size

    # Allocate an output array on the GPU
    self.output = cuda.device_array((n_iter, self.n_blocks, 2))

    # Create random number generators for each thread
    # NOTE: The threads within the same block should generate the same
    # random numbers
    rng_states = np.empty(self.n_samples, dtype=xoroshiro128p_dtype)
    for i in range(self.n_samples):
        init_xoroshiro128p_state(rng_states, i, seed)  # Init to a fixed state
        for j in range(i // block_size):
            # Jump forward block_index * 2^64 steps
            xoroshiro128p_jump(rng_states, i)
    self.rng_states = cuda.to_device(rng_states)  # Copy it to the GPU
def test_sum(dtype, nelem):
    data = gen_rand(dtype, nelem)
    d_data = cuda.to_device(data)
    d_result = cuda.device_array(libgdf.gdf_reduce_optimal_output_size(),
                                 dtype=d_data.dtype)

    col_data = new_column()
    gdf_dtype = get_dtype(dtype)

    libgdf.gdf_column_view(col_data, unwrap_devary(d_data), ffi.NULL, nelem,
                           gdf_dtype)
    libgdf.gdf_sum_generic(col_data, unwrap_devary(d_result), d_result.size)

    got = d_result.copy_to_host()[0]
    expect = data.sum()

    print('expect:', expect)
    print('got:', got)

    np.testing.assert_array_almost_equal(expect, got)
def neighbour_list(self):
    with cuda.gpus[self.gpu]:
        while True:
            # reset situation while building nlist
            cu_set_to_int[self.bpg, self.tpb](self.d_nc, 0)
            self.cu_nlist[self.bpg, self.tpb](
                self.system.d_x, self.d_last_x, self.system.d_box,
                self.r_cut2, self.clist.d_cell_map, self.clist.d_cell_list,
                self.clist.d_cell_counts, self.clist.d_cells,
                self.d_nl, self.d_nc, self.d_n_max, self.d_situation)
            self.d_n_max.copy_to_host(self.p_n_max)
            cuda.synchronize()
            # n_max = np.array([120])
            if self.p_n_max[0] > self.n_guess:
                self.n_guess = self.p_n_max[0]
                self.n_guess = self.n_guess + 8 - (self.n_guess & 7)
                self.d_nl = cuda.device_array(
                    (self.system.N, self.n_guess), dtype=np.int32)
            else:
                break
def cuda_conv(x, K):
    B = x.shape[0]
    H = x.shape[1]
    W = x.shape[2]
    k = K.shape[0]
    Cout = K.shape[-1]

    A_global_mem = cuda.to_device(x)
    B_global_mem = cuda.to_device(K)
    C_global_mem = cuda.device_array((B, H - k + 1, W - k + 1, Cout))

    threadsperblock = (16, 16, 4)
    blockspergrid = (int(math.ceil((H - k + 1) / threadsperblock[0])),
                     int(math.ceil((W - k + 1) / threadsperblock[1])),
                     int(math.ceil(Cout / threadsperblock[2])))

    conv[blockspergrid, threadsperblock](A_global_mem, B_global_mem,
                                         C_global_mem)
    return C_global_mem.copy_to_host()
def process(self, inputs):
    df = inputs['points_df_in']
    # DEBUGGING
    # try:
    #     from dask.distributed import get_worker
    #     worker = get_worker()
    #     print('worker{} process NODE "{}" worker: {}'.format(
    #         worker.name, self.uid, worker))
    # except (ValueError, ImportError):
    #     pass
    number_of_threads = 16
    number_of_blocks = ((len(df) - 1) // number_of_threads) + 1
    # Allocate the output array; the kernel writes every index.
    darr = cuda.device_array(len(df))
    distance_kernel[(number_of_blocks,), (number_of_threads,)](
        df['x'],
        df['y'],
        darr,
        len(df))
    df['distance_numba'] = darr
    return {'distance_df': df}
def cuda_operation():
    """Performs Vectorized Operations on GPU"""
    x = real_estate_array()
    y = real_estate_array()

    print("Moving calculations to GPU memory")
    x_device = cuda.to_device(x)
    y_device = cuda.to_device(y)
    out_device = cuda.device_array(
        shape=(x_device.shape[0], x_device.shape[1]), dtype=np.float32)
    print(x_device)
    print(x_device.shape)
    print(x_device.dtype)

    print("Calculating on GPU")
    add_ufunc(x_device, y_device, out=out_device)

    out_host = out_device.copy_to_host()
    print(f"Calculations from GPU {out_host}")
def update(self):
    with cuda.gpus[self.gpu]:
        while True:
            cu_set_to_int[self.bpg_cell, self.tpb](self.d_cell_counts, 0)
            cu_cell_list[self.bpg, self.tpb](self.system.d_x,
                                             self.system.d_box,
                                             self.d_ibox,
                                             self.d_cell_list,
                                             self.d_cell_counts,
                                             self.d_cells,
                                             self.d_cell_max)
            self.d_cell_max.copy_to_host(self.p_cell_max)
            cuda.synchronize()
            if self.p_cell_max[0] > self.cell_guess:
                self.cell_guess = self.p_cell_max[0]
                self.cell_guess = self.cell_guess + 8 - (self.cell_guess & 7)
                self.d_cell_list = cuda.device_array(
                    (self.n_cell, self.cell_guess, self.system.n_dim + 1),
                    dtype=self.system.dtype)
            else:
                break
def forward(self, input):
    batch_num, _, height, width = input.shape
    H_out = int((height + 2 * self.padding - self.weight.shape[2])
                / self.stride) + 1
    W_out = int((width + 2 * self.padding - self.weight.shape[3])
                / self.stride) + 1
    output_shape = (batch_num, self.out_channels, H_out, W_out)

    d_output = cuda.device_array(output_shape, dtype=np.float32)

    blockdim = (10, 10, 10)
    griddim_0 = ceil(batch_num * self.out_channels / blockdim[0])
    griddim_1 = ceil(H_out / blockdim[1])
    griddim_2 = ceil(W_out / blockdim[2])
    griddim = (griddim_0, griddim_1, griddim_2)

    conv_step_gpu[griddim, blockdim](input, d_output, self.d_weight,
                                     self.stride, self.mod, self.filter_size,
                                     self.padding)
    return d_output
def prefixsum(vals):
    """Compute the full prefixsum.

    Given an input of size N, the output size is N + 1.
    The first value is always 0.  The last value is the sum of *vals*.
    """
    from . import _gdf

    # Allocate output
    slots = cuda.device_array(shape=vals.size + 1, dtype=vals.dtype)
    # Fill 0 to slot[0]
    gpu_fill_value[1, 1](slots[:1], 0)
    # Compute prefixsum on the mask
    _gdf.apply_prefixsum(_gdf.columnview_from_devary(vals),
                         _gdf.columnview_from_devary(slots[1:]),
                         inclusive=True)
    return slots
def query_execute(df, expr, callenv):
    """Compile & execute the query expression

    Note: the expression is compiled and cached for future reuse.

    Parameters
    ----------
    df : DataFrame
    expr : str
        boolean expression
    callenv : dict
        Contains keys 'locals' and 'globals' which are both dict.
        They represent the local and global dictionaries of the caller.
    """
    # compile
    compiled = query_compile(expr)
    kernel = compiled['kernel']
    # process env args
    envargs = []
    envdict = callenv['globals'].copy()
    envdict.update(callenv['locals'])
    for name in compiled['refnames']:
        name = name[len(ENVREF_PREFIX):]
        try:
            val = envdict[name]
            if isinstance(val, dt.datetime):
                val = np.datetime64(val)
        except KeyError:
            msg = '{!r} not defined in the calling environment'
            raise NameError(msg.format(name))
        else:
            envargs.append(val)
    # prepare col args
    colarrays = [df[col].to_gpu_array() for col in compiled['colnames']]
    # allocate output buffer
    nrows = len(df)
    out = cuda.device_array(nrows, dtype=np.bool_)
    # run kernel
    args = [out] + colarrays + envargs
    kernel.forall(nrows)(*args)
    return out
def test_sum_masked(nelem):
    dtype = np.float64
    data = gen_rand(dtype, nelem)
    mask = gen_rand(np.int8, (nelem + 8 - 1) // 8)

    d_data = cuda.to_device(data)
    d_mask = cuda.to_device(mask)
    d_result = cuda.device_array(libgdf.gdf_reduce_optimal_output_size(),
                                 dtype=d_data.dtype)

    col_data = new_column()
    gdf_dtype = get_dtype(dtype)

    libgdf.gdf_column_view(col_data, unwrap_devary(d_data),
                           unwrap_devary(d_mask), nelem, gdf_dtype)
    libgdf.gdf_sum_generic(col_data, unwrap_devary(d_result), d_result.size)
    got = d_result.copy_to_host()[0]

    boolmask = buffer_as_bits(mask)[:nelem]
    expect = data[boolmask].sum()

    np.testing.assert_almost_equal(expect, got)
def main(image1, image2):
    data_image1 = np.array(image1)
    data_image2 = np.array(image2)
    print(data_image1.shape, data_image2.shape)

    threadsperblock = (16, 16, 4)
    # Use true division inside ceil(); floor division here would under-count
    # blocks whenever a dimension is not an exact multiple of the block size.
    blocksper_x = int(math.ceil(data_image1.shape[0] / threadsperblock[0]))
    blocksper_y = int(math.ceil(data_image1.shape[1] / threadsperblock[1]))
    blocksper_z = int(math.ceil(data_image1.shape[2] / threadsperblock[2]))
    blockspergrid = (blocksper_x, blocksper_y, blocksper_z)

    input1 = cuda.to_device(data_image1)
    input2 = cuda.to_device(data_image2)
    output = cuda.device_array(data_image1.shape)

    sumImages[blockspergrid, threadsperblock](input1, input2, output)

    out = output.copy_to_host()
    out = out.astype('uint8')
    out = Image.fromarray(out)
    out.save("out.png")
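# Hedged sketch of the element-wise image-sum kernel the driver above assumes
# (the real `sumImages` is defined elsewhere); one thread per
# (row, column, channel) position.
from numba import cuda

@cuda.jit
def sumImages_sketch(img1, img2, out):
    i, j, c = cuda.grid(3)
    if i < out.shape[0] and j < out.shape[1] and c < out.shape[2]:
        out[i, j, c] = img1[i, j, c] + img2[i, j, c]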
def distance_cal(x, y):
    Nx = x.size
    Ny = y.size
    points = [[1, 0], [-1, 0], [0, 1]]

    d_x = cuda.to_device(x)
    d_y = cuda.to_device(y)
    d_points = cuda.to_device(np.array(points))
    d_out = cuda.device_array((Nx, Ny))

    TPBX = 8
    TPBY = 8
    gridDims = ((Nx + TPBX - 1) // TPBX, (Ny + TPBY - 1) // TPBY)
    blockDims = (TPBX, TPBY)
    distance_kernel[gridDims, blockDims](d_x, d_y, d_points, d_out)
    return d_out.copy_to_host()
def cu_mat_2d_to_4d(A, dim, from_host=False, to_host=False):
    """
    :param A: 2d matrix
    :param dim: target 4d shape
    :param from_host: copy A from the host to the device first
    :param to_host: copy the result back to the host
    :return: the reshaped 4d matrix and A's shape
    """
    if from_host:
        A = cuda.to_device(A.astype(np.float32))
    assert len(A.shape) == 2 and A.shape[0] == dim[0]
    res = cuda.device_array(shape=(dim[0], dim[1], dim[2], dim[3]),
                            dtype=np.float32)
    grid_dim, block_dim = auto_detect(A.shape)
    _cu_mat_2d_to_4d[grid_dim, block_dim](A, res)
    if to_host:
        return res.copy_to_host(), A.shape
    else:
        return res, A.shape
def WT_as_f(x, k, c, L):
    # fundamental frequency
    k1 = 2.0 * np.pi / L

    # Set up kernel launch dimensions
    blockSize = (TPB, TPB)
    numBlocksX = (x.shape[0] + blockSize[0] - 1) // blockSize[0]
    numBlocksK = (k.shape[0] + blockSize[1] - 1) // blockSize[1]
    numBlocks = (numBlocksX, numBlocksK)

    # output on device
    dW = cuda.device_array((x.shape[0], k.shape[0]), np.complex128)
    # input
    dC = cuda.to_device(np.ascontiguousarray(c))
    dx = cuda.to_device(x)
    dk = cuda.to_device(k)

    # call kernel
    wignerKernel[numBlocks, blockSize](dC, dx, dk, dW, k1)

    return 2.0 * dW.copy_to_host()
def relu(input, device=None):
    # Note: the original condition was "device is (None or 'gpu')", which only
    # ever compares against 'gpu'; use equality checks instead of identity.
    if device in (None, 'gpu'):
        if cuda.is_available():
            device = 'gpu'
        else:
            device = 'cpu'
    if device == 'gpu':
        batch_num, channels, height, width = input.shape
        d_output = cuda.device_array(input.shape, dtype=np.float32)

        blockdim = (10, 10, 10)
        griddim_0 = ceil(batch_num * channels / blockdim[0])
        griddim_1 = ceil(height / blockdim[1])
        griddim_2 = ceil(width / blockdim[2])
        griddim = (griddim_0, griddim_1, griddim_2)

        relu_gpu[griddim, blockdim](input, d_output)
        return d_output
    else:
        return relu_cpu(input)
def NNGPU_class(X, W, config):
    maxNeuronas = max(config)
    config = np.append(X.shape[0], config[:])
    # Initialize the weight matrix
    maxPesos = max(config)

    threadsPerBlock = maxNeuronas  # each block computes one neuron
    blocksPerGrid = maxNeuronas

    # Move the required data to the GPU
    Wg = cuda.to_device(W)
    Xg = cuda.to_device(X)
    configG = cuda.to_device(config)
    output = cuda.device_array([config.shape[0], config.max()])

    NNCUDA_class[blocksPerGrid, threadsPerBlock](Xg, Wg, configG, output)
    ret = output.copy_to_host()
    # P = Wg.copy_to_host()
    # print(P)
    return ret[-1]
def apply_segsort(col_keys, col_vals, segments, descending=False):
    """In-place segmented sort

    Parameters
    ----------
    col_keys : Column
    col_vals : Column
    segments : device array
    """
    # prepare
    nelem = len(col_keys)
    seg_dtype = np.uint32

    d_fullsegs = cuda.device_array(segments.size + 1, dtype=seg_dtype)
    d_begins = d_fullsegs[:-1]
    d_ends = d_fullsegs[1:]

    # Note: .astype is required below because .copy_to_device
    # is just a plain memcpy
    d_begins.copy_to_device(cudautils.astype(segments, dtype=seg_dtype))
    d_ends[-1:].copy_to_device(np.require([nelem], dtype=seg_dtype))

    begin_bit = 0
    end_bit = col_keys.dtype.itemsize * 8

    sizeof_key = col_keys.data.dtype.itemsize
    sizeof_val = col_vals.data.dtype.itemsize

    # sort
    plan = libgdf.gdf_segmented_radixsort_plan(nelem, descending, begin_bit,
                                               end_bit)
    try:
        libgdf.gdf_segmented_radixsort_plan_setup(plan, sizeof_key, sizeof_val)
        libgdf.gdf_segmented_radixsort_generic(plan,
                                               col_keys.cffi_view,
                                               col_vals.cffi_view,
                                               segments.size,
                                               unwrap_devary(d_begins),
                                               unwrap_devary(d_ends))
    finally:
        libgdf.gdf_segmented_radixsort_plan_free(plan)
def condense(u, buffer):
    # u = input voxel model
    # buffer = number of layers of voxels around the boundaries that are
    #          left empty
    # Outputs a new matrix that is fitted to the input voxel model, removing
    # layers that don't store geometry.
    m, n, p = u.shape
    TPBX, TPBY, TPBZ = TPB, TPB, TPB
    minX, maxX, minY, maxY, minZ, maxZ = -1, -1, -1, -1, -1, -1
    i, j, k = 0, 0, 0
    while minX < 0:
        if np.amin(u[i, :, :]) < 0:
            minX = i
        else:
            i += 1
    while minY < 0:
        if np.amin(u[:, j, :]) < 0:
            minY = j
        else:
            j += 1
    while minZ < 0:
        if np.amin(u[:, :, k]) < 0:
            minZ = k
        else:
            k += 1
    i, j, k = 1, 1, 1
    while maxX < 0:
        if np.amin(u[m - i, :, :]) < 0:
            maxX = m - i
        else:
            i += 1
    while maxY < 0:
        if np.amin(u[:, n - j, :]) < 0:
            maxY = n - j
        else:
            j += 1
    while maxZ < 0:
        if np.amin(u[:, :, p - k]) < 0:
            maxZ = p - k
        else:
            k += 1
    xSize = (np.ceil((2 * buffer + maxX - minX) / TPB) * TPB).astype(int)
    ySize = (np.ceil((2 * buffer + maxY - minY) / TPB) * TPB).astype(int)
    zSize = (np.ceil((2 * buffer + maxZ - minZ) / TPB) * TPB).astype(int)
    d_u = cuda.to_device(u)
    d_uCondensed = cuda.device_array(shape=[xSize, ySize, zSize],
                                     dtype=np.float32)
    gridDims = ((xSize + TPBX - 1) // TPBX,
                (ySize + TPBY - 1) // TPBY,
                (zSize + TPBZ - 1) // TPBZ)
    blockDims = TPBX, TPBY, TPBZ
    condenseKernel[gridDims, blockDims](d_u, d_uCondensed, buffer,
                                        minX, minY, minZ)
    return d_uCondensed.copy_to_host()
def AveragesOnShellsInnerLogicCCuda(retNowR_global_mem,
                                    retNowI_global_mem,
                                    n1ofR_global_mem,
                                    n2ofR_global_mem,
                                    NumAtROutPre_global_mem,
                                    End,
                                    Start,
                                    NumOnSurf,
                                    r):
    threadsperblock = (1024, 1, 1)
    blockspergrid_x = int(math.ceil(
        retNowR_global_mem[r][:NumOnSurf].shape[0] / threadsperblock[0]))
    blockspergrid = (blockspergrid_x, 1, 1)

    # set up stream
    stream = cuda.stream()

    device_array_start = time.time()
    reduced_global_mem = cuda.device_array((4, (End - Start)))
    print("Shape of NumAtROutPre_global_mem is ", NumAtROutPre_global_mem.shape)
    print("Shape of reduced_global_mem is ", reduced_global_mem.shape)
    # print("Time to create cuda.device_array is ",
    #       time.time() - device_array_start)

    filter_time = time.time()
    # Launch configuration is [blocks per grid, threads per block];
    # the original passed the two tuples in the reverse order.
    cuda_kernels.filter_and_sum[blockspergrid, threadsperblock](
        retNowR_global_mem,
        retNowI_global_mem,
        n1ofR_global_mem,
        n2ofR_global_mem,
        NumAtROutPre_global_mem,
        reduced_global_mem,
        End,
        Start,
        NumOnSurf,
        r)
    stream.synchronize()
    print("Time to complete filter_and_sum is ", time.time() - filter_time)

    reduced_start = time.time()
    reduced = reduced_global_mem.copy_to_host()
    print("Time to transfer reduced to host is ", time.time() - reduced_start)
    return reduced
def cuda_deposition_arrays(Nz=None, Nr=None, fieldtype=None):
    """
    Create empty arrays on the GPU for the charge and current deposition
    in each of the 4 possible directions.

    ###########################################
    # Needs to be moved to the fields package!
    ###########################################

    Parameters
    ----------
    Nz : int
        Number of cells in z.
    Nr : int
        Number of cells in r.
    fieldtype : string
        Either 'rho' or 'J'.
    """
    # Create empty arrays to store the four different possible
    # cell directions a particle can deposit to.
    if fieldtype == 'rho':
        # Rho - third dimension represents 2 modes
        rho0 = cuda.device_array(shape=(Nz, Nr, 2), dtype=np.complex128)
        rho1 = cuda.device_array(shape=(Nz, Nr, 2), dtype=np.complex128)
        rho2 = cuda.device_array(shape=(Nz, Nr, 2), dtype=np.complex128)
        rho3 = cuda.device_array(shape=(Nz, Nr, 2), dtype=np.complex128)
        return rho0, rho1, rho2, rho3

    if fieldtype == 'J':
        # J - third dimension represents 2 modes
        # times 3 dimensions (r, t, z)
        J0 = cuda.device_array(shape=(Nz, Nr, 6), dtype=np.complex128)
        J1 = cuda.device_array(shape=(Nz, Nr, 6), dtype=np.complex128)
        J2 = cuda.device_array(shape=(Nz, Nr, 6), dtype=np.complex128)
        J3 = cuda.device_array(shape=(Nz, Nr, 6), dtype=np.complex128)
        return J0, J1, J2, J3
def copy_to_dense(data, mask, out=None):
    """Copy *data* with validity bits in *mask* into *out*.

    The output array can be specified in `out`.

    Return a 2-tuple of:
    * number of non-null elements
    * a dense gpu array given the data and mask gpu arrays.
    """
    slots, sz = mask_assign_slot(size=data.size, mask=mask)
    if out is None:
        # output buffer is not provided
        # allocate one
        alloc_shape = sz
        out = cuda.device_array(shape=alloc_shape, dtype=data.dtype)
    else:
        # output buffer is provided
        # check it
        if sz >= out.size:
            raise ValueError('output array too small')
    gpu_copy_to_dense.forall(data.size)(data, mask, slots, out)
    return (sz, out)
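# Hedged sketch of the compaction kernel `copy_to_dense` assumes, again
# written for a byte-packed validity mask (an assumption); slots[i] is the
# prefix-sum output position of row i, so valid rows scatter into a dense
# output.  The project's real `gpu_copy_to_dense` is defined elsewhere.
from numba import cuda

@cuda.jit(device=True)
def mask_get_sketch(mask, pos):
    return (mask[pos // 8] >> (pos % 8)) & 1

@cuda.jit
def gpu_copy_to_dense_sketch(data, mask, slots, out):
    i = cuda.grid(1)
    if i < data.size and mask_get_sketch(mask, i):
        out[slots[i]] = data[i]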
def apply_label(arr, cats, dtype, na_sentinel):
    """
    Parameters
    ----------
    arr : device array
        data
    cats : device array
        Unique category value
    dtype : np.dtype
        output array dtype
    na_sentinel : int
        Value to indicate missing value

    Returns
    -------
    result : device array
    """
    encs = np.asarray(list(range(cats.size)))
    d_encs = to_device(encs)
    out = cuda.device_array(shape=arr.size, dtype=dtype)
    configured = gpu_label.forall(out.size)
    configured(arr, cats, d_encs, na_sentinel, out)
    return out
def test_gufunc_stream(self):
    @guvectorize(
        [void(float32[:, :], float32[:, :], float32[:, :])],
        "(m,n),(n,p)->(m,p)",
        target="cuda",
    )
    def matmulcore(A, B, C):
        m, n = A.shape
        n, p = B.shape
        for i in range(m):
            for j in range(p):
                C[i, j] = 0
                for k in range(n):
                    C[i, j] += A[i, k] * B[k, j]

    gufunc = matmulcore
    gufunc.max_blocksize = 512

    # cuda.driver.flush_pending_free()
    matrix_ct = 1001  # an odd number to test thread/block division in CUDA
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, 4)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, 5)

    stream = cuda.stream()
    dA = cuda.to_device(A, stream)
    dB = cuda.to_device(B, stream)

    dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)
    dC = gufunc(dA, dB, out=dC, stream=stream)
    C = dC.copy_to_host(stream=stream)
    stream.synchronize()

    Gold = ut.matrix_multiply(A, B)

    self.assertTrue(np.allclose(C, Gold))
def softmax_backprop_kernel_wrapper(d_d_L_d_out, d_weight, d_maxpoolOutput,
                                    d_postSoftmax, numImage, d_d_L_d_input,
                                    d_d_L_d_w, d_d_L_d_b, blockSize=(32, 32)):
    # Computing d_d_L_d_preSoftmax also fills d_d_L_d_b, since both names
    # refer to the same memory.
    d_d_L_d_preSoftmax = d_d_L_d_b
    gridSize = math.ceil(numImage / blockSize[1])
    softmax_backprop_kernel[gridSize, blockSize[1]](d_d_L_d_out, d_postSoftmax,
                                                    numImage,
                                                    d_d_L_d_preSoftmax)

    # Compute d_d_L_d_w
    d_d_L_d_preSoftmaxReshape = d_d_L_d_preSoftmax[:numImage].reshape(
        numImage, d_d_L_d_preSoftmax.shape[1], 1)
    d_maxpoolOutputReshape = d_maxpoolOutput[:numImage].reshape(
        numImage, 1, d_weight.shape[1])
    gridSize = (math.ceil(d_maxpoolOutputReshape.shape[2] / blockSize[0]),
                math.ceil(d_d_L_d_preSoftmaxReshape.shape[1] / blockSize[1]),
                d_d_L_d_preSoftmaxReshape.shape[0])
    dot_3D_kernel[gridSize, blockSize](d_d_L_d_preSoftmaxReshape,
                                       d_maxpoolOutputReshape, d_d_L_d_w)

    # Compute d_d_L_d_input
    d_d_L_d_input_temp = cuda.device_array((numImage, 1, d_weight.shape[1]),
                                           dtype=float)
    d_d_L_d_preSoftmaxReshape = d_d_L_d_preSoftmax[:numImage].reshape(
        numImage, 1, d_d_L_d_preSoftmax.shape[1])
    gridSize = (math.ceil(d_d_L_d_input_temp.shape[2] / blockSize[0]),
                math.ceil(d_d_L_d_input_temp.shape[1] / blockSize[1]),
                d_d_L_d_input_temp.shape[0])
    dot_3D2D_kernel[gridSize, blockSize](d_d_L_d_preSoftmaxReshape, d_weight,
                                         d_d_L_d_input_temp)

    d_d_L_d_input[0, :numImage] = d_d_L_d_input_temp[:numImage].reshape(
        d_maxpoolOutput[:numImage].shape)
def matmul(A, B, matmultype='forward'):
    global PARALLELIZE
    if not PARALLELIZE:
        # NORMAL
        gradient = np.zeros((A.shape[0], B.shape[1]))
        for i in range(A.shape[0]):
            for j in range(B.shape[1]):
                for k in range(A.shape[1]):
                    gradient[i][j] += A[i][k] * B[k][j]
        return gradient

    # PARALLELIZED
    global global_feat
    global global_feat_val
    global global_feat_transpose

    B_global_mem = cuda.to_device(B)
    C_global_mem = cuda.device_array((A.shape[0], B.shape[1]))

    # Configure the blocks
    threadsperblock = (TPB, TPB)
    blockspergrid_x = int(math.ceil(A.shape[0] / threadsperblock[1]))
    blockspergrid_y = int(math.ceil(B.shape[1] / threadsperblock[0]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    # Start the kernel
    if matmultype == 'forward':
        matmul_kernel[blockspergrid, threadsperblock](global_feat,
                                                      B_global_mem,
                                                      C_global_mem)
    elif matmultype == 'validation':
        matmul_kernel[blockspergrid, threadsperblock](global_feat_val,
                                                      B_global_mem,
                                                      C_global_mem)
    elif matmultype == 'backward':
        matmul_kernel[blockspergrid, threadsperblock](global_feat_transpose,
                                                      B_global_mem,
                                                      C_global_mem)

    res = C_global_mem.copy_to_host()
    return res
def main():
    n = 20000000
    x = np.arange(n).astype(np.int32)
    y = 2 * x

    x_device = cuda.to_device(x)
    y_device = cuda.to_device(y)
    out_device = cuda.device_array(n)

    threads_per_block = 1024
    blocks_per_grid = math.ceil(n / threads_per_block)

    start = time()
    gpu_add[blocks_per_grid, threads_per_block](x_device, y_device,
                                                out_device, n)
    # gpu_result = out_device.copy_to_host()
    cuda.synchronize()
    print("gpu vector add time " + str(time() - start))

    start = time()
    gpu_add_stride[80, threads_per_block](x_device, y_device, out_device, n)
    # gpu_result = out_device.copy_to_host()
    cuda.synchronize()
    print("gpu stride vector add time " + str(time() - start))
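# Hedged sketch of the two kernels the benchmark above assumes (the real
# `gpu_add` and `gpu_add_stride` live elsewhere): one thread per element
# versus a grid-stride loop that lets a fixed-size grid cover any n.
from numba import cuda

@cuda.jit
def gpu_add_sketch(x, y, out, n):
    i = cuda.grid(1)
    if i < n:
        out[i] = x[i] + y[i]

@cuda.jit
def gpu_add_stride_sketch(x, y, out, n):
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    for i in range(start, n, stride):
        out[i] = x[i] + y[i]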
def query_execute(df, expr, callenv):
    """Compile & execute the query expression

    Note: the expression is compiled and cached for future reuse.

    Parameters
    ----------
    df : DataFrame
    expr : str
        boolean expression
    callenv : dict
        Contains keys 'locals' and 'globals' which are both dict.
        They represent the local and global dictionaries of the caller.
    """
    # compile
    compiled = query_compile(expr)
    kernel = compiled['kernel']
    # process env args
    envargs = []
    envdict = callenv['globals'].copy()
    envdict.update(callenv['locals'])
    for name in compiled['refnames']:
        name = name[len(ENVREF_PREFIX):]
        try:
            val = envdict[name]
        except KeyError:
            msg = '{!r} not defined in the calling environment'
            raise NameError(msg.format(name))
        else:
            envargs.append(val)
    # prepare col args
    colarrays = [df[col].to_gpu_array() for col in compiled['colnames']]
    # allocate output buffer
    nrows = len(df)
    out = cuda.device_array(nrows, dtype=np.bool_)
    # run kernel
    args = [out] + colarrays + envargs
    kernel.forall(nrows)(*args)
    return out
def matrix_mult(m1, m2):
    A_global_mem = cuda.to_device(m1)
    B_global_mem = cuda.to_device(m2)

    # Allocate memory on the device for the result
    C_global_mem = cuda.device_array(
        (A_global_mem.shape[0], B_global_mem.shape[1]))

    # Configure the blocks
    threadsperblock = (32, 32)
    blockspergrid_x = int(math.ceil(m1.shape[0] / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(m2.shape[1] / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    # Start the kernel
    matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem,
                                           C_global_mem)
    # fast_matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem,
    #                                             C_global_mem)
    # out = matmul_cuda(A_global_mem, B_global_mem)

    # Copy the result back to the host
    return C_global_mem.copy_to_host()
def host_naive(A, B):
    '''host code for calling the naive kernel
    '''
    A = np.array(A)
    B = np.array(B)
    m = A.shape[0]
    n = B.shape[1]
    C = np.full((m, n), 0, dtype=np.float64)

    d_A = cuda.to_device(A)  # d_ --> device
    d_B = cuda.to_device(B)
    d_C = cuda.device_array(C.shape, np.float64)

    threadsperblock = (TPB, TPB)
    blockspergrid_x = math.ceil(A.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(B.shape[1] / threadsperblock[1])
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    mat_mul_naive_kernal[blockspergrid, threadsperblock](d_A, d_B, d_C)

    return d_C.copy_to_host()
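# Hedged sketch of the naive per-element matmul kernel assumed above (the
# project's `mat_mul_naive_kernal` is defined elsewhere); each thread owns
# one output element C[i, j].
from numba import cuda

@cuda.jit
def mat_mul_naive_kernel_sketch(A, B, C):
    i, j = cuda.grid(2)
    if i < C.shape[0] and j < C.shape[1]:
        tmp = 0.0
        for k in range(A.shape[1]):
            tmp += A[i, k] * B[k, j]
        C[i, j] = tmp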
def __init__(self, shape, blocks, coupling=None, temperature=0, field=0):
    """
    :param shape: number of spins along each axis
    :param blocks: size of subdivisions along each axis, for parallel
        computation
    :param coupling: coupling array, must be (2*r+1)x...x(2*r+1) with
        (r,...,r) denoting the center.
    :param temperature: unitless temperature
    :param field: unitless applied field
    """
    self.num_dim = len(shape)
    self.shape = np.array(shape, dtype=np.int32)
    self.num_spins = np.prod(self.shape)
    self.blocks = np.array(blocks, dtype=np.int32)
    if np.any(self.shape & (self.shape - 1)):
        raise ValueError("Shape must consist of powers of 2")
    if self.shape[-1] % 8:
        raise ValueError("Last shape must be a multiple of 8")
    if np.any(self.shape % self.blocks):
        raise ValueError("Blocks do not evenly divide shape")
    self.shape_shifts = cuda.to_device(ncil.log2(self.shape))
    self.block_shifts = cuda.to_device(ncil.log2(self.blocks))
    self.spins = cuda.device_array((self.num_spins // 8,), np.uint8)
    self.spins[:] = np.random.randint(0, 256, size=self.num_spins >> 3,
                                      dtype=np.uint8)
    if coupling is None:
        coupling = np.empty(np.zeros(self.num_dim), dtype=np.float64)
    self.coupling_indices = cuda.to_device(
        np.vstack(np.where(coupling != 0.0)).T - (coupling.shape[0] // 2))
    self.coupling_constants = cuda.to_device(
        coupling[np.where(coupling != 0.0)])
    self.temperature = temperature
    self.field = field
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # Instantiate cuRAND PRNG
    prng = PRNG(PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_last = cuda.to_device(paths[:, 0])
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j])
        step(d_last, dt, c0, c1, d_normdist, out=d_paths)
        d_paths.copy_to_host(paths[:, j])
        d_last = d_paths