def matmulfast(A, B, C):
    """Multiply ``A`` by ``B`` into ``C`` tile by tile via shared arrays.

    Each work-item owns the output element ``C[x, y]`` addressed by its
    global ids.  For each of the ``gridsize`` tiles the work-group stages a
    ``blocksize`` x ``blocksize`` tile of ``A`` and of ``B`` in shared
    arrays, synchronises, and accumulates the partial products.

    Work-items that lie outside ``C`` do not return early: they stay in the
    loop so that every work-item of a group reaches each ``roc.barrier``
    call.  Tile slots that would fall outside ``A`` or ``B`` are filled with
    zero instead of being read, and the final store is skipped for
    work-items outside ``C``.

    NOTE(review): assumes ``gridsize * blocksize`` covers the shared
    dimension of ``A`` and ``B`` -- confirm against the launch code.
    """
    x = roc.get_global_id(0)
    y = roc.get_global_id(1)

    tx = roc.get_local_id(0)
    ty = roc.get_local_id(1)

    sA = roc.shared.array(shape=(blocksize, blocksize), dtype=float32)
    sB = roc.shared.array(shape=(blocksize, blocksize), dtype=float32)

    tmp = 0

    for i in range(gridsize):
        a_col = ty + i * blocksize
        b_row = tx + i * blocksize

        # Preload, padding with zero where the tile overhangs the inputs so
        # the padded slots contribute nothing to the accumulation below.
        if x < A.shape[0] and a_col < A.shape[1]:
            sA[tx, ty] = A[x, a_col]
        else:
            sA[tx, ty] = 0
        if b_row < B.shape[0] and y < B.shape[1]:
            sB[tx, ty] = B[b_row, y]
        else:
            sB[tx, ty] = 0

        # Wait for the preload to end.
        roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

        # Compute loop over the staged tile.
        for j in range(blocksize):
            tmp += sA[tx, j] * sB[j, ty]

        # Wait for the compute to end before the tiles are overwritten.
        roc.barrier(roc.CLK_GLOBAL_MEM_FENCE)

    # Only work-items that map to a real output element store a result.
    if x < C.shape[0] and y < C.shape[1]:
        C[x, y] = tmp
def matmul(A, B, C):
    """Store in ``C[i, j]`` the dot product of row ``i`` of ``A`` and
    column ``j`` of ``B``, one output element per work-item.

    Work-items whose global ids fall outside ``C`` do nothing.
    """
    row = roc.get_global_id(0)
    col = roc.get_global_id(1)

    if row < C.shape[0] and col < C.shape[1]:
        acc = 0
        for k in range(A.shape[1]):
            acc += A[row, k] * B[k, col]
        C[row, col] = acc
def twice(A):
    """Double the element of ``A`` at this work-item's global id, with a
    barrier between the read and the write."""
    gid = roc.get_global_id(0)
    value = A[gid]
    # Called without an argument the barrier defaults to a global mem
    # fence, which is the same as the local one in hsail.
    roc.barrier()
    A[gid] = value * 2
def scan_block(data, sums):
    """Run ``device_scan_generic`` over one work-group's slice of ``data``.

    Each work-item copies its element into a 128-entry shared array, the
    scan runs on that array, the scanned element is written back to
    ``data`` and the value returned by the scan is stored in
    ``sums[group, local_id]``.
    """
    scratch = roc.shared.array(128, dtype=intp)

    lid = roc.get_local_id(0)
    global_idx = roc.get_global_id(0)
    group = roc.get_group_id(0)

    # Stage this work-item's input in the shared array before scanning.
    scratch[lid] = data[global_idx]
    block_total = device_scan_generic(lid, scratch)

    data[global_idx] = scratch[lid]
    sums[group, lid] = block_total
def roc_uni_kde(support, samples, bandwidth, pdf):
    """Evaluate a one-dimensional kernel estimate at one support point.

    The work-item with global id ``i`` sums
    ``kernel((samples[j] - support[i]) / bandwidth) / bandwidth`` over all
    samples and stores the mean in ``pdf[i]``.  Work-items past the end of
    ``support`` do nothing.
    """
    idx = roc.get_global_id(0)
    if idx >= support.size:
        return

    point = support[idx]
    acc = 0
    for s in range(samples.size):
        acc += kernel((samples[s] - point) / bandwidth) / bandwidth
    pdf[idx] = acc / samples.size
def reverse_array(A):
    """Add to each ``A[i]`` the value that was at the mirrored index
    ``blocksize - 1 - i``, staging the original values in a shared array.

    NOTE(review): the shared array is indexed by the *global* id, so this
    presumably expects a single work-group of ``blocksize`` items --
    confirm against the launch configuration.
    """
    staged = roc.shared.array(shape=blocksize, dtype=float32)
    gid = roc.get_global_id(0)

    # Preload every element before any of them is modified.
    staged[gid] = A[gid]

    # Local mem fence: wait until the whole array has been staged.
    roc.barrier(roc.CLK_LOCAL_MEM_FENCE)

    # Accumulate the mirrored element into place.
    mirror = blocksize - 1 - gid
    A[gid] += staged[mirror]
def scan_block(data, sums):
    """Run ``device_scan`` over one work-group's slice of ``data``.

    Each work-item passes its element and a 128-entry shared array to
    ``device_scan``; the first returned value replaces the element in
    ``data`` and the second is stored in ``sums[group, local_id]``.
    """
    scratch = roc.shared.array(128, dtype=intp)

    lid = roc.get_local_id(0)
    global_idx = roc.get_global_id(0)
    group = roc.get_group_id(0)

    scanned, block_total = device_scan(lid, data[global_idx], scratch, False)

    data[global_idx] = scanned
    sums[group, lid] = block_total
def kernel_scatter(size, shift, shuffled, scanblocksum, localscan,
                   shuffled_sorted, indices, indices_sorted, store_indices):
    """Scatter each element of ``shuffled`` to its computed output slot.

    For a work-item with global id below ``size`` the element
    ``shuffled[group, local_id]`` is shifted right by ``shift`` and masked
    with ``RADIX_MINUS_1``; the destination is
    ``scanblocksum[masked, group] + localscan[group, local_id]``.  When
    ``store_indices`` is true the matching entry of ``indices`` is
    scattered to the same slot of ``indices_sorted``.
    """
    lid = roc.get_local_id(0)
    group = roc.get_group_id(0)
    global_idx = roc.get_global_id(0)

    if global_idx >= size:
        return

    value = uintp(shuffled[group, lid])
    shifted = value >> uintp(shift)
    bucket = uintp(shifted & uintp(RADIX_MINUS_1))
    dest = scanblocksum[bucket, group] + localscan[group, lid]

    shuffled_sorted[dest] = value
    if store_indices:
        indices_sorted[dest] = indices[global_idx]
def udt(output):
    """Record the work-item's indexing values in ``output``.

    Planes 0..6 of ``output[:, group_id, local_id]`` receive, in order:
    global id, global size, local id, local size, group id, number of
    groups and work dimension (all queried for dimension 0).
    """
    gid = roc.get_global_id(0)
    gsize = roc.get_global_size(0)
    lid = roc.get_local_id(0)
    grp = roc.get_group_id(0)
    ngroups = roc.get_num_groups(0)
    ndim = roc.get_work_dim()
    lsize = roc.get_local_size(0)

    output[0, grp, lid] = gid
    output[1, grp, lid] = gsize
    output[2, grp, lid] = lid
    output[3, grp, lid] = lsize
    output[4, grp, lid] = grp
    output[5, grp, lid] = ngroups
    output[6, grp, lid] = ndim
def roc_multi_kde(support, samples, bandwidths, pdf):
    """
    Multi-variable kernel estimate using shared-array tiles of samples.

    Expects 2d arrays for samples and support:
    (num_observations, num_variables)

    Each work-item handles one row of ``support``.  The samples are
    processed in tiles of ``BLOCKSIZE`` rows that the work-group loads
    into a shared array; for every staged sample the per-variable terms
    ``kernel(diff / bw) / bw`` are multiplied together and the products
    are summed.  The mean over all samples is stored in ``pdf[i]``.
    """
    n_vars = support.shape[1]
    n_samples = samples.shape[0]
    gid = roc.get_global_id(0)
    lid = roc.get_local_id(0)
    in_range = gid < support.shape[0]
    total = 0

    tile = roc.shared.array(SAMPLES_SIZE, dtype=float64)
    bw_cache = roc.shared.array(MAX_NDIM, dtype=float64)
    point = roc.shared.array(SAMPLES_SIZE, dtype=float64)

    # Stage this work-item's support row, indexed [variable, local id].
    if in_range:
        for v in range(n_vars):
            point[v, lid] = support[gid, v]

    # The first n_vars work-items of the group stage the bandwidths.
    if lid < n_vars:
        bw_cache[lid] = bandwidths[lid]

    for start in range(0, n_samples, BLOCKSIZE):
        count = min(n_samples - start, BLOCKSIZE)

        # Barrier before the tile is (re)written.  It sits outside the
        # `in_range` checks, so every work-item executes it.
        roc.barrier()

        # Preload samples tile
        if lid < count:
            for v in range(n_vars):
                tile[v, lid] = samples[start + lid, v]

        # Barrier before the tile is read.
        roc.barrier()

        # Compute on the tile
        if in_range:
            for s in range(count):
                term = 1
                for v in range(n_vars):
                    bw = bw_cache[v]
                    delta = tile[v, s] - point[v, lid]
                    term *= kernel(delta / bw) / bw
                total += term

    if in_range:
        pdf[gid] = total / n_samples
def roc_multi_kde(support, samples, bandwidths, pdf):
    """
    Multi-variable kernel estimate, one support row per work-item.

    Expects 2d arrays for samples and support:
    (num_observations, num_variables)

    For every sample the per-variable terms ``kernel(diff / bw) / bw`` are
    multiplied together; the products are summed and their mean over all
    samples is stored in ``pdf[i]``.  Work-items past the last support row
    do nothing.
    """
    n_vars = support.shape[1]
    gid = roc.get_global_id(0)
    if gid >= support.shape[0]:
        return

    total = 0
    for s in range(samples.shape[0]):
        term = 1
        for v in range(n_vars):
            bw = bandwidths[v]
            delta = samples[s, v] - support[gid, v]
            term *= kernel(delta / bw) / bw
        total += term
    pdf[gid] = total / samples.shape[0]
def udt(output):
    """Write the work dimension into ``output`` at the work-item's global
    id."""
    gid = roc.get_global_id(0)
    ndim = roc.get_work_dim()
    output[gid] = ndim
def test_group_reduce(inp, out):
    """Pass each work-item's input through ``group_reduce_max_intp`` and
    store the returned value at the same global index of ``out``."""
    idx = roc.get_global_id(0)
    reduced = group_reduce_max_intp(inp[idx])
    out[idx] = reduced
def outer(A, B):
    """Replace ``A[i]`` with ``inner(A[i], B[i])`` for every global id
    ``i`` inside ``A``."""
    idx = roc.get_global_id(0)
    if idx >= A.size:
        return
    A[idx] = inner(A[idx], B[idx])
def udt(output):
    """Write the work-item's local id into ``output`` at its global id."""
    gid = roc.get_global_id(0)
    lid = roc.get_local_id(0)
    output[gid] = lid
def udt2(output):
    """Write the work dimension into ``output`` at the work-item's
    two-dimensional global id."""
    row = roc.get_global_id(0)
    col = roc.get_global_id(1)
    ndim = roc.get_work_dim()
    output[row, col] = ndim
def add1_kernel(dst, src):
    """Set ``dst[i]`` to ``src[i] + 1`` for every global id ``i`` inside
    ``dst``."""
    idx = roc.get_global_id(0)
    if idx >= dst.size:
        return
    dst[idx] = src[idx] + 1
def fn(dst, src1, src2):
    """Apply the two-argument ``math_fn`` elementwise:
    ``dst[i] = math_fn(src1[i], src2[i])`` for global ids inside ``dst``."""
    idx = roc.get_global_id(0)
    if idx >= dst.size:
        return
    lhs = src1[idx]
    rhs = src2[idx]
    dst[idx] = math_fn(lhs, rhs)
def fn(dst, src):
    """Apply the one-argument ``math_fn`` elementwise:
    ``dst[i] = math_fn(src[i])`` for global ids inside ``dst``."""
    idx = roc.get_global_id(0)
    if idx >= dst.size:
        return
    dst[idx] = math_fn(src[idx])
def foo(inp, out):
    """Store ``shuffle_up(inp[gid], 1)`` in ``out`` at the work-item's
    global id."""
    idx = roc.get_global_id(0)
    shifted = shuffle_up(inp[idx], 1)
    out[idx] = shifted
def copy_vector(dst, src):
    """Copy ``src`` into ``dst`` element by element, one element per
    work-item; global ids past the end of ``dst`` are ignored."""
    idx = roc.get_global_id(0)
    if idx >= dst.size:
        return
    dst[idx] = src[idx]
def udt(output):
    """Write each work-item's global id into ``output`` at that same
    index."""
    gid = roc.get_global_id(0)
    output[gid] = gid
def copy_kernel(dst, src):
    """Copy ``src[i]`` to ``dst[i]`` for every global id ``i`` inside
    ``dst``."""
    idx = roc.get_global_id(0)
    if idx >= dst.size:
        return
    dst[idx] = src[idx]
def udt(output):
    """Write the work-item's group id plus one into ``output`` at its
    global id."""
    gid = roc.get_global_id(0)
    grp = roc.get_group_id(0)
    output[gid] = grp + 1
def copy_kernel(out, inp):
    """Copy ``inp[i]`` to ``out[i]`` for every global id ``i`` inside
    ``out``."""
    idx = roc.get_global_id(0)
    if idx >= out.size:
        return
    out[idx] = inp[idx]
def foo(inp, out):
    """Store the result of ``shuf_device_inclusive_scan`` for this
    work-item's input, using a 2-entry shared int32 array as scratch."""
    idx = roc.get_global_id(0)
    scratch = roc.shared.array(2, dtype=int32)
    scanned = shuf_device_inclusive_scan(inp[idx], scratch)
    out[idx] = scanned
def assign_value(out, inp):
    """Assign the value ``inp`` to ``out[i]`` for every global id ``i``
    inside ``out``."""
    idx = roc.get_global_id(0)
    if idx >= out.size:
        return
    out[idx] = inp
def udt_devfunc_caller(dst, src):
    """Store ``devfn(src, i)`` in ``dst[i]`` for every global id ``i``
    inside ``dst``."""
    idx = roc.get_global_id(0)
    if idx >= dst.size:
        return
    dst[idx] = devfn(src, idx)
def test_group_reduce(inp, out):
    """Pass each work-item's input through ``group_reduce_min_float64``
    and store the returned value at the same global index of ``out``."""
    idx = roc.get_global_id(0)
    reduced = group_reduce_min_float64(inp[idx])
    out[idx] = reduced
def twice(A):
    """Double the element of ``A`` at this work-item's global id, with a
    local-mem-fence barrier between the read and the write."""
    gid = roc.get_global_id(0)
    value = A[gid]
    # Local mem fence.
    roc.barrier(roc.CLK_LOCAL_MEM_FENCE)
    A[gid] = value * 2
def foo(inp, out):
    """Store ``shuf_wave_inclusive_scan_int32(inp[gid])`` in ``out`` at
    the work-item's global id."""
    idx = roc.get_global_id(0)
    scanned = shuf_wave_inclusive_scan_int32(inp[idx])
    out[idx] = scanned
def outer(dst, src):
    """Store ``inner(src, i)`` in ``dst[i]`` for every global id ``i``
    inside ``dst``."""
    idx = roc.get_global_id(0)
    if idx >= dst.size:
        return
    dst[idx] = inner(src, idx)