def matmulfast(A, B, C): x = hsa.get_global_id(0) y = hsa.get_global_id(1) tx = hsa.get_local_id(0) ty = hsa.get_local_id(1) sA = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32) sB = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32) if x >= C.shape[0] or y >= C.shape[1]: return tmp = 0 for i in range(gridsize): # preload sA[tx, ty] = A[x, ty + i * blocksize] sB[tx, ty] = B[tx + i * blocksize, y] # wait for preload to end hsa.barrier(1) # compute loop for j in range(blocksize): tmp += sA[tx, j] * sB[j, ty] # wait for compute to end hsa.barrier(1) C[x, y] = tmp
def matmulfast(A, B, C): x = hsa.get_global_id(0) y = hsa.get_global_id(1) tx = hsa.get_local_id(0) ty = hsa.get_local_id(1) sA = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32) sB = hsa.shared.array(shape=(blocksize, blocksize), dtype=float32) if x >= C.shape[0] or y >= C.shape[1]: return tmp = 0 for i in range(gridsize): # preload sA[tx, ty] = A[x, ty + i * blocksize] sB[tx, ty] = B[tx + i * blocksize, y] # wait for preload to end hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) # compute loop for j in range(blocksize): tmp += sA[tx, j] * sB[j, ty] # wait for compute to end hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE) C[x, y] = tmp
def matmul(A, B, C): i = hsa.get_global_id(0) j = hsa.get_global_id(1) if i >= C.shape[0] or j >= C.shape[1]: return tmp = 0 for k in range(A.shape[1]): tmp += A[i, k] * B[k, j] C[i, j] = tmp
def twice(A): i = hsa.get_global_id(0) d = A[i] # no argument defaults to global mem fence # which is the same for local in hsail hsa.barrier() A[i] = d * 2
def hsa_uni_kde(support, samples, bandwidth, pdf): i = hsa.get_global_id(0) if i < support.size: supp = support[i] total = 0 for j in range(samples.size): total += kernel((samples[j] - supp) / bandwidth) / bandwidth pdf[i] = total / samples.size
def scan_block(data, sums): sm_data = hsa.shared.array(128, dtype=intp) tid = hsa.get_local_id(0) gid = hsa.get_global_id(0) blkid = hsa.get_group_id(0) sm_data[tid] = data[gid] prefixsum = device_scan_generic(tid, sm_data) data[gid] = sm_data[tid] sums[blkid, tid] = prefixsum
def reverse_array(A): sm = hsa.shared.array(shape=blocksize, dtype=float32) i = hsa.get_global_id(0) # preload sm[i] = A[i] # barrier hsa.barrier(1) # local mem fence # write A[i] += sm[blocksize - 1 - i]
def reverse_array(A): sm = hsa.shared.array(shape=blocksize, dtype=float32) i = hsa.get_global_id(0) # preload sm[i] = A[i] # barrier hsa.barrier(hsa.CLK_LOCAL_MEM_FENCE) # local mem fence # write A[i] += sm[blocksize - 1 - i]
def scan_block(data, sums): sm_data = hsa.shared.array(128, dtype=intp) tid = hsa.get_local_id(0) gid = hsa.get_global_id(0) blkid = hsa.get_group_id(0) scanval, prefixsum = device_scan(tid, data[gid], sm_data, False) data[gid] = scanval sums[blkid, tid] = prefixsum
def udt(output): global_id = hsa.get_global_id(0) global_size = hsa.get_global_size(0) local_id = hsa.get_local_id(0) group_id = hsa.get_group_id(0) num_groups = hsa.get_num_groups(0) workdim = hsa.get_work_dim() local_size = hsa.get_local_size(0) output[0, group_id, local_id] = global_id output[1, group_id, local_id] = global_size output[2, group_id, local_id] = local_id output[3, group_id, local_id] = local_size output[4, group_id, local_id] = group_id output[5, group_id, local_id] = num_groups output[6, group_id, local_id] = workdim
def kernel_scatter(size, shift, shuffled, scanblocksum, localscan, shuffled_sorted, indices, indices_sorted, store_indices): tid = hsa.get_local_id(0) blkid = hsa.get_group_id(0) gid = hsa.get_global_id(0) if gid < size: curdata = uintp(shuffled[blkid, tid]) data_radix = uintp((curdata >> uintp(shift)) & uintp(RADIX_MINUS_1)) pos = scanblocksum[data_radix, blkid] + localscan[blkid, tid] shuffled_sorted[pos] = curdata if store_indices: indices_sorted[pos] = indices[gid]
def hsa_multi_kde(support, samples, bandwidths, pdf): """ Expects 2d arrays for samples and support: (num_observations, num_variables) """ nvar = support.shape[1] i = hsa.get_global_id(0) tid = hsa.get_local_id(0) valid = i < support.shape[0] sum = 0 sm_samples = hsa.shared.array(SAMPLES_SIZE, dtype=float64) sm_bandwidths = hsa.shared.array(MAX_NDIM, dtype=float64) sm_support = hsa.shared.array(SAMPLES_SIZE, dtype=float64) if valid: for k in range(nvar): sm_support[k, tid] = support[i, k] if tid < nvar: sm_bandwidths[tid] = bandwidths[tid] for base in range(0, samples.shape[0], BLOCKSIZE): loadcount = min(samples.shape[0] - base, BLOCKSIZE) hsa.barrier() # Preload samples tile if tid < loadcount: for k in range(nvar): sm_samples[k, tid] = samples[base + tid, k] hsa.barrier() # Compute on the tile if valid: for j in range(loadcount): prod = 1 for k in range(nvar): bw = sm_bandwidths[k] diff = sm_samples[k, j] - sm_support[k, tid] prod *= kernel(diff / bw) / bw sum += prod if valid: pdf[i] = sum / samples.shape[0]
def hsa_multi_kde(support, samples, bandwidths, pdf): """ Expects 2d arrays for samples and support: (num_observations, num_variables) """ nvar = support.shape[1] i = hsa.get_global_id(0) if i < support.shape[0]: sum = 0 for j in range(samples.shape[0]): prod = 1 for k in range(nvar): bw = bandwidths[k] diff = samples[j, k] - support[i, k] prod *= kernel(diff / bw) / bw sum += prod pdf[i] = sum / samples.shape[0]
def outer(A, B): i = hsa.get_global_id(0) if i < A.size: A[i] = inner(A[i], B[i])
def twice(A): i = hsa.get_global_id(0) d = A[i] hsa.barrier(1) # local mem fence A[i] = d * 2
def foo(inp, out): gid = hsa.get_global_id(0) temp = hsa.shared.array(2, dtype=intp) out[gid] = shuf_device_inclusive_scan(inp[gid], temp)
def outer(dst, src): tid = hsa.get_global_id(0) if tid < dst.size: dst[tid] = inner(src, tid)
def kernel_warp_reduce(inp, out): idx = hsa.get_global_id(0) val = inp[idx] out[idx] = wave_reduce(val)
def fn(dst, src): i = hsa.get_global_id(0) if i < dst.size: dst[i] = math_fn(src[i])
def foo(inp, out): gid = hsa.get_global_id(0) out[gid] = shuffle_up(inp[gid], 1)
def test_group_reduce(inp, out): gid = hsa.get_global_id(0) val = inp[gid] val = group_reduce_min_float64(val) out[gid] = val
def test_group_reduce(inp, out): gid = hsa.get_global_id(0) val = inp[gid] val = group_reduce_max_intp(val) out[gid] = val
def hsa_var_diff_kernel(diff, inputs, mean): gid = hsa.get_global_id(0) if gid < inputs.size: val = inputs[gid] x = val - mean diff[gid] = x * x
def copy_kernel(out, inp): i = hsa.get_global_id(0) if i < out.size: out[i] = inp[i]
def fn(dst, src1, src2): i = hsa.get_global_id(0) if i < dst.size: dst[i] = math_fn(src1[i], src2[i])
def udt_devfunc_caller(dst, src): i = hsa.get_global_id(0) if i < dst.size: dst[i] = devfn(src, i)
def foo(inp, out): gid = hsa.get_global_id(0) out[gid] = shuf_wave_inclusive_scan(inp[gid])
def copy_vector(dst, src): tid = hsa.get_global_id(0) if tid < dst.size: dst[tid] = src[tid]
def twice(A): i = hsa.get_global_id(0) d = A[i] hsa.barrier(hsa.CLK_LOCAL_MEM_FENCE) # local mem fence A[i] = d * 2
def udt(output): global_id = hsa.get_global_id(0) workdim = hsa.get_work_dim() output[global_id] = workdim
def assign_value(out, inp): i = hsa.get_global_id(0) if i < out.size: out[i] = inp
def udt(output): global_id = hsa.get_global_id(0) output[global_id] = global_id
def udt(output): global_id = hsa.get_global_id(0) local_id = hsa.get_local_id(0) output[global_id] = local_id
def udt(output): global_id = hsa.get_global_id(0) group_id = hsa.get_group_id(0) output[global_id] = group_id + 1
def udt2(output): g0 = hsa.get_global_id(0) g1 = hsa.get_global_id(1) output[g0, g1] = hsa.get_work_dim()