def tasklets_multioutput(A: dace.float32[1], B: dace.float32[1], C: dace.float32[1]):
    """Compute C[0] = sqrt(A[0]) * log(B[0] + 1) * B[0] using three explicit tasklets.

    Exercises a tasklet with multiple outputs (the second tasklet writes both
    tmp_b and tmp_d), with results fused in a final multiply tasklet.
    """
    # Scalar transients carrying intermediate values between tasklets.
    tmp_a = dace.define_local_scalar(dace.float32)
    tmp_b = dace.define_local_scalar(dace.float32)
    tmp_d = dace.define_local_scalar(dace.float32)
    # tmp_a = sqrt(A[0])
    with dace.tasklet:
        a << A[0]
        a_out >> tmp_a
        a_out = sqrt(a)
    # Multi-output tasklet: tmp_b = log(B[0] + 1), tmp_d = B[0]
    with dace.tasklet:
        b << B[0]
        b_out >> tmp_b
        d_out >> tmp_d
        b_out = log(b + 1)
        d_out = b
    # C[0] = tmp_a * tmp_b * tmp_d
    with dace.tasklet:
        a << tmp_a
        b << tmp_b
        d << tmp_d
        c >> C[0]
        c = a * b * d
def compute_row(i):
    """Compute one output row of a CSR sparse matrix-vector product.

    Reads the row extent [rowptr, rowend) from the CSR row-pointer array and
    accumulates A_val[j] * x[A_col[j]] into b[i] with a sum-reduction memlet.
    Relies on enclosing-scope arrays A_row, A_val, A_col, x, b — assumed to be
    the CSR representation of a matrix and the dense input/output vectors
    (TODO confirm against the enclosing program).
    """
    # Row extent for CSR row i, read dynamically via memlets.
    rowptr = dace.define_local_scalar(dace.uint32)
    rowend = dace.define_local_scalar(dace.uint32)
    rowptr << A_row[i]
    rowend << A_row[i + 1]

    # Dynamic-range map over the nonzeros of row i.
    @dace.map(_[rowptr:rowend])
    def compute(j):
        a << A_val[j]
        in_x << x[A_col[j]]
        # Write-conflict resolution: sum-reduce partial products into b[i].
        out >> b(1, lambda x, y: x + y, 0)[i]
        out = a * in_x
def nccl_send_recv():
    """Pairwise NCCL send/recv between GPU 0 and GPU 1.

    Each GPU fills a 2-element send buffer with its own id, exchanges it with
    its peer (0 <-> 1), and gathers the received buffers through pinned host
    memory into `out`. Relies on module-level `num_gpus` and `dtype`
    (presumably num_gpus == 2 for this pairing — TODO confirm at call site).
    """
    out = dace.ndarray([num_gpus, 2], dtype)
    # Pinned staging buffer for device-to-host copies.
    pinned_out = dace.ndarray([num_gpus, 2], dtype, storage=dace.StorageType.CPU_Pinned)
    # One map iteration per GPU; each iteration runs on its own device.
    for gpu_id in dace.map[0:num_gpus]:
        # Transients
        send_buffer = dace.ndarray([2], dtype, storage=dace.StorageType.GPU_Global)
        recv_buffer = dace.ndarray([2], dtype, storage=dace.StorageType.GPU_Global)
        # Init transients
        for i in dace.map[0:2]:
            send_buffer[i] = gpu_id
        # Handle used to group the send/recv into one NCCL group call.
        group_handle = dace.define_local_scalar(
            dace.int32, storage=dace.StorageType.GPU_Global)
        if gpu_id == 0:
            dace.comm.nccl.Send(send_buffer, 1, group_handle=group_handle)
            dace.comm.nccl.Recv(recv_buffer, 1, group_handle=group_handle)
        else:
            dace.comm.nccl.Send(send_buffer, 0, group_handle=group_handle)
            dace.comm.nccl.Recv(recv_buffer, 0, group_handle=group_handle)
        pinned_out[gpu_id, :] = recv_buffer[:]
    out[:] = pinned_out[:]
    return out
def testprog7(A: dace.float64[20, 20]):
    """Read A[1, 1] into a local scalar and add it to every element of A in place."""
    # Scalar transient holding the (truncated-to-int64) value of A[1, 1].
    j = dace.define_local_scalar(dace.int64)
    with dace.tasklet:
        inp << A[1, 1]
        out >> j
        out = inp
    # Broadcast-add the scalar to the whole array.
    A[:] += j
def comp_all(j: _[0:N], i: _[0:M]):
    """Per-(j, i) body of a SYRK/SYR2K-style update (map over j in [0,N), i in [0,M)).

    Accumulates alpha * B[i, j] * A[i, k] into column entries C[k, j] while
    simultaneously reducing B[k, j] * A[i, k] into temp2, then combines:
        C[i, j] = beta * C[i, j] + alpha * B[i, j] * A[i, i] + alpha * temp2
    Relies on enclosing-scope symbols N, M, datatype and arrays/scalars
    A, B, C, alpha, beta.
    """
    # Per-iteration accumulator for the inner-product term.
    temp2 = dace.define_local_scalar(datatype)

    # Zero the accumulator before the reduction map.
    @dace.tasklet
    def reset_tmp():
        tmp >> temp2
        tmp = 0

    # Reduce over k in [0, i): updates C[k, j] and accumulates into temp2.
    @dace.map
    def comp_t2(k: _[0:i]):
        ialpha << alpha
        ia << A[i, k]
        ibi << B[i, j]
        ibk << B[k, j]
        # Sum-reduction memlets (write-conflict resolution).
        oc >> C(1, lambda a, b: a + b)[k, j]
        ot2 >> temp2(1, lambda a, b: a + b)
        oc = ialpha * ibi * ia
        ot2 = ibk * ia

    # Final combination for the diagonal/output element C[i, j].
    @dace.tasklet
    def comp_rest():
        ibeta << beta
        ib << B[i, j]
        iadiag << A[i, i]
        ialpha << alpha
        it2 << temp2
        ic << C[i, j]
        oc >> C[i, j]
        oc = ibeta * ic + ialpha * ib * iadiag + ialpha * it2
def scaltest(A: dace.float64[20, 20]):
    """Return A[1, 1] + 5 via a scalar transient written inside a degenerate map.

    The single-iteration map (dace.map[0:1]) exercises writing a scalar from
    within a map scope and returning it.
    """
    scal = dace.define_local_scalar(dace.float64)
    for _ in dace.map[0:1]:
        with dace.tasklet:
            inp << A[1, 1]
            out >> scal
            out = inp + 5
    return scal
def exchange(arr: d_float[lNy + 2, N]):
    """NCCL halo exchange of the first/last interior rows of a locally extended domain.

    Sends row 1 north / row -2 south and receives into ghost rows -1 / 0.
    Relies on enclosing-scope symbols d_float, d_int, lNy, N and the peer ranks
    north_neighbor / south_neighbor. Two group handles pair each Send with its
    matching Recv into one NCCL group call.
    """
    group_handle0 = dace.define_local_scalar(d_int)
    group_handle1 = dace.define_local_scalar(d_int)
    # Send first interior row north; the paired Recv fills the south ghost row.
    # recv North
    dace.comm.nccl.Send(arr[1], peer=north_neighbor, group_handle=group_handle0)
    # recv South
    dace.comm.nccl.Recv(arr[-1], peer=south_neighbor, group_handle=group_handle0)
    # Send last interior row south; the paired Recv fills the north ghost row.
    # send South
    dace.comm.nccl.Send(arr[-2], peer=south_neighbor, group_handle=group_handle1)
    # recv North
    dace.comm.nccl.Recv(arr[0], peer=north_neighbor, group_handle=group_handle1)
def keyword_while(A: dace.float32[N], B: dace.float32[N]):
    """Copy A into B element-by-element using while/continue/break.

    Test fixture for loop-keyword support: the `+ i - i` term cancels
    numerically but forces the loop variable into the dataflow, and the
    continue/break pair replaces a plain loop condition on purpose.
    """
    i = dace.define_local_scalar(dtype=dace.int32)
    i = 0
    while True:
        B[i] = A[i] + i - i
        i += 1
        if i < N:
            continue
        else:
            break
def program(A: dace.float32[N], B: dace.float32[N]):
    """Broadcast A[0] into every element of B via a per-iteration scalar transient.

    Note the read is A[0] (not A[i]), so all iterations write the same value.
    """
    for i in dace.map[0:N]:
        # Scalar transient local to each map iteration.
        scal = dace.define_local_scalar(dace.float32)
        with dace.tasklet:
            a << A[0]
            x_out >> scal
            x_out = a
        with dace.tasklet:
            x_in << scal
            b >> B[i]
            b = x_in
def program(A: dace.float32[N], B: dace.float32[N]):
    """Copy A into B element-wise through a per-iteration scalar transient.

    Unlike the A[0]-broadcast variant, each iteration reads its own A[i];
    under vectorization the transient is expected to become a vector.
    """
    for i in dace.map[0:N]:
        scal = dace.define_local_scalar(dace.float32)
        with dace.tasklet:
            a << A[i]
            x_out >> scal
            x_out = a  # x_out and scal should be a vector
        with dace.tasklet:
            x_in << scal
            b >> B[i]
            b = x_in
def tasklets_only_reuse(A: dace.float32[1], C: dace.float32[1]):
    """Compute C[0] = sqrt(A[0]) * log(A[0] + 1) with tasklets that reuse input A.

    Exercises two tasklets reading the same source container (A[0]) and a
    third tasklet combining both intermediates.
    """
    tmp_a = dace.define_local_scalar(dace.float32)
    tmp_b = dace.define_local_scalar(dace.float32)
    # tmp_a = sqrt(A[0])
    with dace.tasklet:
        a << A[0]
        a_out >> tmp_a
        a_out = sqrt(a)
    # tmp_b = log(A[0] + 1) -- same input container, different connector target.
    with dace.tasklet:
        a << A[0]
        a_out >> tmp_b
        a_out = log(a + 1)
    # C[0] = tmp_a * tmp_b
    with dace.tasklet:
        a << tmp_a
        b << tmp_b
        c >> C[0]
        c = a * b
def keyword_return(A: dace.float32[N]):
    """Copy A into a local array B with a while/continue/break loop and return it.

    Test fixture for the `return` keyword combined with loop keywords; the
    `+ i - i` term cancels numerically but pulls the loop index into the
    dataflow on purpose.
    """
    i = dace.define_local_scalar(dtype=dace.int32)
    i = 0
    # Local output array returned to the caller.
    B = dace.define_local((N, ), dtype=dace.float32)
    while True:
        B[i] = A[i] + i - i
        i += 1
        if i < N:
            continue
        else:
            break
    return B
def transients(A: dace.float32[10]):
    """Filter elements of A that are >= 0.5 through a stream; count them.

    Returns (oscalar, oarray): the number of passing elements and an array
    whose leading entries are the passing values, drained from the stream.
    Exercises stream, scalar, and array transients together.
    """
    # Stream buffering the filtered values; capacity 10.
    ostream = dace.define_stream(dace.float32, 10)
    oscalar = dace.define_local_scalar(dace.int32)
    oarray = dace.define_local([10], dace.float32)
    oarray[:] = 0
    oscalar = 0
    for i in dace.map[0:10]:
        if A[i] >= 0.5:
            # Dynamic-volume push onto the stream.
            A[i] >> ostream(-1)
            oscalar += 1
    # Drain the stream into the output array.
    ostream >> oarray
    return oscalar, oarray
def transients(A: dace.float32[n]):
    """Symbolic-size variant of the stream-filter test: count elements of A >= 0.5.

    Same as the fixed-size version but with symbolic length n, and the counter
    increment expressed as an explicit sum-reduction tasklet rather than `+= 1`.
    """
    ostream = dace.define_stream(dace.float32, n)
    oscalar = dace.define_local_scalar(dace.int32)
    oarray = dace.define_local([n], dace.float32)
    oarray[:] = 0
    oscalar = 0
    for i in dace.map[0:n]:
        if A[i] >= 0.5:
            # Dynamic-volume push onto the stream.
            A[i] >> ostream(-1)
            # Conflict-resolved counter increment (sum reduction).
            with dace.tasklet:
                out >> oscalar(1, lambda a, b: a + b)
                out = 1
    # Drain the stream into the output array.
    ostream >> oarray
    return oscalar, oarray
def program(A: dace.float32[N], B: dace.float32[N]):
    """Compute B[i] = arr[0] * A[0] via a tasklet with mixed array/scalar outputs.

    The first tasklet writes the same value to both an array element (arr[i])
    and a scalar transient; the second multiplies arr[0] by the scalar.
    Exercises vectorization where one output must broadcast.
    """
    for i in dace.map[0:N]:
        arr = dace.define_local(N, dace.float32)
        scal = dace.define_local_scalar(dace.float32)
        with dace.tasklet:
            a << A[0]
            x_out >> arr[i]
            y_out >> scal
            x_out = a  # x_out looks like a scalar, but must be a vector (broadcast within Tasklet)
            y_out = a
        with dace.tasklet:
            x_in << arr[0]
            y_in << scal
            b >> B[i]
            b = x_in * y_in
def shiloach_vishkin(EL, comp):
    """Shiloach-Vishkin connected components (decorator-style tasklets/maps).

    EL is an edge list (each edge expected in both directions, hence the
    0:2*E map range); comp holds each vertex's component label, initialized to
    its own id. Iterates hook + shortcut phases until no hook fires.
    Relies on enclosing-scope symbols V (vertex count) and E (edge count).
    """
    # Convergence flag: set by any successful hook, cleared each round.
    flag_hook = dace.define_local_scalar(dace.int32)

    @dace.tasklet
    def initflag():
        out >> flag_hook
        out = 1

    # Each vertex starts in its own component.
    @dace.map(_[0:V])
    def init(v):
        out >> comp[v]
        out = v

    while flag_hook:

        @dace.tasklet
        def resetflag():
            out >> flag_hook
            out = 0

        # Hook phase: attach the root of the larger endpoint to the smaller.
        @dace.map(_[0:2 * E])
        def hook(e):
            u << EL[e, 0]
            v << EL[e, 1]
            parents << comp(3)[:]
            out >> comp(1)[:]
            f >> flag_hook(-1)
            pu = parents[u]
            pv = parents[v]
            ppv = parents[pv]
            # Only hook roots (pv == ppv) and only in one direction (pu < pv).
            if pu < pv and pv == ppv:
                out[ppv] = pu
                f = 1

        # Multi-jump version
        @dace.map(_[0:V])
        def shortcut(v):
            inp << comp(-1)[0:v + 1]
            out >> comp(-1)[v]
            # Pointer-jump until v's label reaches a fixed point (a root).
            p = inp[v]
            pp = inp[p]
            while p != pp:
                out = pp
                p = pp
                pp = inp[p]
def testprog1(A: dace.float32[20, 20], scal: dace.float32):
    """Control-flow test fixture: a while loop over a one-element array counter.

    The double assignment to i, the unused transients tmp/m, and the reduction
    tasklet exist to exercise frontend features (state machine generation,
    scalar promotion), not to compute a meaningful result.
    """
    tmp = dace.ndarray([20, 20], dtype=dace.float32)
    m = dace.define_local_scalar(dace.float32)
    # Loop counter stored as a 1-element array rather than a scalar.
    j = dace.ndarray([1], dtype=dace.int64)
    i = 1
    i = 2
    j[:] = 0
    while j[0] < 5:
        tmp[:] = A + j
        for k in dace.map[0:20]:
            # Sum-reduce the input scalar into m across the map.
            with dace.tasklet:
                inp << scal
                out >> m(1, lambda a, b: a + b)
                out = inp
        j += 1
        i += j
def shiloach_vishkin(EL, comp):
    """Shiloach-Vishkin connected components (with-statement tasklet style).

    Same algorithm as the decorator-style variant: EL is an edge list (each
    edge expected in both directions, hence 0:2*E), comp holds per-vertex
    component labels. Hook + shortcut phases repeat until no hook fires.
    Relies on enclosing-scope symbols V and E.
    """
    # Convergence flag: set by any successful hook, cleared each round.
    flag_hook = dace.define_local_scalar(dace.int32)
    with dace.tasklet:
        out >> flag_hook
        out = 1
    # Each vertex starts in its own component.
    for v in dace.map[0:V]:
        with dace.tasklet:
            out >> comp[v]
            out = v
    while flag_hook:
        with dace.tasklet:
            out >> flag_hook
            out = 0
        # Hook phase: attach the root of the larger endpoint to the smaller.
        for e in dace.map[0:2 * E]:
            with dace.tasklet:
                u << EL[e, 0]
                v << EL[e, 1]
                parents << comp(3)[:]
                out >> comp(1)[:]
                f >> flag_hook(-1)
                pu = parents[u]
                pv = parents[v]
                ppv = parents[pv]
                # Only hook roots (pv == ppv) and only in one direction (pu < pv).
                if pu < pv and pv == ppv:
                    out[ppv] = pu
                    f = 1

        # Multi-jump version
        for v in dace.map[0:V]:
            with dace.tasklet:
                inp << comp(-1)[0:v + 1]
                out >> comp(-1)[v]
                # Pointer-jump until v's label reaches a fixed point (a root).
                p = inp[v]
                pp = inp[p]
                while p != pp:
                    out = pp
                    p = pp
                    pp = inp[p]
def nccl_ring_exchange():
    """Ring all-reduce-style exchange across num_gpus GPUs using NCCL send/recv.

    Each GPU repeatedly forwards its buffer to (gpu_id + 1) % num_gpus and
    receives from (gpu_id - 1) % num_gpus, accumulating received values into
    ring_sum over num_gpus rounds, then stages the result through pinned host
    memory. Relies on module-level num_gpus and dtype.
    """
    out = dace.ndarray([num_gpus], dtype)
    # Pinned staging buffer for device-to-host copies.
    pinned_out = dace.ndarray([num_gpus, 2], dtype, storage=dace.StorageType.CPU_Pinned)
    # One map iteration per GPU; each iteration runs on its own device.
    for gpu_id in dace.map[0:num_gpus]:
        # Transients
        send_buffer = dace.ndarray([2], dtype, storage=dace.StorageType.GPU_Global)
        recv_buffer = dace.ndarray([2], dtype, storage=dace.StorageType.GPU_Global)
        ring_sum = dace.ndarray([2], dtype, storage=dace.StorageType.GPU_Global)
        # Init transients
        for i in dace.map[0:2]:
            send_buffer[i] = gpu_id
            ring_sum[i] = 0
        # Ring Exchange
        group_handle = dace.define_local_scalar(dace.int32)
        for i in range(num_gpus):
            # Branches spell out the ring wrap-around explicitly:
            # predecessor of 0 is num_gpus - 1, successor of num_gpus - 1 is 0.
            if gpu_id == 0:
                dace.comm.nccl.Send(send_buffer, gpu_id + 1, group_handle)
                dace.comm.nccl.Recv(recv_buffer, num_gpus - 1, group_handle)
            elif gpu_id == num_gpus - 1:
                dace.comm.nccl.Send(send_buffer, 0, group_handle)
                dace.comm.nccl.Recv(recv_buffer, gpu_id - 1, group_handle)
            else:
                dace.comm.nccl.Send(send_buffer, gpu_id + 1, group_handle)
                dace.comm.nccl.Recv(recv_buffer, gpu_id - 1, group_handle)
            # Accumulate what was received and forward it next round.
            for i in dace.map[0:2]:
                ring_sum[i] = recv_buffer[i] + ring_sum[i]
                send_buffer[i] = recv_buffer[i]
        pinned_out[gpu_id, :] = ring_sum[:]
    # Only the first column of each per-GPU pair is returned.
    out[:] = pinned_out[:, 0]
    return out
def computecol(i: _[0:M]):
    """Per-row body of a triangular update on column j (map over i in [0, M)).

    Accumulates tmp = sum over k in (i, M) of A[k, i] * B[k, j], then sets
    B[i, j] = alpha * (B[i, j] + tmp). Relies on enclosing-scope symbols
    M, datatype, j and arrays/scalars A, B, alpha — presumably part of a
    TRMM-like kernel (TODO confirm against enclosing program).
    """
    # Per-iteration accumulator for the strictly-lower-triangular sum.
    tmp = dace.define_local_scalar(datatype)

    # Zero the accumulator before the reduction map.
    @dace.tasklet
    def reset_tmp():
        out >> tmp
        out = 0

    # Reduce over rows below the diagonal: k in [i + 1, M).
    @dace.map
    def compute_elem(k: _[i + 1:M]):
        ia << A[k, i]
        ib << B[k, j]
        # Sum-reduction memlet (write-conflict resolution).
        ob >> tmp(1, lambda a, b: a + b)
        ob = ia * ib

    # Combine the accumulated sum with the current element and scale.
    @dace.tasklet
    def mult():
        ib << B[i, j]
        ialpha << alpha
        itmp << tmp
        ob >> B[i, j]
        ob = ialpha * (ib + itmp)
def perscal(a: dace.float64[20]):
    """Return a[1] + 1 through a scalar transient with persistent allocation lifetime.

    Exercises AllocationLifetime.Persistent on a define_local_scalar transient.
    """
    tmp = dace.define_local_scalar(
        dace.float64, lifetime=dace.AllocationLifetime.Persistent)
    tmp[:] = a[1] + 1
    return tmp
def cast_scalar_on_gpu(inp: dace.float64):
    """Cast a float64 scalar via ONNXCast and return it as a 1-element float32 array.

    Relies on module-level `donnx` (DaCe ONNX ops) and `to_int` (the ONNX cast
    target type code — presumably matching float32; TODO confirm at call site).
    """
    # Scalar transient receiving the cast result.
    output = dace.define_local_scalar(dace.float32)
    donnx.ONNXCast(input=inp, output=output, to=to_int)
    # Wrap the scalar into a shape-[1] array (unsqueeze).
    output_unsqueeze = dace.define_local([1], dace.float32)
    output_unsqueeze[0] = output
    return output_unsqueeze
def j_o_un(
        A: d_float[Ny, N],
        B: d_float[Ny, N],
        # lAs: d_float[size, lNy + 2, N],
        # lAe: d_float[size, lNy + 2, N],
        # lBb: d_float[size, lNy + 2, N],
        # lBe: d_float[size, lNy + 2, N],
        # lBi: d_float[size, lNy + 2, N],
):
    """Distributed 2D Jacobi (jacobi_2d) over `size` ranks with NCCL halo exchange.

    Each rank owns lNy rows plus two ghost rows. Per time step: update the
    boundary rows of the output array first, exchange halos (overlapping with
    the interior update), then update the interior — alternating A -> B and
    B -> A. Commented-out lines are debug outputs of intermediate local
    states. Relies on module-level d_float, d_int, Ny, N, lNy, size, TSTEPS.
    """
    for rank in dace.map[0:size]:
        # Local extended domain
        lA = np.zeros((lNy + 2, N), dtype=A.dtype)
        lB = np.zeros((lNy + 2, N), dtype=B.dtype)
        # Periodic ring neighbors along the row (y) dimension.
        north_neighbor = ((rank - 1) % size)
        south_neighbor = ((rank + 1) % size)
        # Scatter this rank's row block into the interior of the extended domain.
        lA[1:-1, :] = A[rank * lNy:(rank + 1) * lNy, :]
        # lB[1:-1, :] = B[rank * lNy:(rank + 1) * lNy, :]
        # lAs[rank] = lA[:]
        # One group handle per Send/Recv pair, so each pair forms its own
        # NCCL group call.
        group_handle0 = dace.define_local_scalar(d_int)
        group_handle1 = dace.define_local_scalar(d_int)
        group_handle2 = dace.define_local_scalar(d_int)
        group_handle3 = dace.define_local_scalar(d_int)
        group_handle4 = dace.define_local_scalar(d_int)
        group_handle5 = dace.define_local_scalar(d_int)
        # Initial halo exchange of lA before the first step.
        dace.comm.nccl.Send(lA[1], peer=north_neighbor, group_handle=group_handle0)
        dace.comm.nccl.Recv(lA[-1], peer=south_neighbor, group_handle=group_handle0)
        dace.comm.nccl.Send(lA[-2], peer=south_neighbor, group_handle=group_handle1)
        dace.comm.nccl.Recv(lA[0], peer=north_neighbor, group_handle=group_handle1)
        for t in range(TSTEPS):
            # comp_boundary(lA, lB)
            # North boundary
            lB[1, 1:-1] = 0.2 * (lA[1, 1:-1] + lA[1, :-2] + lA[1, 2:] + lA[2, 1:-1] + lA[0, 1:-1])
            # South boundary
            lB[-2, 1:-1] = 0.2 * (lA[-2, 1:-1] + lA[-2, :-2] + lA[-2, 2:] + lA[-1, 1:-1] + lA[-3, 1:-1])
            # lBb[rank] = lB[:]
            # Halo exchange of the freshly computed lB boundary rows.
            # recv North
            dace.comm.nccl.Send(lB[1], peer=north_neighbor, group_handle=group_handle2)
            # recv South
            dace.comm.nccl.Recv(lB[-1], peer=south_neighbor, group_handle=group_handle2)
            # send South
            dace.comm.nccl.Send(lB[-2], peer=south_neighbor, group_handle=group_handle3)
            # recv North
            dace.comm.nccl.Recv(lB[0], peer=north_neighbor, group_handle=group_handle3)
            # exchange(lB, rank=rank, size=size)
            # lBe[rank] = lB[:]
            # comp_interior(lA, lB)
            lB[2:-2, 1:-1] = 0.2 * (lA[2:-2, 1:-1] + lA[2:-2, :-2] + lA[2:-2, 2:] + lA[3:-1, 1:-1] + lA[1:-3, 1:-1])
            # lBi[rank] = lB[:]
            # Second half-step: B -> A, same boundary/exchange/interior pattern.
            # comp_boundary(lB, lA)
            # North boundary
            lA[1, 1:-1] = 0.2 * (lB[1, 1:-1] + lB[1, :-2] + lB[1, 2:] + lB[2, 1:-1] + lB[0, 1:-1])
            # South boundary
            lA[-2, 1:-1] = 0.2 * (lB[-2, 1:-1] + lB[-2, :-2] + lB[-2, 2:] + lB[-1, 1:-1] + lB[-3, 1:-1])
            # if t < TSTEPS - 1:
            # recv North
            dace.comm.nccl.Send(lA[1], peer=north_neighbor, group_handle=group_handle4)
            # recv South
            dace.comm.nccl.Recv(lA[-1], peer=south_neighbor, group_handle=group_handle4)
            # send South
            dace.comm.nccl.Send(lA[-2], peer=south_neighbor, group_handle=group_handle5)
            # recv North
            dace.comm.nccl.Recv(lA[0], peer=north_neighbor, group_handle=group_handle5)
            # comp_interior(lB, lA)
            lA[2:-2, 1:-1] = 0.2 * (lB[2:-2, 1:-1] + lB[2:-2, :-2] + lB[2:-2, 2:] + lB[3:-1, 1:-1] + lB[1:-3, 1:-1])
        # Gather the interior rows back into the global arrays.
        A[rank * lNy:(rank + 1) * lNy] = lA[1:-1, :]
        B[rank * lNy:(rank + 1) * lNy] = lB[1:-1, :]
def inner_view_forwarding(inp: dace.float64[9], bias: dace.float64[3], target_shape: dace.int64[2]):
    """Invoke a nested SDFG that writes a scalar result, then return result + 1.

    `sdfg` is a module-level nested SDFG/program (not visible here); its
    `result` output is forwarded through a local scalar transient.
    """
    result = dace.define_local_scalar(dace.float64)
    sdfg(inp=inp, bias=bias, target_shape=target_shape, result=result)
    return result + 1