예제 #1
0
    def tasklets_multioutput(A: dace.float32[1], B: dace.float32[1],
                             C: dace.float32[1]):
        # Computes C[0] = sqrt(A[0]) * log(B[0] + 1) * B[0] with three tasklets
        # that pass intermediate values through local scalars. The second
        # tasklet has two outputs (tmp_b and tmp_d).
        tmp_a = dace.define_local_scalar(dace.float32)
        tmp_b = dace.define_local_scalar(dace.float32)
        tmp_d = dace.define_local_scalar(dace.float32)

        # Tasklet 1: tmp_a = sqrt(A[0]).
        with dace.tasklet:
            a << A[0]
            a_out >> tmp_a

            a_out = sqrt(a)

        # Tasklet 2: tmp_b = log(B[0] + 1) and tmp_d = B[0] (plain copy).
        with dace.tasklet:
            b << B[0]
            b_out >> tmp_b
            d_out >> tmp_d

            b_out = log(b + 1)
            d_out = b

        # Tasklet 3: multiply the three intermediates into C[0].
        with dace.tasklet:
            a << tmp_a
            b << tmp_b
            d << tmp_d
            c >> C[0]
            c = a * b * d
예제 #2
0
파일: spmv.py 프로젝트: cpenny42/dace
    def compute_row(i):
        # Accumulates into b[i] the sum over j in [A_row[i], A_row[i + 1]) of
        # A_val[j] * x[A_col[j]].
        # A_row, A_col, A_val, x and b come from the enclosing scope, which is
        # not visible here; the access pattern looks like a CSR sparse
        # matrix-vector product — TODO confirm against the caller.
        rowptr = dace.define_local_scalar(dace.uint32)
        rowend = dace.define_local_scalar(dace.uint32)
        # Load the half-open range of entries belonging to row i.
        rowptr << A_row[i]
        rowend << A_row[i + 1]

        @dace.map(_[rowptr:rowend])
        def compute(j):
            a << A_val[j]
            # Indirect read: the column index selects the element of x.
            in_x << x[A_col[j]]
            # Output memlet with a summing conflict-resolution lambda; the
            # arguments 1 and 0 are presumably the access count and the
            # identity value — verify against the DaCe version in use.
            out >> b(1, lambda x, y: x + y, 0)[i]

            out = a * in_x
예제 #3
0
def nccl_send_recv():
    # For every GPU in a map over [0, num_gpus): fill a 2-element send buffer
    # with the GPU id, exchange it with a peer through NCCL Send/Recv, and
    # store the received buffer in row gpu_id of a pinned host array.
    # Returns a [num_gpus, 2] array copied from that pinned array.
    # num_gpus and dtype are defined outside this block.
    out = dace.ndarray([num_gpus, 2], dtype)
    pinned_out = dace.ndarray([num_gpus, 2],
                              dtype,
                              storage=dace.StorageType.CPU_Pinned)
    for gpu_id in dace.map[0:num_gpus]:
        # Transients
        send_buffer = dace.ndarray([2],
                                   dtype,
                                   storage=dace.StorageType.GPU_Global)
        recv_buffer = dace.ndarray([2],
                                   dtype,
                                   storage=dace.StorageType.GPU_Global)

        # Init transients
        for i in dace.map[0:2]:
            send_buffer[i] = gpu_id
        # The same handle is passed to the Send and the Recv of this GPU.
        group_handle = dace.define_local_scalar(
            dace.int32, storage=dace.StorageType.GPU_Global)
        # GPU 0 talks to peer 1; every other GPU talks to peer 0.
        # NOTE(review): the else-branch covers all gpu_id != 0, so this
        # presumably assumes num_gpus == 2 — confirm.
        if gpu_id == 0:
            dace.comm.nccl.Send(send_buffer, 1, group_handle=group_handle)
            dace.comm.nccl.Recv(recv_buffer, 1, group_handle=group_handle)
        else:
            dace.comm.nccl.Send(send_buffer, 0, group_handle=group_handle)
            dace.comm.nccl.Recv(recv_buffer, 0, group_handle=group_handle)

        pinned_out[gpu_id, :] = recv_buffer[:]

    out[:] = pinned_out[:]
    return out
예제 #4
0
 def testprog7(A: dace.float64[20, 20]):
     # Copies A[1, 1] into an int64 local scalar through a tasklet, then adds
     # that scalar to every element of A in place.
     j = dace.define_local_scalar(dace.int64)
     with dace.tasklet:
         inp << A[1, 1]
         out >> j
         # float64 input written to an int64 scalar.
         out = inp
     A[:] += j
예제 #5
0
파일: symm.py 프로젝트: cpenny42/dace
    def comp_all(j: _[0:N], i: _[0:M]):
        # Body for one (j, i) point of the range [0:N] x [0:M].
        # A, B, C, alpha, beta and datatype come from the enclosing scope.
        # Steps: reset the accumulator temp2, run a map over k in [0:i] that
        # updates C[k, j] and temp2, then combine everything into C[i, j].
        temp2 = dace.define_local_scalar(datatype)

        @dace.tasklet
        def reset_tmp():
            tmp >> temp2
            tmp = 0

        @dace.map
        def comp_t2(k: _[0:i]):
            ialpha << alpha
            ia << A[i, k]
            ibi << B[i, j]
            ibk << B[k, j]
            # Both outputs carry a summing conflict-resolution lambda.
            oc >> C(1, lambda a, b: a + b)[k, j]
            ot2 >> temp2(1, lambda a, b: a + b)

            # C[k, j] += alpha * B[i, j] * A[i, k]
            oc = ialpha * ibi * ia
            # temp2 += B[k, j] * A[i, k]
            ot2 = ibk * ia

        @dace.tasklet
        def comp_rest():
            ibeta << beta
            ib << B[i, j]
            iadiag << A[i, i]
            ialpha << alpha
            it2 << temp2
            ic << C[i, j]
            oc >> C[i, j]
            # C[i, j] = beta * C[i, j] + alpha * B[i, j] * A[i, i] + alpha * temp2
            oc = ibeta * ic + ialpha * ib * iadiag + ialpha * it2
예제 #6
0
 def scaltest(A: dace.float64[20, 20]):
     # Writes A[1, 1] + 5 into a float64 local scalar from inside a
     # single-iteration map and returns that scalar.
     scal = dace.define_local_scalar(dace.float64)
     # The map has exactly one iteration (range [0:1]); its index is unused.
     for _ in dace.map[0:1]:
         with dace.tasklet:
             inp << A[1, 1]
             out >> scal
             out = inp + 5
     return scal
예제 #7
0
def exchange(arr: d_float[lNy + 2, N]):
    # Exchanges boundary rows of arr with two peers through NCCL:
    # row 1 is sent to north_neighbor and row -2 to south_neighbor, while
    # rows -1 and 0 are received from south_neighbor and north_neighbor.
    # north_neighbor, south_neighbor, d_int, d_float, lNy and N are defined
    # outside this block.
    # Each Send/Recv pair shares one group handle.
    group_handle0 = dace.define_local_scalar(d_int)
    group_handle1 = dace.define_local_scalar(d_int)
    # send North
    dace.comm.nccl.Send(arr[1],
                        peer=north_neighbor,
                        group_handle=group_handle0)
    # recv South
    dace.comm.nccl.Recv(arr[-1],
                        peer=south_neighbor,
                        group_handle=group_handle0)
    # send South
    dace.comm.nccl.Send(arr[-2],
                        peer=south_neighbor,
                        group_handle=group_handle1)
    # recv North
    dace.comm.nccl.Recv(arr[0],
                        peer=north_neighbor,
                        group_handle=group_handle1)
예제 #8
0
def keyword_while(A: dace.float32[N], B: dace.float32[N]):
    # Copies A into B element by element using a `while True` loop that is
    # driven by explicit `continue` / `break` statements.
    i = dace.define_local_scalar(dtype=dace.int32)
    i = 0
    while True:
        # `+ i - i` cancels out, so the stored value equals A[i].
        B[i] = A[i] + i - i
        i += 1
        # Keep looping while i < N; leave the loop once i reaches N.
        if i < N:
            continue
        else:
            break
예제 #9
0
 def program(A: dace.float32[N], B: dace.float32[N]):
     # For every i in [0, N): copies A[0] into a per-iteration local scalar,
     # then copies that scalar into B[i]. Every B[i] receives A[0].
     for i in dace.map[0:N]:
         scal = dace.define_local_scalar(dace.float32)
         with dace.tasklet:
             # Always reads element 0, independent of the map index.
             a << A[0]
             x_out >> scal
             x_out = a
         with dace.tasklet:
             x_in << scal
             b >> B[i]
             b = x_in
예제 #10
0
 def program(A: dace.float32[N], B: dace.float32[N]):
     # For every i in [0, N): copies A[i] into a per-iteration local scalar,
     # then copies that scalar into B[i] (element-wise copy of A into B).
     for i in dace.map[0:N]:
         scal = dace.define_local_scalar(dace.float32)
         with dace.tasklet:
             a << A[i]
             x_out >> scal
             x_out = a  # x_out and scal should be a vector
         with dace.tasklet:
             x_in << scal
             b >> B[i]
             b = x_in
예제 #11
0
    def tasklets_only_reuse(A: dace.float32[1], C: dace.float32[1]):
        # Computes C[0] = sqrt(A[0]) * log(A[0] + 1) with three tasklets.
        # The first two tasklets both read A[0] and use the same connector
        # names (a, a_out), writing to different local scalars.
        tmp_a = dace.define_local_scalar(dace.float32)
        tmp_b = dace.define_local_scalar(dace.float32)

        # tmp_a = sqrt(A[0])
        with dace.tasklet:
            a << A[0]
            a_out >> tmp_a

            a_out = sqrt(a)

        # tmp_b = log(A[0] + 1)
        with dace.tasklet:
            a << A[0]
            a_out >> tmp_b

            a_out = log(a + 1)

        # C[0] = tmp_a * tmp_b
        with dace.tasklet:
            a << tmp_a
            b << tmp_b
            c >> C[0]
            c = a * b
예제 #12
0
def keyword_return(A: dace.float32[N]):
    # Copies A into a freshly defined local array B of shape (N,) using a
    # `while True` loop with explicit `continue` / `break`, then returns B.
    i = dace.define_local_scalar(dtype=dace.int32)
    i = 0
    B = dace.define_local((N, ), dtype=dace.float32)
    while True:
        # `+ i - i` cancels out, so the stored value equals A[i].
        B[i] = A[i] + i - i
        i += 1
        # Keep looping while i < N; leave the loop once i reaches N.
        if i < N:
            continue
        else:
            break
    return B
예제 #13
0
def transients(A: dace.float32[10]):
    # Pushes every element of A that is >= 0.5 into a stream, counts those
    # elements in oscalar, then writes the stream into oarray.
    # Returns the pair (oscalar, oarray).
    ostream = dace.define_stream(dace.float32, 10)
    oscalar = dace.define_local_scalar(dace.int32)
    oarray = dace.define_local([10], dace.float32)
    # Zero-initialise both outputs before the map.
    oarray[:] = 0
    oscalar = 0
    for i in dace.map[0:10]:
        if A[i] >= 0.5:
            # Push A[i] into the stream; -1 presumably marks a dynamic
            # number of accesses — verify against the DaCe memlet docs.
            A[i] >> ostream(-1)
            # NOTE(review): incremented from inside a dace.map without an
            # explicit conflict-resolution memlet — confirm the frontend
            # handles this update safely.
            oscalar += 1
    ostream >> oarray
    return oscalar, oarray
예제 #14
0
def transients(A: dace.float32[n]):
    # Pushes every element of A that is >= 0.5 into a stream, counts those
    # elements in oscalar through a summing conflict-resolution memlet, then
    # writes the stream into oarray. Returns the pair (oscalar, oarray).
    # n is defined outside this block.
    ostream = dace.define_stream(dace.float32, n)
    oscalar = dace.define_local_scalar(dace.int32)
    oarray = dace.define_local([n], dace.float32)
    # Zero-initialise both outputs before the map.
    oarray[:] = 0
    oscalar = 0
    for i in dace.map[0:n]:
        if A[i] >= 0.5:
            # Push A[i] into the stream; -1 presumably marks a dynamic
            # number of accesses — verify against the DaCe memlet docs.
            A[i] >> ostream(-1)
            with dace.tasklet:
                # Adds 1 to oscalar using the a + b conflict-resolution lambda.
                out >> oscalar(1, lambda a, b: a + b)
                out = 1
    ostream >> oarray
    return oscalar, oarray
예제 #15
0
 def program(A: dace.float32[N], B: dace.float32[N]):
     # For every i in [0, N): the first tasklet writes A[0] to arr[i] and to
     # scal; the second tasklet multiplies arr[0] by scal and stores the
     # product in B[i]. arr and scal are defined inside the map body.
     for i in dace.map[0:N]:
         arr = dace.define_local(N, dace.float32)
         scal = dace.define_local_scalar(dace.float32)
         with dace.tasklet:
             a << A[0]
             x_out >> arr[i]
             y_out >> scal
             x_out = a  # x_out looks like a scalar, but must be a vector (broadcast within Tasklet)
             y_out = a
         with dace.tasklet:
             # Reads arr[0], whereas the first tasklet wrote arr[i].
             x_in << arr[0]
             y_in << scal
             b >> B[i]
             b = x_in * y_in
예제 #16
0
def shiloach_vishkin(EL, comp):
    # Iteratively updates the label array comp from the edge list EL.
    # comp[v] starts as v; each round runs a "hook" step over the edges and a
    # "shortcut" step over the vertices, and rounds repeat while any hook
    # happened (flag_hook non-zero). V and E are defined outside this block.
    # In the memlets below, the parenthesised number is presumably the access
    # count, with -1 meaning dynamic — verify against the DaCe memlet docs.
    flag_hook = dace.define_local_scalar(dace.int32)

    # Start with the flag set so the while loop runs at least once.
    @dace.tasklet
    def initflag():
        out >> flag_hook
        out = 1

    # Every vertex starts with itself as its label.
    @dace.map(_[0:V])
    def init(v):
        out >> comp[v]
        out = v

    while flag_hook:

        # Clear the flag; the hook step sets it again if anything changed.
        @dace.tasklet
        def resetflag():
            out >> flag_hook
            out = 0

        # The map covers 2 * E rows of EL.
        @dace.map(_[0:2 * E])
        def hook(e):
            u << EL[e, 0]
            v << EL[e, 1]
            parents << comp(3)[:]
            out >> comp(1)[:]
            f >> flag_hook(-1)

            pu = parents[u]
            pv = parents[v]
            ppv = parents[pv]

            # Hook only when pv is its own parent and pu is the smaller label.
            if pu < pv and pv == ppv:
                out[ppv] = pu
                f = 1

        # Multi-jump version
        @dace.map(_[0:V])
        def shortcut(v):
            inp << comp(-1)[0:v + 1]
            out >> comp(-1)[v]

            # Follow parent links until a self-parent is reached; `out` is
            # only written when at least one jump is taken.
            p = inp[v]
            pp = inp[p]
            while p != pp:
                out = pp
                p = pp
                pp = inp[p]
예제 #17
0
 def testprog1(A: dace.float32[20, 20], scal: dace.float32):
     # Runs a loop while j[0] < 5 (j starts at 0 and grows by 1 per pass).
     # Each pass sets tmp = A + j and adds scal into the local scalar m once
     # per iteration of a 20-iteration map, via a summing conflict-resolution
     # memlet.
     # Nothing is returned; tmp, m and i are not read after being written
     # within this block.
     tmp = dace.ndarray([20, 20], dtype=dace.float32)
     m = dace.define_local_scalar(dace.float32)
     j = dace.ndarray([1], dtype=dace.int64)
     # The first assignment is immediately overwritten by the second.
     i = 1
     i = 2
     j[:] = 0
     while j[0] < 5:
         tmp[:] = A + j
         for k in dace.map[0:20]:
             with dace.tasklet:
                 inp << scal
                 out >> m(1, lambda a, b: a + b)
                 out = inp
         j += 1
         i += j
예제 #18
0
def shiloach_vishkin(EL, comp):
    # Iteratively updates the label array comp from the edge list EL.
    # comp[v] starts as v; each round runs a "hook" step over the edges and a
    # "shortcut" step over the vertices, and rounds repeat while any hook
    # happened (flag_hook non-zero). V and E are defined outside this block.
    # In the memlets below, the parenthesised number is presumably the access
    # count, with -1 meaning dynamic — verify against the DaCe memlet docs.
    flag_hook = dace.define_local_scalar(dace.int32)

    # Start with the flag set so the while loop runs at least once.
    with dace.tasklet:
        out >> flag_hook
        out = 1

    # Every vertex starts with itself as its label.
    for v in dace.map[0:V]:
        with dace.tasklet:
            out >> comp[v]
            out = v

    while flag_hook:

        # Clear the flag; the hook step sets it again if anything changed.
        with dace.tasklet:
            out >> flag_hook
            out = 0

        # The map covers 2 * E rows of EL.
        for e in dace.map[0:2 * E]:
            with dace.tasklet:
                u << EL[e, 0]
                v << EL[e, 1]
                parents << comp(3)[:]
                out >> comp(1)[:]
                f >> flag_hook(-1)

                pu = parents[u]
                pv = parents[v]
                ppv = parents[pv]

                # Hook only when pv is its own parent and pu is the smaller
                # label.
                if pu < pv and pv == ppv:
                    out[ppv] = pu
                    f = 1

        # Multi-jump version
        for v in dace.map[0:V]:
            with dace.tasklet:
                inp << comp(-1)[0:v + 1]
                out >> comp(-1)[v]

                # Follow parent links until a self-parent is reached; `out`
                # is only written when at least one jump is taken.
                p = inp[v]
                pp = inp[p]
                while p != pp:
                    out = pp
                    p = pp
                    pp = inp[p]
예제 #19
0
def nccl_ring_exchange():
    # For every GPU in a map over [0, num_gpus): passes a 2-element buffer
    # (initially filled with the GPU id) around a ring for num_gpus steps,
    # summing every received buffer into ring_sum.
    # Returns a [num_gpus] array holding element 0 of each GPU's ring_sum.
    # num_gpus and dtype are defined outside this block.
    out = dace.ndarray([num_gpus], dtype)
    pinned_out = dace.ndarray([num_gpus, 2],
                              dtype,
                              storage=dace.StorageType.CPU_Pinned)
    for gpu_id in dace.map[0:num_gpus]:
        # Transients
        send_buffer = dace.ndarray([2],
                                   dtype,
                                   storage=dace.StorageType.GPU_Global)
        recv_buffer = dace.ndarray([2],
                                   dtype,
                                   storage=dace.StorageType.GPU_Global)
        ring_sum = dace.ndarray([2], dtype, storage=dace.StorageType.GPU_Global)

        # Init transients
        for i in dace.map[0:2]:
            send_buffer[i] = gpu_id
            ring_sum[i] = 0

        # Ring Exchange
        group_handle = dace.define_local_scalar(dace.int32)
        for i in range(num_gpus):
            # Send to the next GPU and receive from the previous one; the
            # first and last GPU wrap around to close the ring.
            if gpu_id == 0:
                dace.comm.nccl.Send(send_buffer, gpu_id + 1, group_handle)
                dace.comm.nccl.Recv(recv_buffer, num_gpus - 1, group_handle)
            elif gpu_id == num_gpus - 1:
                dace.comm.nccl.Send(send_buffer, 0, group_handle)
                dace.comm.nccl.Recv(recv_buffer, gpu_id - 1, group_handle)
            else:
                dace.comm.nccl.Send(send_buffer, gpu_id + 1, group_handle)
                dace.comm.nccl.Recv(recv_buffer, gpu_id - 1, group_handle)

            # Accumulate what was received and forward it on the next step.
            # NOTE(review): this map index reuses the name `i` of the
            # enclosing range loop — confirm the shadowing is intended.
            for i in dace.map[0:2]:
                ring_sum[i] = recv_buffer[i] + ring_sum[i]
                send_buffer[i] = recv_buffer[i]

        pinned_out[gpu_id, :] = ring_sum[:]

    # Only column 0 of the pinned array is returned.
    out[:] = pinned_out[:, 0]
    return out
예제 #20
0
파일: trmm.py 프로젝트: zurvar/dace
        def computecol(i: _[0:M]):
            # Body for one i in [0:M]. Updates B[i, j] to
            # alpha * (B[i, j] + sum over k in [i + 1, M) of A[k, i] * B[k, j]).
            # A, B, alpha, j, M and datatype come from the enclosing scope.
            tmp = dace.define_local_scalar(datatype)

            # Reset the accumulator before the reduction map.
            @dace.tasklet
            def reset_tmp():
                out >> tmp
                out = 0

            @dace.map
            def compute_elem(k: _[i + 1:M]):
                ia << A[k, i]
                ib << B[k, j]
                # Accumulate into tmp with a summing conflict-resolution lambda.
                ob >> tmp(1, lambda a, b: a + b)
                ob = ia * ib

            @dace.tasklet
            def mult():
                ib << B[i, j]
                ialpha << alpha
                itmp << tmp
                ob >> B[i, j]
                # B[i, j] = alpha * (B[i, j] + tmp)
                ob = ialpha * (ib + itmp)
예제 #21
0
 def perscal(a: dace.float64[20]):
     # Stores a[1] + 1 in a float64 local scalar declared with the Persistent
     # allocation lifetime and returns that scalar.
     tmp = dace.define_local_scalar(
         dace.float64, lifetime=dace.AllocationLifetime.Persistent)
     # The scalar is written through slice syntax.
     tmp[:] = a[1] + 1
     return tmp
예제 #22
0
 def cast_scalar_on_gpu(inp: dace.float64):
     # Runs an ONNX Cast node on the scalar input, writing into a float32
     # local scalar, and returns the result wrapped in a 1-element float32
     # array.
     # donnx and to_int are defined outside this block; the target type that
     # to_int selects cannot be determined from here.
     output = dace.define_local_scalar(dace.float32)
     donnx.ONNXCast(input=inp, output=output, to=to_int)
     # Wrap the scalar result in a 1-element array before returning it.
     output_unsqueeze = dace.define_local([1], dace.float32)
     output_unsqueeze[0] = output
     return output_unsqueeze
예제 #23
0
def j_o_un(
    A: d_float[Ny, N],
    B: d_float[Ny, N],
    # lAs: d_float[size, lNy + 2, N],
    # lAe: d_float[size, lNy + 2, N],
    # lBb: d_float[size, lNy + 2, N],
    # lBe: d_float[size, lNy + 2, N],
    # lBi: d_float[size, lNy + 2, N],
):
    # Runs TSTEPS iterations of a 5-point averaging stencil (weight 0.2) on A,
    # with the rows split into `size` slabs of lNy rows, one per map iteration
    # ("rank"). Each rank works on local copies lA / lB that carry one extra
    # halo row at the top and bottom, and exchanges halo rows with its
    # neighbours through NCCL Send/Recv.
    # On exit each rank writes its lA and lB interior rows back to A and B.
    # The input values of B are not read: the line that would load them into
    # lB is commented out, so lB starts as zeros.
    # size, lNy, TSTEPS, d_float, d_int, Ny and N are defined outside this
    # block.

    for rank in dace.map[0:size]:
        # Local extended domain
        lA = np.zeros((lNy + 2, N), dtype=A.dtype)
        lB = np.zeros((lNy + 2, N), dtype=B.dtype)
        # Neighbours wrap around modulo size.
        north_neighbor = ((rank - 1) % size)
        south_neighbor = ((rank + 1) % size)

        lA[1:-1, :] = A[rank * lNy:(rank + 1) * lNy, :]
        # lB[1:-1, :] = B[rank * lNy:(rank + 1) * lNy, :]
        # lAs[rank] = lA[:]
        # One group handle per Send/Recv pair used below.
        group_handle0 = dace.define_local_scalar(d_int)
        group_handle1 = dace.define_local_scalar(d_int)
        group_handle2 = dace.define_local_scalar(d_int)
        group_handle3 = dace.define_local_scalar(d_int)
        group_handle4 = dace.define_local_scalar(d_int)
        group_handle5 = dace.define_local_scalar(d_int)

        # Initial halo exchange of lA before the time loop:
        # send row 1 north, receive row -1 from south,
        # send row -2 south, receive row 0 from north.
        dace.comm.nccl.Send(lA[1],
                            peer=north_neighbor,
                            group_handle=group_handle0)

        dace.comm.nccl.Recv(lA[-1],
                            peer=south_neighbor,
                            group_handle=group_handle0)

        dace.comm.nccl.Send(lA[-2],
                            peer=south_neighbor,
                            group_handle=group_handle1)

        dace.comm.nccl.Recv(lA[0],
                            peer=north_neighbor,
                            group_handle=group_handle1)

        for t in range(TSTEPS):
            # First half-step: lB is computed from lA. The boundary rows are
            # computed first, exchanged, and then the interior is computed.
            # comp_boundary(lA, lB)

            # North boundary
            lB[1, 1:-1] = 0.2 * (lA[1, 1:-1] + lA[1, :-2] + lA[1, 2:] +
                                 lA[2, 1:-1] + lA[0, 1:-1])
            # South boundary
            lB[-2, 1:-1] = 0.2 * (lA[-2, 1:-1] + lA[-2, :-2] + lA[-2, 2:] +
                                  lA[-1, 1:-1] + lA[-3, 1:-1])

            # lBb[rank] = lB[:]
            # send North
            dace.comm.nccl.Send(lB[1],
                                peer=north_neighbor,
                                group_handle=group_handle2)
            # recv South
            dace.comm.nccl.Recv(lB[-1],
                                peer=south_neighbor,
                                group_handle=group_handle2)
            # send South
            dace.comm.nccl.Send(lB[-2],
                                peer=south_neighbor,
                                group_handle=group_handle3)
            # recv North
            dace.comm.nccl.Recv(lB[0],
                                peer=north_neighbor,
                                group_handle=group_handle3)
            # exchange(lB, rank=rank, size=size)
            # lBe[rank] = lB[:]
            # comp_interior(lA, lB)
            lB[2:-2,
               1:-1] = 0.2 * (lA[2:-2, 1:-1] + lA[2:-2, :-2] + lA[2:-2, 2:] +
                              lA[3:-1, 1:-1] + lA[1:-3, 1:-1])

            # lBi[rank] = lB[:]

            # Second half-step: lA is computed from lB in the same order
            # (boundary rows, halo exchange, interior).
            # comp_boundary(lB, lA)
            # North boundary
            lA[1, 1:-1] = 0.2 * (lB[1, 1:-1] + lB[1, :-2] + lB[1, 2:] +
                                 lB[2, 1:-1] + lB[0, 1:-1])
            # South boundary
            lA[-2, 1:-1] = 0.2 * (lB[-2, 1:-1] + lB[-2, :-2] + lB[-2, 2:] +
                                  lB[-1, 1:-1] + lB[-3, 1:-1])
            # if t < TSTEPS - 1:

            # send North
            dace.comm.nccl.Send(lA[1],
                                peer=north_neighbor,
                                group_handle=group_handle4)
            # recv South
            dace.comm.nccl.Recv(lA[-1],
                                peer=south_neighbor,
                                group_handle=group_handle4)
            # send South
            dace.comm.nccl.Send(lA[-2],
                                peer=south_neighbor,
                                group_handle=group_handle5)
            # recv North
            dace.comm.nccl.Recv(lA[0],
                                peer=north_neighbor,
                                group_handle=group_handle5)
            # comp_interior(lB, lA)
            lA[2:-2,
               1:-1] = 0.2 * (lB[2:-2, 1:-1] + lB[2:-2, :-2] + lB[2:-2, 2:] +
                              lB[3:-1, 1:-1] + lB[1:-3, 1:-1])
        # Write the interior rows (halo rows excluded) back to this rank's
        # slab of the global arrays.
        A[rank * lNy:(rank + 1) * lNy] = lA[1:-1, :]
        B[rank * lNy:(rank + 1) * lNy] = lB[1:-1, :]
예제 #24
0
 def inner_view_forwarding(inp: dace.float64[9], bias: dace.float64[3],
                           target_shape: dace.int64[2]):
     # Calls the externally defined `sdfg` with the three inputs and a float64
     # local scalar as its `result` argument, then returns that scalar plus 1.
     # What `sdfg` computes is not visible from this block.
     result = dace.define_local_scalar(dace.float64)
     sdfg(inp=inp, bias=bias, target_shape=target_shape, result=result)
     return result + 1