Example #1
def make_matrix_softmax_cross_entropy(shape,
                                      tgt,
                                      tgt_host,
                                      func_name,
                                      dtype="float32"):
    """Hint: output shape should be (1,)"""
    h, w = shape
    x = tvm.placeholder(shape, dtype, name="x")
    k1 = tvm.reduce_axis((0, w), name="k1")
    x2 = tvm.compute((h, ), lambda i: tvm.max(x[i, k1], axis=k1))
    x3 = tvm.compute(shape, lambda i, j: x[i, j] - x2[i])
    x4 = tvm.compute(shape, lambda i, j: tvm.exp(x3[i, j]))
    k2 = tvm.reduce_axis((0, w), name="k2")
    x5 = tvm.compute((h, ), lambda i: tvm.sum(x4[i, k2], axis=k2))
    log_p = tvm.compute(shape, lambda i, j: tvm.log(x4[i, j] / x5[i]))
    q = tvm.placeholder(shape, dtype, name="q")
    k3 = tvm.reduce_axis((0, w), name="k3")
    y1 = tvm.compute((h, ),
                     lambda i: tvm.sum(q[i, k3] * log_p[i, k3], axis=k3))
    k4 = tvm.reduce_axis((0, h), name="k4")
    y2 = tvm.compute((1, ), lambda i: tvm.sum(-y1[k4], axis=k4))
    y3 = tvm.compute((1, ), lambda i: y2[i] / h)

    s = tvm.create_schedule(y3.op)
    f = tvm.build(s, [x, q, y3], tgt, target_host=tgt_host, name=func_name)
    return f
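The stages above follow the standard numerically stable formulation: subtract the row max, exponentiate, normalize, take the log, weight by the targets, and average the negated row sums. A minimal NumPy reference for cross-checking the built function, assuming float32 logits and one-hot (or soft) targets:

# NumPy reference for the pipeline above (a sketch for verification only).
import numpy as np

def softmax_cross_entropy_np(x, y):
    shifted = x - x.max(axis=1, keepdims=True)         # x3: subtract row max
    e = np.exp(shifted)                                 # x4
    log_p = np.log(e / e.sum(axis=1, keepdims=True))    # log_p: log-softmax
    per_row = (y * log_p).sum(axis=1)                   # y1
    return np.array([-per_row.mean()], dtype=x.dtype)   # y3: negated batch mean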
Example #2
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    """TODO: Your code here"""
    """Hint: output shape should be (1,)"""
    assert len(shape) == 2
    x = tvm.placeholder(shape, dtype=dtype, name='x')
    y = tvm.placeholder(shape, dtype=dtype, name='y')

    n, cls_num = shape
    max_iter = tvm.reduce_axis((0, cls_num), name='max_iter')

    x_max = tvm.compute((n,), lambda i: tvm.max(x[i, max_iter], axis=max_iter))
    e_x = tvm.compute(shape, lambda i, j: tvm.exp(x[i, j] - x_max[i]))

    sum_ex_iter = tvm.reduce_axis((0, cls_num), name='sum_ex_iter')
    e_x_sum = tvm.compute((n,), lambda i: tvm.sum(e_x[i, sum_ex_iter], axis=sum_ex_iter))
    log_softmax = tvm.compute(shape, lambda i, j: tvm.log(e_x[i, j] / e_x_sum[i]))
    y_mul_log_softmax = tvm.compute(shape, lambda i, j: log_softmax[i, j] * y[i, j])

    sum_entropy_iter = tvm.reduce_axis((0, cls_num), name='sum_entropy_iter')
    mean_iter = tvm.reduce_axis((0, n), name='mean_iter')
    sum_entropy = tvm.compute(
        (1,),
        lambda i: tvm.sum(y_mul_log_softmax[mean_iter, sum_entropy_iter],
                          axis=[mean_iter, sum_entropy_iter]))

    scale = tvm.const(-n, dtype)
    entropy = tvm.compute((1,), lambda i: sum_entropy[i] / scale)

    s = tvm.create_schedule(entropy.op)
    f = tvm.build(s, [x, y, entropy], tgt, target_host=tgt_host, name=func_name)
    return f
Example #3
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    """TODO: Your code here"""
    """Hint: output shape should be (1,)"""
    X = tvm.placeholder(shape, dtype=dtype, name="X")
    _, _, _, SOFTMAX = softmax_computation(X, shape, dtype)

    # cross_entropy = np.mean(
    #     -np.sum(y_ * np.log(autodiff.softmax_func(y)), axis=1), keepdims=True)

    Y = tvm.placeholder(shape, dtype=dtype, name="Y")

    MUL_LOG = tvm.compute(shape, lambda *i: Y(*i) * tvm.log(SOFTMAX(*i)), name="MUL_LOG")

    k1 = tvm.reduce_axis((0, shape[1]), name="k1")
    SUM1 = tvm.compute((shape[0],), lambda i: tvm.sum(-MUL_LOG(i, k1), axis=k1), name="SUM1")

    k2 = tvm.reduce_axis((0, shape[0]), name="k2")
    SUM2 = tvm.compute((1,), lambda _: tvm.sum(SUM1(k2), axis=k2), name="SUM2")

    OUT = tvm.compute((1,), lambda i: SUM2(i) / shape[0])

    # schedule
    s = tvm.create_schedule(OUT.op)

    # compile
    f = tvm.build(s, [X, Y, OUT], tgt, target_host=tgt_host, name=func_name)

    return f
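Example #3 relies on a softmax_computation helper that is not shown on this page; judging by the four values it unpacks, the helper presumably returns the max, shifted-exp, exp-sum, and softmax stages. A hypothetical sketch of such a helper, with names and return order being assumptions (the legacy tvm 0.x API is assumed imported as in the surrounding examples):

# Hypothetical softmax_computation helper (not the original; an assumed sketch).
def softmax_computation(X, shape, dtype):
    k = tvm.reduce_axis((0, shape[1]), name="k")
    MAX = tvm.compute((shape[0],), lambda i: tvm.max(X[i, k], axis=k), name="MAX")
    EXP = tvm.compute(shape, lambda i, j: tvm.exp(X[i, j] - MAX[i]), name="EXP")
    k2 = tvm.reduce_axis((0, shape[1]), name="k2")
    SUM = tvm.compute((shape[0],), lambda i: tvm.sum(EXP[i, k2], axis=k2), name="SUM")
    SOFTMAX = tvm.compute(shape, lambda i, j: EXP[i, j] / SUM[i], name="SOFTMAX")
    return MAX, EXP, SUM, SOFTMAX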
Example #4
def test_log_pow_llvm():
    # graph
    n = tvm.var('n')
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(A.shape,
                    lambda *i: tvm.power(tvm.log(A(*i)), 2.0),
                    name='B')
    s = tvm.create_schedule(B.op)
    # create iter var and assign them tags.
    bx, tx = s[B].split(B.op.axis[0], factor=32)
    # one line to build the function.
    if not tvm.module.enabled("llvm"):
        return

    flog = tvm.build(s, [A, B], "llvm", name="mylog")
    ctx = tvm.cpu(0)
    # launch the kernel.
    n = 1028
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
    repeat = 10
    ftimer = flog.time_evaluator(flog.entry_name, ctx, number=1, repeat=repeat)
    res = ftimer(a, b)
    assert (len(res.results) == repeat)
    np.testing.assert_allclose(b.asnumpy(),
                               np.power(np.log(a.asnumpy()), 2.0),
                               rtol=1e-5)
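time_evaluator returns a ProfileResult whose results field holds one measurement per repeat and whose mean field is their average. A short sketch of reporting the numbers collected above (it reuses the res variable from the test):

# Sketch: summarizing the ProfileResult gathered by the time_evaluator call.
print("per-repeat times (s):", list(res.results))
print("mean time (s):", res.mean)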
Example #6
def test_1():
    m = 64
    n = 128
    shape = (m, n)
    A = tvm.placeholder(shape, name="A")
    C = tvm.compute(shape, lambda *indice: tvm.log(A(*indice)), name="C")
    s = tvm.create_schedule(C.op)
    print(tvm.lower(s, [A, C], simple_mode=True))
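test_1 only lowers and prints the IR. To actually execute the elementwise log, one could build for the llvm target and check the result against NumPy, as in this sketch (assumes an llvm-enabled TVM build; it reuses s, A, C, and shape from the function above):

# Sketch: build and run the elementwise-log compute on CPU, verify with NumPy.
import numpy as np

f = tvm.build(s, [A, C], "llvm", name="log_kernel")
ctx = tvm.cpu(0)
a_np = np.random.uniform(low=0.5, high=2.0, size=shape).astype(A.dtype)
a_nd = tvm.nd.array(a_np, ctx)
c_nd = tvm.nd.array(np.zeros(shape, dtype=C.dtype), ctx)
f(a_nd, c_nd)
np.testing.assert_allclose(c_nd.asnumpy(), np.log(a_np), rtol=1e-5)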
Example #7
def log(x):
    """Take logarithm of input x.

    Parameters
    ----------
    x : tvm.Tensor
        Input argument.

    Returns
    -------
    y : tvm.Tensor
        The result.
    """
    return tvm.compute(x.shape, lambda *i: tvm.log(x(*i)))
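Usage of this wrapper follows the same pattern as the raw tvm.compute examples on this page; a short sketch, assuming the legacy 0.x API and an llvm-enabled build:

# Sketch: applying the log wrapper to a placeholder and building it for CPU.
A = tvm.placeholder((64,), name="A")
B = log(A)
s = tvm.create_schedule(B.op)
f = tvm.build(s, [A, B], "llvm", name="log_wrapper")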
Example #8
File: math.py Project: gwli/tvm
def log(x):
    """Take logarithm of input x.

    Parameters
    ----------
    x : tvm.Tensor
        Input argument.

    Returns
    -------
    y : tvm.Tensor
        The result.
    """
    return tvm.compute(x.shape, lambda *i: tvm.log(x(*i)))
def make_matrix_softmax_cross_entropy(shape,
                                      tgt,
                                      tgt_host,
                                      func_name,
                                      dtype="float32"):
    X = tvm.placeholder(shape, dtype=dtype, name="X")
    Y_orig = tvm.placeholder(shape, dtype=dtype, name="Y_orig")

    j1 = tvm.reduce_axis((0, shape[1]), "j1")
    j2 = tvm.reduce_axis((0, shape[1]), "j2")

    maxX = tvm.compute((shape[0], ),
                       lambda i: tvm.max(X[i, j1], axis=j1),
                       name="maxX")

    numerator = tvm.compute(shape,
                            lambda i, j: tvm.exp(X[i, j] - maxX[i]),
                            name="numerator")

    denominator = tvm.compute((shape[0], ),
                              lambda i: tvm.sum(numerator[i, j2], axis=j2),
                              name="denominator")

    m1 = tvm.reduce_axis((0, shape[0]), "m1")
    m2 = tvm.reduce_axis((0, shape[1]), "m2")

    cross_entropy_sum = tvm.compute((1, ),
                                    lambda i: tvm.sum(Y_orig[m1, m2] * tvm.log(
                                        numerator[m1, m2] / denominator[m1]),
                                                      axis=[m1, m2]),
                                    name="cross_entropy_sum")

    negated = tvm.compute((1, ),
                          lambda i: -cross_entropy_sum[i] / shape[0],
                          name="negated")

    s = tvm.create_schedule(negated.op)

    # print(tvm.lower(s, [X, Y_orig, negated], simple_mode=True))

    return tvm.build(s, [X, Y_orig, negated],
                     tgt,
                     target_host=tgt_host,
                     name=func_name)
Example #10
    def algorithm_forward(self):
        assert self.x.size == self.t.size, \
            "only supports one-hot vector"

        self.softmax, self.max_elem, self.expsum = Softmax.forward_(
            self.x, name=f'{self.label}')

        if self.x.ndim == 1:
            y = topi.reshape(self.softmax, (1, self.x.size))
            t = topi.reshape(self.t.tvm_tensor, (1, self.t.size))
            m = 1
            n = self.x.shape[0]
        elif self.x.ndim == 2:
            y = self.softmax
            t = self.t.tvm_tensor
            m, n = self.x.shape
        else:
            raise NotImplementedError

        self.ty = tvm.compute((m, n),
                              lambda i, j: tvm.log(y[i, j]) * t[i, j],
                              name=f'{self.label}:ty')

        k = tvm.reduce_axis((0, n), name='k')
        self.sum_ty = tvm.compute((m, ),
                                  lambda i: tvm.sum(self.ty[i, k], axis=k),
                                  name=f'{self.label}:sum_ty')

        # TODO: need to validate the shape and keepdims
        # self.shape would be like (1,1,1), which size is 1
        expected_size = 1
        assert self.size == expected_size, \
            f'size of SoftmaxWithCrossEntropyLoss must be {expected_size}, not {self.size}'

        k = tvm.reduce_axis((0, m), name='k')
        self.total = tvm.compute(self.shape,
                                 lambda *idxs: tvm.sum(self.sum_ty[k], axis=k),
                                 name=f'{self.label}:total')

        self.tvm_tensor = tvm.compute(self.shape,
                                      lambda *idxs: -self.total[idxs] / m,
                                      name=f'{self.label}:tensor')
Example #11
def make_matrix_softmax_cross_entropy(shape,
                                      tgt,
                                      tgt_host,
                                      func_name,
                                      dtype="float32"):
    """TODO: Your code here"""
    """Hint: output shape should be (1,)"""

    A = tvm.placeholder(shape, dtype=dtype, name="A")
    B = tvm.placeholder(shape, dtype=dtype, name="B")
    k = tvm.reduce_axis((0, shape[1]), name="k")
    max_A = tvm.compute((shape[0], ),
                        lambda i: tvm.max(A[i, k], axis=k),
                        name="max_A")
    exp = tvm.compute(shape,
                      lambda i, j: tvm.exp(A[i, j] - max_A[i]),
                      name="exp")
    k1 = tvm.reduce_axis((0, shape[1]), name="k1")
    sum_exp = tvm.compute((shape[0], ),
                          lambda i: tvm.sum(exp[i, k1], axis=k1),
                          name="sum_exp")
    softmax = tvm.compute(shape,
                          lambda i, j: exp[i, j] / sum_exp[i],
                          name="softmax")

    log = tvm.compute(shape, lambda i, j: tvm.log(softmax[i, j]), name="log")
    k2 = tvm.reduce_axis((0, shape[1]), name="k2")
    sum_softmax = tvm.compute(
        (shape[0], ),
        lambda i: tvm.sum(B[i, k2] * log[i, k2], axis=k2),
        name="sum_softmax")
    k3 = tvm.reduce_axis((0, shape[0]), name="k3")
    softmax_cross_entropy = tvm.compute(
        (1, ), lambda i: tvm.sum(-1 * sum_softmax[k3] / shape[0], axis=k3))

    s = tvm.create_schedule(softmax_cross_entropy.op)
    f = tvm.build(s, [A, B, softmax_cross_entropy],
                  tgt,
                  target_host=tgt_host,
                  name=func_name)
    return f
Example #12
def log_softmax(x):
    """Perform log softmax activation on the data

    Parameters
    ----------
    data : tvm.Tensor
        2-D input data

    Returns
    -------
    output : tvm.Tensor
        2-D output with same shape
    """

    assert len(x.shape) == 2, "only support 2-dim log softmax"
    m, n = x.shape
    k = tvm.reduce_axis((0, n), name='k')
    max_elem = tvm.compute((m, ), lambda i: tvm.max(x[i, k], axis=k))
    k = tvm.reduce_axis((0, n), name='k')
    expsum = tvm.compute(
        (m, ), lambda i: tvm.sum(tvm.exp(x[i, k] - max_elem[i]), axis=k))
    return tvm.compute(x.shape,
                       lambda i, j: x[i, j] - max_elem[i] - tvm.log(expsum[i]))
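This log-softmax folds the log into the expression x[i, j] - max_elem[i] - log(expsum[i]), avoiding an explicit softmax tensor. A quick numeric check against a NumPy reference, as a sketch (assumes float32 input and an llvm-enabled build):

# Sketch: build log_softmax for CPU and compare with a NumPy reference.
import numpy as np

m, n = 32, 10
x = tvm.placeholder((m, n), name="x")
y = log_softmax(x)
s = tvm.create_schedule(y.op)
f = tvm.build(s, [x, y], "llvm", name="log_softmax")

x_np = np.random.uniform(size=(m, n)).astype(x.dtype)
ref = x_np - x_np.max(axis=1, keepdims=True)
ref = ref - np.log(np.exp(ref).sum(axis=1, keepdims=True))

ctx = tvm.cpu(0)
x_nd = tvm.nd.array(x_np, ctx)
y_nd = tvm.nd.array(np.zeros((m, n), dtype=x.dtype), ctx)
f(x_nd, y_nd)
np.testing.assert_allclose(y_nd.asnumpy(), ref, rtol=1e-5)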
Example #13
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    assert len(shape) == 2, "only 2d tensor is accepted for batched softmax xent loss"
    num_batch, num_class = shape
    logits = tvm.placeholder(shape, dtype, "logits")
    truth = tvm.placeholder(shape, dtype, "truth")
    k = tvm.reduce_axis((0, num_class), name="k")
    max_logits = tvm.compute((num_batch,), lambda i: tvm.max(logits[i, k], axis=k))
    logits_shifted = tvm.compute(shape, lambda i, j: logits[i, j] - max_logits[i])
    exps = tvm.compute(shape, lambda *i: tvm.exp(logits_shifted(*i)))
    k = tvm.reduce_axis((0, num_class), name="k")
    exps_sum = tvm.compute((num_batch,), lambda i: tvm.sum(exps[i, k], axis=k))
    neg_pred_log = tvm.compute(shape, lambda i,j: tvm.log(exps_sum[i]) - logits_shifted[i, j])
    ewise_prod = tvm.compute(shape, lambda *i: truth(*i) * neg_pred_log(*i))

    i = tvm.reduce_axis((0, num_batch), name="i")
    j = tvm.reduce_axis((0, num_class), name="j")
    ce_sum = tvm.compute((1,), lambda _: tvm.sum(ewise_prod[i, j], axis=[i, j]))
    ce_mean = tvm.compute((1,), lambda _: ce_sum[0] / tvm.const(num_batch, dtype))

    s = tvm.create_schedule(ce_mean.op)
    f = tvm.build(s, [logits, truth, ce_mean], tgt, tgt_host, func_name)
    return f
Example #15
def test():
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (2, 16)
    a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host')
    print('a host ' + str(a_host))
    a = tvm.compute(shape, lambda *i: a_host(*i), name='a')
    a_buf = tvm.compute(shape, lambda *i: a(*i), name='a_buf')
    b_buf = tvm.compute(
        shape,
        lambda i, j: tvm.log(a_buf[i, j].astype(env.cfg['dtype_w'])),
        name='b_buf')
    b = tvm.compute(shape, lambda *i: b_buf(*i), name='b')
    b_host = tvm.compute(shape, lambda *i: b(*i), name='b_host')

    s = tvm.create_schedule(b_host.op)

    # mark variable scopes
    s[a].set_scope(env.dram_scope)
    s[b].set_scope(env.dram_scope)

    s[a_buf].set_scope(env.uni_scratchpad_scope)
    s[b_buf].set_scope(env.uni_scratchpad_scope)

    # print(dir(s[b].op.body))

    # mark compiler pragmas
    s[a].pragma(s[a].op.axis[0], env.dma_copy_pragma)
    s[b_host].pragma(s[b_host].op.axis[0], env.dma_copy_pragma)

    s[a_buf].pragma(s[a_buf].op.axis[0], env.scratchpad_ls)
    s[b].pragma(s[b].op.axis[0], env.scratchpad_ls)

    s[a_buf].compute_at(s[b_buf], b_buf.op.axis[0])

    # tensorize
    s[b_buf].tensorize(s[b_buf].op.axis[1], env.intrins.get('VLOG',
                                                            mode='inc'))

    # build
    print(tvm.lower(s, [a_host, b_host], simple_mode=True))
    print(nnpu.lower(s, [a_host, b_host], simple_mode=True))
    #exit()
    func = nnpu.build(s, [a_host, b_host], 'nnpu', 'llvm', name='nnpu_log')

    print('function built: ')
    #print(func.get_source())

    # prepare data
    ctx = tvm.nd.TVMContext(13, 0)  #???
    print('i want to know:')
    print(ctx.exist)
    a_np = np.random.randint(size=shape, dtype=a_host.dtype, low=1, high=20)
    a_nd = tvm.nd.array(a_np, ctx)
    b_nd = tvm.nd.array(np.zeros(shape).astype(b_host.dtype), ctx)

    # run
    func(a_nd, b_nd)

    print('run finished')

    b_np = b_nd.asnumpy()
    print('a=')
    print(a_np)
    print('b=')
    print(b_np)
    print('ground truth =')
    gt = np.log(a_np, dtype=b_host.dtype)
    print(gt)
    np.testing.assert_allclose(b_np, gt)
Example #16
def make_matrix_softmax_cross_entropy(shape,
                                      tgt,
                                      tgt_host,
                                      func_name,
                                      dtype="float32"):
    """TODO: Your code here"""
    """Hint: output shape should be (1,)"""
    X = tvm.placeholder(shape, dtype=dtype, name='X')
    a1 = tvm.reduce_axis((0, shape[1]), name='a1')
    MAX_X = tvm.compute((shape[0], ), lambda i: tvm.max(X[i, a1], axis=[a1]))

    E_X = tvm.compute(shape, lambda i, j: tvm.exp(X[i, j] - MAX_X(i)))

    a2 = tvm.reduce_axis((0, shape[1]), name='a2')
    E_X_SUM = tvm.compute((shape[0], ),
                          lambda i: tvm.sum(E_X[i, a2], axis=[a2]))

    SOFTMAX_X = tvm.compute(shape, lambda i, j: E_X[i, j] / E_X_SUM(i))

    LOG_SOFTMAX_X = tvm.compute(shape, lambda i, j: tvm.log(SOFTMAX_X[i, j]))

    X_P = tvm.placeholder(shape, dtype=dtype, name='X_P')

    MUL = tvm.compute(shape, lambda i, j: X_P[i, j] * LOG_SOFTMAX_X[i, j])

    a3 = tvm.reduce_axis((0, shape[1]), name='a3')
    SUM = tvm.compute((shape[0], ), lambda i: tvm.sum(-MUL[i, a3], axis=[a3]))

    a4 = tvm.reduce_axis((0, shape[0]), name='a4')
    MEAN = tvm.compute((1, ), lambda i: tvm.sum(SUM[a4] / shape[0], axis=[a4]))

    # s = tvm.create_schedule([MAX_X.op, E_X.op, E_X_SUM.op, SOFTMAX_X.op, LOG_SOFTMAX_X.op, MUL.op, SUM.op, MEAN.op])
    s = tvm.create_schedule(MEAN.op)

    # print(tvm.lower(s, [X, X_P, MEAN], simple_mode=True))

    # MAX_X
    s[MAX_X].bind(MAX_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[MAX_X].bind(a1, tvm.thread_axis("threadIdx.x"))

    # E_X
    s[E_X].bind(E_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[E_X].bind(E_X.op.axis[1], tvm.thread_axis("threadIdx.x"))

    # E_X_SUM
    s[E_X_SUM].bind(E_X_SUM.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[E_X_SUM].bind(a2, tvm.thread_axis("threadIdx.x"))

    # SOFTMAX_X
    s[SOFTMAX_X].bind(SOFTMAX_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[SOFTMAX_X].bind(SOFTMAX_X.op.axis[1], tvm.thread_axis("threadIdx.x"))

    # LOG_SOFT_MAX
    s[LOG_SOFTMAX_X].bind(LOG_SOFTMAX_X.op.axis[0],
                          tvm.thread_axis("blockIdx.x"))
    s[LOG_SOFTMAX_X].bind(LOG_SOFTMAX_X.op.axis[1],
                          tvm.thread_axis("threadIdx.x"))

    # MUL
    s[MUL].bind(MUL.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[MUL].bind(MUL.op.axis[1], tvm.thread_axis("threadIdx.x"))

    # SUM
    s[SUM].bind(SUM.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[SUM].bind(a3, tvm.thread_axis("threadIdx.x"))

    # MEAN
    # s[MEAN].bind(a4, tvm.thread_axis("blockIdx.x"))
    s[MEAN].bind(a4, tvm.thread_axis("threadIdx.x"))

    # print(tvm.lower(s, [X, X_P, MEAN], simple_mode=True))

    # block_x = tvm.thread_axis("blockIdx.x")
    # thread_x = tvm.thread_axis("threadIdx.x")

    # zo, zi = s[SUM].split(SUM.op.axis[0], 3)
    # print(tvm.lower(s, [X, X_P, MEAN], simple_mode=True))
    # s[SUM].bind(zo, block_x)
    # s[SUM].bind(zi, thread_x)

    f = tvm.build(s, [X, X_P, MEAN], tgt, target_host=tgt_host, name=func_name)
    return _export_module(f, func_name, remote)
Example #17
                        default='S0', choices=['S0', 'S1', 'SC'])
args = parser.parse_args()

with ScheduleProcHelper(), nnpu.Environment('./nnpu_config_fp32.yaml'):
    env = nnpu.get_env()
    nnpu.set_device(env, type=args.sim)
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']

    assert dtype_w in ['float32', 'float16'], 'when testing activation function, float dtype is needed'

    shape = (64, )
    a = tvm.placeholder(shape, dtype_w, 'a')
    a_buf = tvm.compute(shape, lambda *i: a(*i), 'a_buf')

    exp = tvm.compute(shape, lambda i: tvm.exp(a_buf[i]), 'exp')
    log = tvm.compute(shape, lambda i: tvm.log(a_buf[i]), 'log')
    tanh = tvm.compute(shape, lambda i: tvm.tanh(a_buf[i]), 'tanh')
    sigmoid = tvm.compute(shape, lambda i: tvm.sigmoid(a_buf[i]), 'sigmoid')

    # k = tvm.reduce_axis((0, 16), 'k0')
    # sum = tvm.compute((1, ), lambda i: tvm.sum(sigmoid[k], axis=k), 'sum')
    # nnpu.utils.MarkScope(sum)

    # softmax = tvm.compute(shape, lambda i: sigmoid[i] / sum[0], 'softmax')
    # nnpu.utils.MarkScope(softmax)
    # softmax_host, _ = nnpu.utils.CopyBufToH(softmax, 'softmax')

    s = nnpu.create_schedule([exp.op, log.op, tanh.op, sigmoid.op])
    # cache write
    exp_buf = s.cache_write(exp, env.get_scope('buffer0'))
    log_buf = s.cache_write(log, env.get_scope('buffer0'))
Example #18
def logsummul(dtype):
    nn = 512
    bb = 32
    n = tvm.convert(nn)
    b = tvm.convert(bb)
    m, l = n, n
    A = tvm.placeholder((b, n, l), name='A', dtype=dtype)
    B = tvm.placeholder((b, m, l), name='B', dtype=dtype)
    k = tvm.reduce_axis((0, l), name='k')
    k2 = tvm.reduce_axis((0, l), name='k2')

    M = tvm.compute(
        (b, m, n),
        lambda bb, ii, jj: tvm.max(A[bb, jj, k] + B[bb, ii, k], axis=k),
        name='M')
    M2 = tvm.compute(
        (b, m, n),
        lambda bb, ii, jj: tvm.sum(
            tvm.exp(A[bb, jj, k2] + B[bb, ii, k2] - M[bb, ii, jj]), axis=k2),
        name='M2')

    C = tvm.compute((b, m, n),
                    lambda bb, ii, jj: tvm.log(M2[bb, ii, jj]) + M[bb, ii, jj],
                    name='C')

    s = tvm.create_schedule(C.op)

    AA = s.cache_read(A, "shared", [M])
    AL = s.cache_read(AA, "local", [M])
    BB = s.cache_read(B, "shared", [M])
    BL = s.cache_read(BB, "local", [M])

    AA2 = s.cache_read(A, "shared", [M2])
    AL2 = s.cache_read(AA2, "local", [M2])
    BB2 = s.cache_read(B, "shared", [M2])
    BL2 = s.cache_read(BB2, "local", [M2])

    cfg = autotvm.get_config()
    cfg.define_knob("y_bn", [32, 64, 128])
    cfg.define_knob("x_bn", [32, 64, 128])
    cfg.define_knob("y_t", [8, 32, 64])
    cfg.define_knob("x_t", [2, 4, 8, 32])
    cfg.define_knob("k_split", [1, 2, 8, 16])
    unroll = True

    #cfg.define_knob("y_bn", [64])
    #cfg.define_knob("x_bn", [ 64])
    #cfg.define_knob("y_t", [8])
    #cfg.define_knob("x_t", [8])
    #cfg.define_knob("k_split", [8])

    b, y, x = s[C].op.axis
    y_bn = cfg["y_bn"].val
    x_bn = cfg["x_bn"].val
    by, y = s[C].split(y, y_bn)
    bx, x = s[C].split(x, x_bn)

    y_nthreads = cfg["y_t"].val
    x_nthreads = cfg["x_t"].val
    ty, yi = s[C].split(y, nparts=y_nthreads)
    tx, xi = s[C].split(x, nparts=x_nthreads)
    thread_x = tvm.thread_axis((0, x_nthreads), "threadIdx.x")
    thread_y = tvm.thread_axis((0, y_nthreads), "threadIdx.y")

    s[C].reorder(b, by, bx, ty, tx, yi, xi)
    s[C].bind(b, tvm.thread_axis("blockIdx.z"))
    s[C].bind(by, tvm.thread_axis("blockIdx.y"))
    s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    if unroll:
        s[C].pragma(yi, "auto_unroll_max_step", 16)

    def cache_split(shared):
        s[shared].compute_at(s[C], tx)
        _, yi, xi = s[shared].op.axis
        k, = s[shared].op.reduce_axis
        ko, ki = s[shared].split(k, cfg["k_split"].val)
        s[shared].reorder(ko, ki, yi, xi)
        if unroll:
            s[shared].pragma(ki, "auto_unroll_max_step", 16)
        return ko, ki

    ko, ki = cache_split(M)
    ko2, ki2 = cache_split(M2)

    def cache_read(shared, AA, AL, BB, BL, ko, ki):
        s[AA].compute_at(s[shared], ko)
        s[AL].compute_at(s[shared], ki)
        s[BB].compute_at(s[shared], ko)
        s[BL].compute_at(s[shared], ki)

        _, y, k = s[AA].op.axis
        ty, yi = s[AA].split(y, nparts=y_nthreads)
        tx, ki = s[AA].split(k, nparts=x_nthreads)
        s[AA].reorder(ty, tx, yi, ki)
        s[AA].bind(ty, thread_y)
        s[AA].bind(tx, thread_x)
        if unroll:
            s[AA].pragma(yi, "auto_unroll_max_step", 16)

        _, x, k = s[BB].op.axis
        ty, xi = s[BB].split(x, nparts=y_nthreads)
        tx, ki = s[BB].split(k, nparts=x_nthreads)
        s[BB].bind(ty, thread_y)
        s[BB].bind(tx, thread_x)
        s[BB].reorder(ty, tx, xi, ki)
        if unroll:
            s[BB].pragma(xi, "auto_unroll_max_step", 16)

    cache_read(M, AA, AL, BB, BL, ko, ki)
    cache_read(M2, AA2, AL2, BB2, BL2, ko2, ki2)

    return s, [A, B, C]
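logsummul is written as an autotvm template: it calls autotvm.get_config() and defines tuning knobs, then returns the schedule and tensors. A sketch of how such a template could be registered and tuned, assuming the TVM 0.x autotvm API, a CUDA-capable device, and that logsummul is decorated with @autotvm.template; the trial count and log-file name are placeholders:

# Sketch: tuning the logsummul template with autotvm (0.x API; assumptions above).
import tvm
from tvm import autotvm

task = autotvm.task.create(logsummul, args=('float32',), target='cuda')
print(task.config_space)

measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(number=5))

tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=100,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('logsummul.log')])

# Apply the best config found and build the kernel.
with autotvm.apply_history_best('logsummul.log'):
    with tvm.target.create('cuda'):
        s, args = logsummul('float32')
        func = tvm.build(s, args, name='logsummul')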