Example #1
def test_rfactor():
    n = tvm.var('n')
    k1 = tvm.reduce_axis((0, n), name="k1")
    k2 = tvm.reduce_axis((0, n), name="k2")
    A = tvm.placeholder((n, n, n), name='A')
    B = tvm.compute((n, ), lambda i: tvm.sum(A[i, k1, k2], axis=[k1, k2]))
    # normal schedule
    s = tvm.create_schedule(B.op)
    BF = s.rfactor(B, k1)
    assert(tuple(BF.shape) == (n, n))
    assert(set(BF.op.body[0].axis) == set([k2]))
    assert(s[B].op.body[0].axis[0].dom.extent == n)
    assert(len(s[B].all_iter_vars) == 2)
    # schedule with split
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(k1, factor=4)
    xo, xi = s[B].split(B.op.axis[0], factor=8)
    BF = s.rfactor(B, ki)
    assert(BF.shape[0].value == 4)
    assert(BF.shape[1] == n)
    assert(BF.op.body[0].axis[0] ==  k2)
    assert(BF.op.body[0].axis[1].var ==  ko.var)
    assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
    # schedule with factor_axis
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(k1, factor=4)
    xo, xi = s[B].split(B.op.axis[0], factor=8)
    BF = s.rfactor(B, ki, 1)
    assert(n == BF.shape[0])
    assert(BF.shape[1].value == 4)
    assert(BF.op.body[0].axis[0] ==  k2)
    assert(BF.op.body[0].axis[1].var ==  ko.var)
    assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
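The assertions above only check the factored shapes and axes. For reference, a minimal sketch of how rfactor is typically used in a real schedule (a cross-thread reduction in the style of the classic TVM reduction tutorial; the factor of 16 and the GPU thread bindings are illustrative assumptions):

def rfactor_cross_thread_reduction_sketch():
    n = tvm.var("n")
    m = tvm.var("m")
    A = tvm.placeholder((n, m), name="A")
    k = tvm.reduce_axis((0, m), name="k")
    B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
    s = tvm.create_schedule(B.op)
    # split the reduction and factor the inner part into its own stage BF
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)
    # let 16 threads cooperate on the remaining reduction of B
    tx = tvm.thread_axis("threadIdx.x")
    s[B].bind(s[B].op.reduce_axis[0], tx)
    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
    # only the first thread stores the final result
    s[B].set_store_predicate(tx.var.equal(0))
    s[B].bind(s[B].op.axis[0], tvm.thread_axis("blockIdx.x"))
    return tvm.lower(s, [A, B], simple_mode=True)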
Example #2
def verify_full(shape, dtype, fill_value):
    A = tvm.placeholder(shape, dtype=dtype, name="A")
    B = topi.cpp.full_like(A, fill_value)
    C = topi.cpp.full(shape, dtype, fill_value)
    s1 = tvm.create_schedule([B.op])
    s2 = tvm.create_schedule([C.op])

    def get_ref_data():
        return np.full(shape, fill_value, dtype)
    np_nd = get_ref_data()

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        target = topi.cpp.TEST_create_target(device)
        ctx = tvm.context(device, 0)
        out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
        f = tvm.build(s1, [A, B], device, name="full_like")
        f(tvm.nd.array(np.zeros(shape, dtype), ctx), out)
        tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)

        f = tvm.build(s2, [C], device, name="full")
        f(out)
        tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)

    for device in ["llvm"]:
        check_device(device)
Example #3
def test_rpc_module():
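    # NOTE: proxy_host, proxy_port, key, destination, target, arch and sdk are not defined in this
    # snippet; they are presumably module-level configuration (RPC proxy address, Xcode SDK and
    # build targets) in the source file this example was taken from.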
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    temp = util.tempdir()
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    # Build the dynamic lib.
    # If we only want the CPU path (no Metal), build with the host target instead of "metal".
    f = tvm.build(s, [A, B], "metal", target_host=target, name="myadd")
    path_dso1 = temp.relpath("dev_lib.dylib")
    f.export_library(path_dso1, xcode.create_dylib,
                     arch=arch, sdk=sdk)
    xcode.codesign(path_dso1)

    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].parallel(xi)
    s[B].pragma(xo, "parallel_launch_point")
    s[B].pragma(xi, "parallel_barrier_when_finish")
    f = tvm.build(s, [A, B], target, name="myadd_cpu")
    path_dso2 = temp.relpath("cpu_lib.dylib")
    f.export_library(path_dso2, xcode.create_dylib,
                     arch=arch, sdk=sdk)
    xcode.codesign(path_dso2)

    # Start RPC test server that contains the compiled library.
    server = xcode.popen_test_rpc(proxy_host, proxy_port, key,
                                  destination=destination,
                                  libs=[path_dso1, path_dso2])

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.metal(0)
    f1 = remote.load_module("dev_lib.dylib")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
    # CPU
    ctx = remote.cpu(0)
    f2 = remote.load_module("cpu_lib.dylib")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
Example #4
def test_add_pipeline():
    nn = 64
    max_threads = 4
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, (n+1) // 2) as i:
            ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    def extern_generator_gpu(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent", (nn+max_threads-1) // max_threads)
        ib.scope_attr(tx, "thread_extent", max_threads)
        idx = bx.var * max_threads + tx.var
        with ib.if_scope(ib.likely(idx < n)):
            ib.emit(outs[0].vstore(idx*2, ins[0].vload(idx*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    C_cpu = tvm.extern(A.shape, [A], extern_generator, name='C')
    C_gpu = tvm.extern(A.shape, [A], extern_generator_gpu, name='C')
    s_cpu = tvm.create_schedule(C_cpu.op)
    s_gpu = tvm.create_schedule(C_gpu.op)
    print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True))
    print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True))

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        s = s_gpu if target in ['opencl', 'cuda'] else s_cpu
        C = C_gpu if target in ['opencl', 'cuda'] else C_cpu
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
        
    check_target("llvm")
    check_target("opencl")
    check_target("cuda")
Example #5
def test_bound_tensor_compute_op():
    def intrin_test():
      m1 = tvm.var("m1")
      n1 = tvm.var("n1")
      a = tvm.placeholder((m1, n1), name='a')
      c = tvm.compute((1, n1), lambda i, j : a[0, j] + a[1, j] + a[2, j], name='c')

      Ab = tvm.decl_buffer(a.shape, name="Abuf", offset_factor=1)
      Cb = tvm.decl_buffer(c.shape, name="Cbuf", offset_factor=1)

      def intrin_func(ins, outs):
        aa = ins[0]
        cc = outs[0]
        def _body():
          ib = tvm.ir_builder.create()
          ib.emit(tvm.call_extern("int32", "test", cc.access_ptr("w"), aa.access_ptr("r")))
          return ib.get()
        return _body()
      with tvm.build_config(offset_factor=1):
        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a : Ab, c : Cb})

    test_func = intrin_test()
    A = tvm.placeholder((20,20), name='A')
    B = tvm.compute(A.shape, lambda i,j : A[i,j], name='B')
    C = tvm.compute((10, 20), lambda i : test_func(B[i:10, 0:20]), name='C')
    s = tvm.create_schedule(C.op)
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    assert(bounds[B.op.axis[0]].extent.value == 10)
Example #6
def test_scan_group():
    m = tvm.var("m")
    n = tvm.var("n")
    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
    s_state = tvm.placeholder((m, n))
    s_init = tvm.compute((1, n), lambda _, i: x[0, i])

    s_update1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i])
    s_update2 = tvm.compute((m, n), lambda t, i: s_update1[t, i] + 1)
    s_update3 = tvm.compute((m, n), lambda t, i: s_update2[t, i] + 1)
    res = tvm.scan(s_init, s_update3, s_state, inputs=x)

    s = tvm.create_schedule(res.op)
    assert s[s_update1].group is not None
    assert s[s_update2].group == s[s_update1].group
    # Assign within group, is valid
    s[s_update1].compute_at(s[s_update2], s_update2.op.axis[1])
    # create a new group, for [s_update2 and s_update1]
    g2 = s.create_group(outputs=s_update2, inputs=[s_state, x])
    assert g2.group is not None
    assert g2.group == s[s_update3].group
    assert s[s_update2].group == g2
    assert s[s_update1].group == g2
    g2.compute_at(s[s_update3], s_update3.op.axis[1])
    assert g2.attach_stage == s[s_update3]
    try:
        # compute outside group error.
        s[s_update2].compute_at(s[s_init], s_init.op.axis[0])
        assert False
    except tvm.TVMError:
        pass
Example #7
File: conv2d.py Project: bddppq/tvm
def schedule_conv2d_nchw(outs):
    """Schedule for conv2d_nchw for Intel Graphics

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of conv2d_nchw
        in the format of an array of tensors.
    Returns
    -------
    s: Schedule
        The computation schedule for conv2d_nchw.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])
    scheduled_ops = []

    def traverse(op):
        """inline all one-to-one-mapping operators except the last stage (output)"""
        if tag.is_broadcast(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                    traverse(tensor.op)
        if 'conv2d' in op.tag:
            _schedule_cl_spatialpack(s, op)

        scheduled_ops.append(op)

    traverse(outs[0].op)
    return s
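A rough, hypothetical usage sketch of this entry point: the output tensors of the conv2d compute are passed in and a fused schedule comes back. The dispatch through topi.generic and the exact topi.nn.conv2d signature (in particular the dilation argument) vary between TVM versions, so treat this as illustrative only:

def conv2d_intel_graphics_usage_sketch():
    data = tvm.placeholder((1, 64, 56, 56), name='data')     # NCHW, shapes are made up
    kernel = tvm.placeholder((64, 64, 3, 3), name='kernel')
    with tvm.target.intel_graphics():
        # under an Intel Graphics target the generic dispatcher is expected to route to this
        # schedule; the dilation keyword may not exist in older TVM versions
        conv = topi.nn.conv2d(data, kernel, strides=1, padding=1, dilation=1, layout='NCHW')
        s = topi.generic.schedule_conv2d_nchw([conv])
    return tvm.lower(s, [data, kernel, conv], simple_mode=True)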
Example #8
def test_bound_nest_thread():
    m = tvm.var('m')
    A = tvm.placeholder((m), name='A')
    A1 = tvm.compute((m,), lambda i: A[i], name='A1')
    A2 = tvm.compute((m,), lambda i: A1[i] + 2, name='A2')
    A3 = tvm.compute((m,), lambda i: A2[i] + 3, name='A3')

    s = tvm.create_schedule(A3.op)
    s[A2].set_scope("shared")
    s[A1].set_scope("local")

    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    bx, tx = s[A3].split(A3.op.axis[0], factor=32)
    s[A3].bind(bx, block_x)
    s[A3].bind(tx, thread_x)
    s[A2].compute_at(s[A3], tx)
    _, xi = s[A2].split(A2.op.axis[0], nparts=1)
    s[A2].bind(xi, thread_x)
    s[A1].compute_at(s[A3], tx)
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    assert(bounds[A1.op.axis[0]].extent.value==1)
    assert(bounds[A2.op.axis[0]].extent.value==32)
    assert(bounds[A3.op.axis[0]].extent == m)
Example #9
def schedule_conv2d_nchw_cuda(cfg, outs):
    """TOPI schedule callback of conv2d for cuda gpu

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template

    outs: Array of Tensor
        The computation graph description of conv2d
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for conv2d.
    """
    target = tvm.target.current_target()
    if 'cudnn' in target.libs:
        return generic.schedule_extern(outs)

    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if op.tag == 'conv2d_nchw':
            schedule_direct_cuda(cfg, s, op.output(0))
        if op.tag == 'conv2d_nchw_winograd':
            schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False)
        if op.tag == "conv2d_NCHWc_int8":
            schedule_conv2d_NCHWc_int8(cfg, s, op.output(0))

    traverse_inline(s, outs[0].op, _callback)
    return s
Example #10
def test_min_repeat_ms():
    tmp = tempdir()
    filename = tmp.relpath("log")

    @tvm.register_func
    def my_debug(filename):
        """one call lasts for 100 ms and writes one character to a file"""
        time.sleep(0.1)
        with open(filename, "a") as fout:
            fout.write("c")

    X = tvm.compute((), lambda : tvm.call_packed("my_debug", filename))
    s = tvm.create_schedule(X.op)
    func = tvm.build(s, [X])

    x = tvm.nd.empty((), dtype="int32")
    ftimer = func.time_evaluator(func.entry_name, tvm.cpu(),
                                 number=1, repeat=1)
    ftimer(x)

    with open(filename, "r") as fin:
        ct = len(fin.readline())

    assert ct == 2


    ftimer = func.time_evaluator(func.entry_name, tvm.cpu(),
                                 number=1, repeat=1, min_repeat_ms=1000)
    ftimer(x)

    # make sure we get more than 10 calls
    with open(filename, "r") as fin:
        ct = len(fin.readline())

    assert ct > 10 + 2
Example #11
def verify_log_softmax(m, n):
    A = tvm.placeholder((m, n), name='A')
    B = topi.nn.log_softmax(A)
    # confirm lower works
    s = tvm.create_schedule([B.op])
    tvm.lower(s, [A, B], simple_mode=True)
    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = topi.testing.log_softmax_python(a_np)

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_softmax(B)
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        foo = tvm.build(s, [A, B], device, name="log_softmax")
        foo(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)

    for device in ["opengl"]:
        check_device(device)
Example #12
def test_matmul_add():
    n = 1024
    l = 128
    m = 235
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = rocblas.matmul(A, B)
    s = tvm.create_schedule(C.op)

    def verify(target="rocm"):
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True):
            print("skip because extern function is not available")
            return
        ctx = tvm.rocm(0)
        f = tvm.build(s, [A, B, C], target)
        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        f(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
    verify()
Example #13
File: nn.py Project: LANHUIYING/tvm
def _schedule_conv2d(outs):
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])
    tvm.schedule.AutoInlineInjective(s)

    def traverse(OP):
        """Internal travserse function"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_injective(OP.tag):
            if OP not in s.outputs:
                s[OP].compute_inline()
            for tensor in OP.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)
        # schedule conv2d
        elif OP.tag.find("conv2d") >= 0:
            Conv2d = OP.output(0)
            if not Conv2d.op in s.outputs:
                Out = outs[0].op.output(0)
                s[Conv2d].compute_at(s[Out], s[Out].op.axis[1])
        else:
            raise RuntimeError("Unsupported operator: %s" % OP.tag)

    traverse(outs[0].op)

    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
    return s
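A small sketch of how this helper might be exercised (shapes are made up; only tvm.lower is shown, since the "pipeline" thread axis targets an accelerator backend rather than a plain CPU build):

def _schedule_conv2d_usage_sketch():
    data = tvm.placeholder((1, 16, 32, 32), name='data')
    kernel = tvm.placeholder((16, 16, 3, 3), name='kernel')
    # same positional call style as the conv2d_no_batching example further down this page
    conv = topi.nn.conv2d_nchw(data, kernel, 1, 1, 'float32')
    s = _schedule_conv2d([conv])
    return tvm.lower(s, [data, kernel, conv], simple_mode=True)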
Example #14
def test_add():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)

    def check_c():
        mhost = tvm.build(s, [A, B, C], "c", name="fadd")
        temp = util.tempdir()
        path_dso = temp.relpath("temp.so")
        mhost.export_library(path_dso)
        m = tvm.module.load(path_dso)
        fadd = m['fadd']
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())
    check_c()
Example #15
def test_double_splitting_with_indivisible_factors():
    m = 48
    dtype="float32"
    A = tvm.placeholder((m,), name='A', dtype=dtype)
    C = tvm.compute((m,), lambda i: A[i], name='C')
    D = tvm.compute((m,), lambda i: C[i], name='D')

    s = tvm.create_schedule(D.op)
    co, ci = s[C].split(C.op.axis[0], factor=10)
    do, di = s[D].split(D.op.axis[0], 32)
    s[C].compute_at(s[D], do)

    target = 'llvm'
    with tvm.build_config(partition_const_loop=True):
        f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False)
        func = tvm.build(f, target=target)

    # Find the beginning of the Halide IR corresponding to the kernel code
    # and make sure it doesn't have any if statements left
    top_produce = find_top_produce(f.body)
    assert(not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.stmt.IfThenElse))))

    # check functional correctness of generated code
    ctx = tvm.context(target, 0)
    a = tvm.nd.array(numpy.ones(m,).astype(dtype), ctx)
    c = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    d = tvm.nd.array(numpy.zeros(m,).astype(dtype), ctx)
    func(a, c, d)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5)
    tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5)
Example #16
File: nn.py Project: LANHUIYING/tvm
def schedule_softmax(outs):
    """Schedule for softmax

    Parameters
    ----------
    outs: Array of Tensor
          The computation graph description of softmax
          in the format of an array of tensors.

    Returns
    -------
    sch: Schedule
        The computation schedule for the op.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])
    tvm.schedule.AutoInlineInjective(s)

    softmax = outs[0]
    max_elem = softmax.op.input_tensors[1]
    expsum = softmax.op.input_tensors[2]

    s[expsum].compute_at(s[softmax], s[softmax].op.axis[1])
    s[max_elem].compute_at(s[softmax], s[softmax].op.axis[1])

    px, x = s[softmax].split(softmax.op.axis[0], nparts=1)
    s[softmax].bind(px, tvm.thread_axis("pipeline"))
    return s
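A usage sketch in the same spirit as the softmax tests elsewhere on this page (hypothetical shape; again only lowering is shown because of the "pipeline" binding):

def schedule_softmax_usage_sketch():
    A = tvm.placeholder((32, 10), name='A')
    B = topi.nn.softmax(A)
    s = schedule_softmax([B])
    return tvm.lower(s, [A, B], simple_mode=True)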
Example #17
def dump_graph_lib(target_dir):
    dim = 4
    A = tvm.placeholder((dim,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    sched = tvm.create_schedule(B.op)

    node0 = {"op": "null", "name": "x", "inputs": []}
    node1 = {"op": "tvm_op", "name": "add",
             "inputs": [[0, 0, 0]],
             "attrs": {"func_name": "myadd",
                       "flatten_data": "1",
                       "num_inputs" : "1",
                    "num_outputs" : "1"}}
    nodes = [node0, node1]
    arg_nodes = [0]
    node_row_ptr = [0, 1, 2]
    outputs = [[1, 0, 0]]
    shape = (4,)
    attrs = {
        "shape" : ["list_shape", [shape, shape]],
        "dltype" : ["list_str", ["float32", "float32"]],
        "storage_id" : ["list_int", [0, 1]],
    }
    graph = {"nodes": nodes,
             "arg_nodes": arg_nodes,
             "node_row_ptr": node_row_ptr,
             "heads": outputs,
             "attrs": attrs}

    graph = json.dumps(graph)
    mlib = tvm.build(sched, [A, B], "llvm", name="myadd")

    mlib.export_library(os.path.join(target_dir, "graph_addone_lib.so"))
    with open(os.path.join(target_dir, "graph_addone.json"), "w") as fo:
        fo.write(graph)
Example #18
def test_conv_tiling():
    HSTR = WSTR = 1
    in_channel = 128
    kernel_height = kernel_width = 3
    out_channel = 64
    batch_size = 1
    in_height = in_width = 64
    out_height = out_width = in_height - kernel_height + 1
    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
        out_channel), name='kernel')
    ic = tvm.reduce_axis((0, in_channel), name='ic')
    kh = tvm.reduce_axis((0, kernel_height), name='kh')
    kw = tvm.reduce_axis((0, kernel_width), name='kw')
    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
                                                     kernel[kh, kw, ic, oc],
                                                     axis=[ic, kh, kw]),
                       name="conv2d")
    s = tvm.create_schedule(conv.op)

    n, oc, oh, ow = conv.op.axis
    oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.LoopPartition(stmt, True)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
Example #19
def verify_clip(N, a_min, a_max, dtype):
    A = tvm.placeholder((N, N), dtype=dtype, name='A')
    B = topi.clip(A, a_min, a_max)
    s = tvm.create_schedule([B.op])

    # use memoize to pickle the test data for next time use
    @memoize("topi.tests.test_topi_clip")
    def get_ref_data():
        a_np = np.random.uniform(a_min*2, a_max*2, size=(N, N)).astype(dtype)
        b_np = np.clip(a_np, a_min, a_max)
        return a_np, b_np
    a_np, b_np = get_ref_data()

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_injective(B)

        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
        f = tvm.build(s, [A, B], device, name="clip")
        f(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)

    for device in get_all_backend():
        check_device(device)
Example #20
def intrin_vadd(n, cache_read=False, cache_write=False):
    scope_ubuf = 'local'
    dtype = 'float32'
    x = tvm.placeholder((n,), dtype=dtype, name='vx')
    y = tvm.placeholder((n,), dtype=dtype, name='vy')
    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
    s = tvm.create_schedule(z.op)

    def create_buffer(t):
        return tvm.decl_buffer(t.shape, t.dtype,
                               name='W'+t.name,
                               scope=scope_ubuf,
                               offset_factor=16)

    binds = {}
    if cache_read:
        binds[x] = create_buffer(x)
        binds[y] = create_buffer(y)
    if cache_write:
        binds[z] = create_buffer(z)

    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        ib.emit(tvm.call_extern(outs[0].dtype, 'vadd', ins[0].access_ptr("r"), ins[1].access_ptr('r'), outs[0].access_ptr('wr')))
        return ib.get()

    with tvm.build_config(offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
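A sketch of how such a tensor intrinsic is typically consumed through tensorize; the outer compute and the factor of 16 are illustrative assumptions:

def tensorize_vadd_sketch(m=64, factor=16):
    x = tvm.placeholder((m,), name='x')
    y = tvm.placeholder((m,), name='y')
    z = tvm.compute((m,), lambda i: x[i] + y[i], name='z')
    s = tvm.create_schedule(z.op)
    xo, xi = s[z].split(z.op.axis[0], factor=factor)
    # replace the inner loop with the declared 'vadd' intrinsic
    s[z].tensorize(xi, intrin_vadd(factor))
    return tvm.lower(s, [x, y, z], simple_mode=True)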
Example #21
def test_llvm_persist_parallel():
    n = 128
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B')
    C = tvm.compute(A.shape, lambda *i: tvm.sqrt(B(*i)) * 2 + 2, name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=8)
    xo1, xo2 = s[C].split(xo, nparts=1)
    s[B].compute_at(s[C], xo1)
    s[B].parallel(s[B].op.axis[0])
    s[B].pragma(s[B].op.axis[0], "parallel_barrier_when_finish")
    s[C].parallel(xi)
    s[C].pragma(xo1, "parallel_launch_point")
    s[C].pragma(xi, "parallel_stride_pattern")

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # BUILD and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(),
                                   np.sqrt(a.asnumpy() + 1) * 2 + 2,
                                   rtol=1e-5)

    check_llvm()
Example #22
def test_multiple_func():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    s[C].parallel(xo)
    s[C].vectorize(xi)
    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build two functions
        f2 = tvm.lower(s, [A, B, C], name="fadd1")
        f1 = tvm.lower(s, [A, B, C], name="fadd2")
        m = tvm.build([f1, f2], "llvm")
        fadd1 = m['fadd1']
        fadd2 = m['fadd2']
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd1(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())
        fadd2(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())
    check_llvm()
Example #23
def test_dynamic_tensor():
    dtype = 'float32'
    stype = 'csr'
    target = 'llvm'
    ctx = tvm.context(target, 0)
    nr, nc, n = tvm.var('nr'), tvm.var('nc'), tvm.var('n')
    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype)
    assert(A.stype == 'csr')
    C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter')
    s = tvm.create_schedule(C.op)
    _nr, _nc = 3, 5
    a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.)
    a = tvmsp.array(a, ctx)
    assert a.data.dtype == a.dtype
    Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr'])
    Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data')
    Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices')
    binds = {A.data: Ab.data, A.indices: Ab.indices}
    f = tvm.build(s, [nr, A.data, C], target, binds=binds)
    c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx)
    c.data = tvm.nd.empty(a.data.shape, dtype)
    c.indices = a.indices
    c.indptr = a.indptr
    f(a.data.shape[0], a.data, c.data)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
Example #24
def test_sort_np():
    dshape = (1, 2, 3, 4, 5, 6)
    axis = 4
    reduced_shape = (1, 2, 3, 4, 6)
    is_descend = False
    data = tvm.placeholder(dshape, name='data')
    sort_num = tvm.placeholder(reduced_shape, name="sort_num", dtype="int32")
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0],
                         ins[1], outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)

    np_data = np.random.uniform(size=dshape)
    np_out = np.argsort(np_data, axis=axis)
    sort_num_input = np.full(reduced_shape, dshape[axis])
    a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
Example #25
def test_add_pipeline():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, n/2) as i:
            ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    C = tvm.extern(A.shape, [A], extern_generator, name='C')
    s = tvm.create_schedule(C.op)
    print(tvm.lower(s, [A, C], simple_mode=True))

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        np.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + 1)
    check_llvm()
Example #26
def test_schedule_create():
    m = tvm.var('m')
    n = tvm.var('n')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.placeholder((n, l), name='B')
    AA = tvm.compute((m, l), lambda i, j: A[i, j])
    T = tvm.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k))
    s = tvm.create_schedule(T.op)
    s[AA].set_scope("shared")
    xo, xi = s[T].split(T.op.axis[0], factor=10)
    xi1, xi2 = s[T].split(xi, factor=2)
    s[AA].compute_at(s[T], xi1)
    xo, xi = s[AA].split(AA.op.axis[0], factor=10)
    s[T].reorder(xi2, xi1)
    assert T.op.axis[1] in s[T].leaf_iter_vars

    # save load json
    json_str = tvm.save_json(s)
    s_loaded = tvm.load_json(json_str)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))

    # pickle unpickle
    dump = pkl.dumps(s)
    s_loaded = pkl.loads(dump)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))
Example #27
def test_sort():
    n = 2
    l = 5
    m = 3
    data = tvm.placeholder((n, l, m), name='data')
    sort_num = tvm.placeholder((n, m), name="sort_num", dtype="int32")
    axis = 1
    is_descend = True
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0],
                         ins[1], outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")
    input = [[[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5], [1.5, 0, 0]],
             [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]]
    sort_num_input = [[1, 2, 3], [4, 5, 5]]
    sorted_index = [[[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
                    [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]]]

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)
    a = tvm.nd.array(np.array(input).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
Example #28
def test_lstm_cell_inline():
    num_step = 128
    num_input = 256
    num_hidden = 1152
    batch_size = 4
    # Global transition matrix
    X = tvm.placeholder((num_step - 1, batch_size, num_input), name="X")
    Wi2h = tvm.placeholder((4, num_hidden, num_input), name="Wi2h")
    Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h")
    # h: output hidden state, c: cell state.
    s_state_h = tvm.placeholder((num_step, batch_size, num_hidden))
    s_state_c = tvm.placeholder((num_step, batch_size, num_hidden))
    s_init_c = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_c")
    s_init_h = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_h")
    # LSTM transition
    k = tvm.reduce_axis((0, num_input), name="ki2h")
    s_i2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k),
        name="s_i2h")
    k = tvm.reduce_axis((0, num_hidden), name="ki2h")
    s_h2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
        name="s_h2h")
    # Gate rules
    gates = tvm.compute(s_i2h.shape, lambda *i:
                        s_i2h(*i) + s_h2h(*i), name="gates")
    gshape = (num_step, batch_size, num_hidden)
    in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 0, i, j]), name="in_gate")
    in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, 1, i, j]), name="in_transform")
    forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 2, i, j]), name="forget_gate")
    out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 3, i, j]), name="out_gate")
    next_c = tvm.compute(gshape,
                         lambda t, i, j:
                         forget_gate[t, i, j] * s_state_c[t - 1, i, j] +
                         in_gate[t, i, j] * in_transform[t, i, j], name="next_c")
    next_h = tvm.compute(gshape,
                         lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h")
    update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c")
    update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h")
    # schedule
    scan_h, scan_c = tvm.scan(
        [s_init_h, s_init_c],
        [update_h, update_c],
        [s_state_h, s_state_c],
        inputs=[X],
        name="lstm_scan")
    # schedule
    s = tvm.create_schedule(scan_h.op)
    # Inline gate computations
    s[gates].compute_inline()
    s[in_gate].compute_inline()
    s[in_transform].compute_inline()
    s[forget_gate].compute_inline()
    s[out_gate].compute_inline()
    # verify we can lower correctly
    tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])
Example #29
def test_pack_buffer_intermediate():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute((n,), lambda i: A[i] + 1, name="B")
    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline."""
        return tvm.call_packed("my_extern_array_func2", ins[0], outs[0])

    C = tvm.extern(B.shape, [B], extern_generator, name='C')
    s = tvm.create_schedule(C.op)

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)

        @tvm.register_func
        def my_extern_array_func2(aa, bb):
            assert aa.shape == a.shape
            tvm.testing.assert_allclose(
                aa.asnumpy(), a.asnumpy() + 1)
            aa.copyto(bb)

        f(a, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + 1)

    check_target("llvm")
Example #30
def make_reduce_sum_axis_zero(shape, tgt, tgt_host, func_name, dtype="float32"):
    A = tvm.placeholder(shape, dtype=dtype, name="A")
    C = topi.sum(A, axis=0, keepdims=False)

    s = tvm.create_schedule(C.op)
    f = tvm.build(s, [A, C], tgt, target_host=tgt_host, name=func_name)
    return f

Example #31
def visit_call(node, ret):
    ret.append(node)
    print(type(node), " dtype=", node.dtype)
    print(type(node), " name=", node.name)
    for arg in node.args:
        visit(arg, ret)
    print(type(node), " call_type=", node.call_type)
    print(type(node), " func=", node.func)
    print(type(node), " value_index=", node.value_index)


def visit_let(node, ret):
    ret.append(node)
    visit(node.var, ret)
    visit(node.value, ret)
    visit(node.body, ret)


if __name__ == "__main__":
    from auto_schedule.examples import FUNC_TABLE
    func = FUNC_TABLE["conv3d_channel_batch"].func
    args = FUNC_TABLE["conv3d_channel_batch"].args
    op, bufs = func(*args)
    s = tvm.create_schedule(op)
    stmt = tvm.lower(s, bufs, simple_mode=True)
    print(stmt)
    ret = []
    visit(stmt, ret)
Example #32
def schedule_dense(cfg, outs):
    """Schedule for dense operator.

    Parameters
    ----------
    cfg: ConfigEntity
        The config entity for this template
    outs: Array of Tensor
        The computation graph description of dense
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for dense.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if op.tag == 'dense':
            vec_size = [1, 2, 4, 8, 16]
            max_unroll = 32

            dense = op.output(0)
            output = outs[0]

            y, x = s[output].op.axis
            c = s[dense].op.reduce_axis[0]

            ##### space definition begin #####
            cfg.define_split('tile_y', y, num_outputs=3)
            cfg.define_split('tile_x', x, num_outputs=3)
            cfg.define_split('c_unroll', c, num_outputs=2, max_factor=64)

            # fallback support
            if cfg.is_fallback:
                ref_log = autotvm.tophub.load_reference_log(
                    'mali', 'rk3399', 'dense', 'direct')
                cfg.fallback_with_reference_log(ref_log)
            ##### space definition end #####

            if dense.op in s.outputs:
                dense = s.cache_write(output, 'local')

            by, ty, yi = cfg['tile_y'].apply(s, output, y)
            bx, tx, xi = cfg['tile_x'].apply(s, output, x)

            s[output].bind(by, tvm.thread_axis('blockIdx.y'))
            s[output].bind(bx, tvm.thread_axis('blockIdx.x'))
            s[output].bind(ty, tvm.thread_axis('threadIdx.y'))
            s[output].bind(tx, tvm.thread_axis('threadIdx.x'))

            if cfg['tile_y'].size[-1] < max_unroll:
                s[output].unroll(yi)
            if cfg['tile_x'].size[-1] in vec_size:
                s[output].vectorize(xi)
            s[dense].compute_at(s[output], tx)

            k = s[dense].op.reduce_axis[0]
            y, x = s[dense].op.axis
            k, k_unroll = cfg['c_unroll'].apply(s, dense, k)
            s[dense].reorder(k, k_unroll, y, x)
            s[dense].unroll(k_unroll)
            if cfg['tile_y'].size[-1] < max_unroll:
                s[dense].unroll(y)
            if cfg['tile_x'].size[-1] in vec_size:
                s[dense].vectorize(x)

    traverse_inline(s, outs[0].op, _callback)
    return s
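A rough usage sketch, assuming this function is registered as the Mali dense schedule and that AutoTVM's fallback configuration is acceptable; the target string and shapes are illustrative:

def mali_dense_usage_sketch():
    data = tvm.placeholder((16, 1024), name='data')
    weight = tvm.placeholder((1000, 1024), name='weight')
    with tvm.target.mali('rk3399'):
        dense = topi.nn.dense(data, weight)
        s = topi.generic.schedule_dense([dense])
    return tvm.lower(s, [data, weight, dense], simple_mode=True)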
Example #33
File: peak.py Project: zheng-da/tvm
def measure_bandwidth_sum(total_item, item_per_thread, stride,
                          base_type, bits, lanes,
                          target, target_host, remote, ctx, n_times):
    """ measure memory bandwidth of gpu by product reduction for a given type

    The IR for measurement is

    for each thread
        for i in 1..num_per_thread:
            y[global_id] = y[global_id] * x[base + i * stride]

    Parameters
    ----------
    total_item: int
        number of elements in input array
    item_per_thread: int
        number of elements each thread accumulates
    stride: int
        stride in memory access
    base_type: str
        can be "int", "float"
    bits: int
        can be 16, 32
    lanes: int
       lane of the vector type, can be 1, 2, 4, 8, 16
    target: :any:`tvm.target.Target`
        the target and option of the compilation.
    target_host : str or :any:`tvm.target.Target`
        host compilation target
    ctx: TVMcontext
        the context of array
    remote: tvm.contrib.rpc.RPCSession
        remote rpc session
    n_times: int
        number of runs for taking mean

    Returns
    -------
    GBPS: float
         gigabyte per second
    """
    n, m = total_item, item_per_thread
    n //= lanes

    base_type = str(base_type) + str(bits)
    dtype = base_type if lanes == 1 else base_type + "x" + str(lanes)

    k = tvm.reduce_axis((0, m), name="k")

    x = tvm.placeholder((n,), dtype=dtype, name="x")
    op = tvm.comm_reducer(lambda x, y: x*y, lambda t: tvm.const(1, dtype=t), name="sum")
    y = tvm.compute((n // m,),
                    lambda i: op(x[i // stride * stride * m + i % stride + k * stride], axis=k))
    s = tvm.create_schedule(y.op)

    yo, yi = s[y].split(y.op.axis[0], target.max_num_threads)
    s[y].bind(yo, tvm.thread_axis("blockIdx.x"))
    s[y].bind(yi, tvm.thread_axis("threadIdx.x"))
    s[y].unroll(k)

    try:
        func = tvm.build(s, [x, y], target, target_host=target_host)

        x = tvm.nd.empty((n,), dtype=dtype, ctx=ctx)
        y = tvm.nd.empty((n // m,), dtype=dtype, ctx=ctx)

        func = _convert_to_remote(func, remote)
        time_f = func.time_evaluator(func.entry_name, ctx, number=n_times)
        time = time_f(x, y).mean
    except tvm._ffi.base.TVMError:
        # build error (occur when device does not support half)
        return -1

    return 1.0 * (total_item * bits / 8) / 1e9 / time
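A hypothetical call of the helper above; the RPC session is a local stand-in and all sizes are placeholders (a real run in peak.py goes through an actual remote device):

def measure_bandwidth_sum_usage_sketch():
    from tvm.contrib import rpc   # the RPCSession type referenced in the docstring above
    target = tvm.target.create("cuda")
    remote = rpc.LocalSession()   # stand-in for a real remote session
    ctx = remote.gpu(0)
    gbps = measure_bandwidth_sum(total_item=1 << 24, item_per_thread=16, stride=1,
                                 base_type="float", bits=32, lanes=1,
                                 target=target, target_host="llvm",
                                 remote=remote, ctx=ctx, n_times=3)
    print("measured bandwidth: %.2f GB/s" % gbps)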
Example #34
def train_op_schedule_cpu_general_dx(entities,
                                     epoch,
                                     batch_size,
                                     path,
                                     loop_num=100,
                                     loop_size=16,
                                     stack_size=20,
                                     logfile="temp.log",
                                     device="cuda:0"):
    dim = 5
    timeout = 15.0
    num_sample = len(entities)
    device = torch.device(device)
    model = OpScheduleCPUd5(3, 128, device)
    # load or initialize parameter file
    if os.path.exists(path) and os.path.isfile(path):
        state_dict = torch.load(path)
        model.load_state_dict(state_dict)
    else:
        torch.save(model.state_dict(), path)
    model.to(device)
    optimizer = torch.optim.Adadelta(model.parameters(), lr=LR)
    model.train()
    # maintain a dataset for each function
    datasets = [[] for i in range(num_sample)]

    train_beg_time = time.time()
    with open(logfile, "a") as f:
        f.write("New log\ntime: {}".format(train_beg_time))
    perf_before = dict()
    perf_before_dump = False
    model.train()
    print("Scheduling begins...parameters in path {}\n    logs to{}".format(
        path, logfile))
    for i in range(epoch):
        optimizer.zero_grad()
        for batch in range(batch_size):
            for p in range(num_sample):
                func_name = entities[p].func_name
                func = FUNC_TABLE[func_name].func
                args = entities[p].args
                ops, bufs = func(*args)
                s = tvm.create_schedule(ops)
                # get the performance before scheduling
                # only run one time
                entity_key = "{}:{}".format(func_name, args)
                if entity_key not in perf_before:
                    pre_cost = serial_evaluate(s,
                                               bufs,
                                               "llvm",
                                               np.random.randint(0, MAX_CPU),
                                               10,
                                               timeout=timeout)
                    perf_before[entity_key] = pre_cost
                if not isinstance(ops, (list, tuple)):
                    ops = [ops]
                bfs_order, down_graph = graph_analysis(ops)
                group_points = []
                for op in bfs_order:
                    if not isinstance(op, tvm.tensor.ComputeOp):
                        continue
                    if able_inline(op, down_graph):
                        s[op].compute_inline()
                    else:
                        group_points.append(op)
                if len(group_points) > 1:
                    raise RuntimeError("Not support more than one compute")
                for j, point in enumerate(group_points):
                    y_dict, y_diary = op_schedule_cpu_general_dx(
                        dim,
                        s,
                        point,
                        model,
                        random=np.random.random() < 0.2,
                        sampling=True)
                    post_cost = serial_evaluate(s,
                                                bufs,
                                                "llvm",
                                                np.random.randint(0, MAX_CPU),
                                                10,
                                                timeout=timeout)
                    data = dict()
                    for name, value in y_dict.items():
                        if isinstance(value, list):
                            tmp = []
                            for v in value:
                                tmp.append(v.detach())
                            data[name] = (
                                tmp, y_diary[name]
                            )  # the data record schedule decisions
                        else:
                            data[name] = (value.detach(), y_diary[name])
                    # record (point No., sch data, time cost), once per scheduled point
                    datasets[p].append((j, data, post_cost))
        # record performance before scheduling
        # only run one time
        if not perf_before_dump:
            with open(logfile, "a") as f:
                logs = "performance before scheduling:\n"
                f.write(logs)
                for key, perf in perf_before.items():
                    logs = "{}: {}\n".format(key, perf)
                    f.write(logs)
                f.write("\n")
            perf_before_dump = True
        # control the size of dataset and record best cases
        cur_time = time.time()
        with open(logfile, "a") as f:
            for j in range(num_sample):
                datasets[j] = heapq.nsmallest(stack_size,
                                              datasets[j],
                                              key=lambda x: x[-1])
                entity_key = "{}:{}".format(entities[j].func_name,
                                            entities[j].args)
                duration = cur_time - train_beg_time
                logs = "epoch {}/{}| {} best perf {}| [{}s]\n".format(
                    i + 1, epoch, entity_key, datasets[j][0][-1], duration)
                f.write(logs)
                logs = "schedule {}\n".format(entity_key)
                for name, val in datasets[j][0][1].items(
                ):  # find the diary, this is ugly now, change later
                    logs = logs + "{}: {}\n".format(name, val[1])
                logs = logs + "\n"
                f.write(logs)
        # train the parameters
        for r in range(loop_num):
            acc_loss = 0.0
            for inner in range(loop_size):
                for q in range(num_sample):
                    func_name = entities[q].func_name
                    func = FUNC_TABLE[func_name].func
                    args = entities[q].args
                    for (point_num, data, time_cost) in datasets[q][:1]:
                        ops, bufs = func(*args)
                        s = tvm.create_schedule(ops)
                        if not isinstance(ops, (list, tuple)):
                            ops = [ops]
                        bfs_order, down_graph = graph_analysis(ops)
                        group_points = []
                        for op in bfs_order:
                            if not isinstance(op, tvm.tensor.ComputeOp):
                                continue
                            if able_inline(op, down_graph):
                                s[op].compute_inline()
                            else:
                                group_points.append(op)
                        y_dict, _ = op_schedule_cpu_general_dx(
                            dim,
                            s,
                            group_points[point_num],
                            model,
                            random=False,
                            sampling=False)
                        # spatial loss
                        spatial_loss = 0.0
                        for j in range(dim):
                            spatial_loss = spatial_loss + torch.nn.functional\
                                .binary_cross_entropy(y_dict["spatial"][j], data["spatial"][0][j])
                        # reduce_loss
                        reduce_loss = 0.0
                        for j in range(dim):
                            reduce_loss = reduce_loss + torch.nn.functional\
                                .binary_cross_entropy(y_dict["reduce"][j], data["reduce"][0][j])
                        # parallel_loss
                        parallel_loss = torch.nn.functional\
                            .binary_cross_entropy(y_dict["parallel"], data["parallel"][0])
                        # reorder_one loss
                        reorder_one_loss = torch.nn.functional\
                            .binary_cross_entropy(y_dict["reorder_one"], data["reorder_one"][0])
                        # reorder_two loss
                        reorder_two_loss = torch.nn.functional\
                            .binary_cross_entropy(y_dict["reorder_two"], data["reorder_two"][0])
                        # reorder_three loss
                        reorder_three_loss = torch.nn.functional\
                            .binary_cross_entropy(y_dict["reorder_three"], data["reorder_three"][0])
                        # accumulate loss
                        acc_loss = acc_loss + spatial_loss + reduce_loss + parallel_loss + reorder_one_loss \
                                   + reorder_two_loss + reorder_three_loss
            acc_loss.backward()
            if r % 10 == 0:
                torch.save(model.state_dict(), path)
                logs = "epoch={}, r={}, loss={}\n".format(
                    i + 1, r, float(acc_loss.detach()))
                with open(logfile, "a") as f:
                    f.write(logs)
            optimizer.step()
        with open(logfile, "a") as f:
            f.write("\n")
    print("All done.")
Example #35
def schedule_conv2d_hwcn(outs):
    """Schedule for conv2d_hwcn and any element-wise operations.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of conv2d_hwcn in the format
        of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for conv2d_hwcn.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    sch = tvm.create_schedule([x.op for x in outs])

    def schedule(Apad, W, B):
        """Schedule conv2d_hwcn"""
        sch[Apad].compute_inline()
        AA = sch.cache_read(Apad, "shared", [B])
        WW = sch.cache_read(W, "shared", [B])
        AL = sch.cache_read(AA, "local", [B])
        WL = sch.cache_read(WW, "local", [B])

        if B.op in sch.outputs:
            Out = B
            BL = sch.cache_write(Out, "local")
        else:
            Out = sch.outputs[0].output(0)
            sch[B].set_scope("local")
            BL = B

        tile = 8
        num_thread = 8
        block_factor = tile * num_thread
        step = 8
        vthread = 2

        block_x = tvm.thread_axis("blockIdx.x")
        block_y = tvm.thread_axis("blockIdx.y")
        block_z = tvm.thread_axis("blockIdx.z")
        thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
        thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y")
        thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx")
        thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy")

        hi, wi, fi, ni = sch[Out].op.axis
        bz = sch[Out].fuse(hi, wi)
        by, fi = sch[Out].split(fi, factor=block_factor)
        bx, ni = sch[Out].split(ni, factor=block_factor)
        tyz, fi = sch[Out].split(fi, nparts=vthread)
        txz, ni = sch[Out].split(ni, nparts=vthread)
        ty, fi = sch[Out].split(fi, nparts=num_thread)
        tx, ni = sch[Out].split(ni, nparts=num_thread)
        sch[Out].reorder(bz, by, bx, tyz, txz, ty, tx, fi, ni)
        sch[Out].bind(bz, block_z)
        sch[Out].bind(by, block_y)
        sch[Out].bind(bx, block_x)
        sch[Out].bind(tyz, thread_yz)
        sch[Out].bind(txz, thread_xz)
        sch[Out].bind(ty, thread_y)
        sch[Out].bind(tx, thread_x)

        # Schedule BL local write
        sch[BL].compute_at(sch[Out], tx)
        yi, xi, fi, ni = sch[BL].op.axis
        ry, rx, rc = sch[BL].op.reduce_axis
        rco, rci = sch[BL].split(rc, factor=step)
        sch[BL].reorder(rco, ry, rx, rci, fi, ni)
        fuse_index = sch[BL].fuse(ry, rx)
        fuse_index = sch[BL].fuse(fuse_index, rco)
        rx = fuse_index

        sch[AA].compute_at(sch[BL], rx)
        sch[WW].compute_at(sch[BL], rx)
        sch[AL].compute_at(sch[BL], rci)
        sch[WL].compute_at(sch[BL], rci)
        # Schedule for A's shared memory load
        yi, xi, ci, ni = sch[AA].op.axis
        ty, ci = sch[AA].split(ci, nparts=num_thread)
        tx, ni = sch[AA].split(ni, nparts=num_thread)
        _, ni = sch[AA].split(ni, factor=4)
        sch[AA].reorder(ty, tx, yi, xi, ci, ni)
        sch[AA].bind(ty, thread_y)
        sch[AA].bind(tx, thread_x)
        sch[AA].vectorize(ni)
        # Schedule for W's shared memory load
        yi, xi, ci, fi = sch[WW].op.axis
        ty, ci = sch[WW].split(ci, nparts=num_thread)
        tx, fi = sch[WW].split(fi, nparts=num_thread)
        _, fi = sch[WW].split(fi, factor=4)
        sch[WW].reorder(ty, tx, yi, xi, ci, fi)
        sch[WW].bind(ty, thread_y)
        sch[WW].bind(tx, thread_x)
        sch[WW].vectorize(fi)

    def traverse(operator):
        """Traverse operators from computation graph"""
        if operator.tag == 'ewise' or operator.tag == 'scale_shift':
            if operator not in sch.outputs:
                sch[operator].compute_inline()
            for tensor in operator.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)
        elif operator.tag == 'conv2d_hwcn':
            Apad = operator.input_tensors[0]
            W = operator.input_tensors[1]
            B = operator.output(0)
            schedule(Apad, W, B)
        else:
            raise RuntimeError("Unsupported operator: %s" % operator.tag)

    traverse(outs[0].op)
    return sch
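A usage sketch with made-up HWCN shapes; the schedule binds CUDA-style thread axes, so only lowering is shown here:

def conv2d_hwcn_usage_sketch():
    A = tvm.placeholder((14, 14, 64, 64), name='A')   # H, W, C, N
    W = tvm.placeholder((3, 3, 64, 64), name='W')     # KH, KW, C, num_filter
    B = topi.nn.conv2d_hwcn(A, W, stride=1, padding=1)
    sch = schedule_conv2d_hwcn([B])
    return tvm.lower(sch, [A, W, B], simple_mode=True)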
Example #36
def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
    assert N == 1, "Only consider batch_size = 1 in this template"

    data = tvm.placeholder((N, CI, H, W), name='data')
    kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel')
    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, 'float32')
    s = tvm.create_schedule([conv.op])

    # inline padding
    pad_data = s[conv].op.input_tensors[0]
    s[pad_data].compute_inline()
    data, raw_data = pad_data, data

    output = conv
    OL = s.cache_write(conv, 'local')

    # create cache stage
    AA = s.cache_read(data, 'shared', [OL])
    WW = s.cache_read(kernel, 'shared', [OL])
    AL = s.cache_read(AA, 'local', [OL])
    WL = s.cache_read(WW, 'local', [OL])

    # tile and bind spatial axes
    n, f, y, x = s[output].op.axis
    cfg = autotvm.get_config()
    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
    kernel_scope = n  # this is the scope to attach global config inside this kernel

    s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
    s[OL].compute_at(s[output], tx)

    # tile and bind reduction axes
    n, f, y, x = s[OL].op.axis
    rc, ry, rx = s[OL].op.reduce_axis
    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3)
    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3)
    rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
    ryo, rym, ryi = cfg['tile_ry'].apply(s, OL, ry)
    rxo, rxm, rxi = cfg['tile_rx'].apply(s, OL, rx)
    s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x)

    s[AA].compute_at(s[OL], rxo)
    s[WW].compute_at(s[OL], rxo)
    s[AL].compute_at(s[OL], rxm)
    s[WL].compute_at(s[OL], rxm)

    # cooperative fetching
    for load in [AA, WW]:
        n, f, y, x = s[load].op.axis
        fused = s[load].fuse(n, f, y, x)
        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

    # tune unroll
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    cfg.define_knob("unroll_explicit", [0, 1])
    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)

    return s, [raw_data, kernel, conv]
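######################################################################
# A hedged usage sketch (not part of the original example) of how a template
# like the one above is typically tuned with autotvm.  It assumes the function
# is registered as a template (e.g. decorated with @autotvm.template), that a
# CUDA-capable TVM build and GPU are available, and that the workload sizes,
# trial count and log-file name below are purely illustrative.
from tvm import autotvm

task = autotvm.task.create(conv2d_no_batching,
                           args=(1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)),
                           target='cuda')
measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4))
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('conv2d_tuning.log')])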
Exemplo n.º 37
0
def gemm_int8(n, m, l):
    cfg = autotvm.get_config()

    A = tvm.placeholder((n, l), name='A', dtype='int8')
    B = tvm.placeholder((m, l), name='B', dtype='int8')

    cfg.define_split('tile_y', cfg.axis(n), num_outputs=3)
    cfg.define_split('tile_x', cfg.axis(m), num_outputs=3)

    y_chunk = cfg['tile_y'].size[0]
    y_block = functools.reduce(operator.mul, cfg['tile_y'].size[1:])
    x_chunk = cfg['tile_x'].size[0]
    x_block = functools.reduce(operator.mul, cfg['tile_x'].size[1:])
    k_chunk = l // 16
    k_block = 16

    A_packed = tvm.compute(
        (y_chunk, k_chunk, y_block, k_block),
        lambda yo, ko, yi, ki: A[yo * y_block + yi, ko * k_block + ki],
        name='A_packed')
    B_packed = tvm.compute(
        (x_chunk, k_chunk, x_block, k_block),
        lambda xo, ko, xi, ki: B[xo * x_block + xi, ko * k_block + ki],
        name='B_packed')

    ko = tvm.reduce_axis((0, k_chunk))
    ki = tvm.reduce_axis((0, k_block))

    C = tvm.compute(
        (n, m),
        lambda i, j: tvm.sum(A_packed[
            i // y_block, ko, i % y_block, ki].astype('int32') * B_packed[
                j // x_block, ko, j % x_block, ki].astype('int32'),
                             axis=[ko, ki]),
        name='C')

    s = tvm.create_schedule([t.op for t in [A_packed, B_packed, C]])

    block_x = tvm.thread_axis('blockIdx.x')
    block_y = tvm.thread_axis('blockIdx.y')
    thread_x = tvm.thread_axis('threadIdx.x')
    thread_y = tvm.thread_axis('threadIdx.y')

    s[A_packed].compute_inline()
    s[B_packed].compute_inline()

    AA = s.cache_read(A_packed, 'shared', [C])
    BB = s.cache_read(B_packed, 'shared', [C])
    AL = s.cache_read(AA, 'local', [C])
    BL = s.cache_read(BB, 'local', [C])
    CC = s.cache_write(C, 'local')

    ko, ki = CC.op.reduce_axis
    cfg.define_split('tile_k', cfg.axis(ko), num_outputs=2)
    ko, kmo = cfg['tile_k'].apply(s, CC, ko)
    kmi, ki = s[CC].split(ki, factor=4)

    y, x = CC.op.axis
    s[CC].reorder(ko, kmo, kmi, y, x, ki)
    km = s[CC].fuse(kmo, kmi)

    s[CC].tensorize(ki, dot)

    y, x = C.op.axis
    by, tyz, ty = cfg['tile_y'].apply(s, C, y)
    bx, txz, tx = cfg['tile_x'].apply(s, C, x)

    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].bind(tyz, tvm.thread_axis('vthread'))
    s[C].bind(txz, tvm.thread_axis('vthread'))
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(by, bx, tyz, txz, ty, tx)

    s[CC].compute_at(s[C], tx)

    yo, xo = CC.op.axis

    cfg.define_knob('local_double_buffer', [0, 1])
    for load in [AL, BL]:
        s[load].compute_at(s[CC], km)
        ki = load.op.axis[3]
        s[load].vectorize(ki)

        if cfg['local_double_buffer'].val:
            s[load].double_buffer()

    cfg.define_knob('shared_double_buffer', [0, 1])
    for load in [AA, BB]:
        s[load].compute_at(s[CC], ko)

        fused = s[load].fuse(load.op.axis[2], load.op.axis[3])
        ty, tx = s[load].split(fused, nparts=cfg['tile_y'].size[2])
        tx, xi = s[load].split(tx, nparts=cfg['tile_x'].size[2])
        _, xi = s[load].split(xi, factor=16)

        s[load].bind(ty, thread_y)
        s[load].bind(tx, thread_x)
        s[load].vectorize(xi)

        if cfg['shared_double_buffer'].val:
            s[load].double_buffer()

    cfg.define_knob('auto_unroll_max_step', [512, 1500])
    s[C].pragma(by, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[C].pragma(by, 'unroll_explicit', False)

    cfg.add_flop(n * m * l * 2)
    return s, [A, B, C]
Exemplo n.º 38
0
#
#   Now we go back to the local machine, which has a full TVM installed
#   (with LLVM).
#
# Here we will declare a simple kernel on the local machine:

import numpy as np

import tvm
from tvm import rpc
from tvm.contrib import util

n = tvm.convert(1024)
A = tvm.placeholder((n, ), name='A')
B = tvm.compute((n, ), lambda i: A[i] + 1.0, name='B')
s = tvm.create_schedule(B.op)

######################################################################
# Then we cross compile the kernel.
# The target should be 'llvm -target=armv7l-linux-gnueabihf' for
# Raspberry Pi 3B, but we use 'llvm' here to make this tutorial runnable
# on our webpage building server. See the detailed note in the following block.

local_demo = True

if local_demo:
    target = 'llvm'
else:
    target = 'llvm -target=armv7l-linux-gnueabihf'

func = tvm.build(s, [A, B], target=target, name='add_one')
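######################################################################
# The remaining steps of the tutorial are not shown in this excerpt; the
# following is a hedged sketch of how it usually continues.  The RPC address
# and port are placeholders, and a local RPC session is used in the demo case
# so the sketch stays runnable without a remote board.

# Save the compiled library to a temporary folder.
temp = util.tempdir()
path = temp.relpath('lib.tar')
func.export_library(path)

# Connect to the device: a local session for the demo, or a real RPC server.
if local_demo:
    remote = rpc.LocalSession()
else:
    remote = rpc.connect('10.77.1.162', 9090)  # placeholder host and port

# Upload and load the library on the remote, then run and verify the result.
remote.upload(path)
func_remote = remote.load_module('lib.tar')
ctx = remote.cpu(0)
a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
func_remote(a, b)
np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1)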
Exemplo n.º 39
0
def schedule_depthwise_conv2d_nchw_cuda(cfg, outs):
    """Schedule for depthwise_conv2d nchw forward.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of depthwise_conv2d
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for depthwise_conv2d nchw.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if op.tag == 'depthwise_conv2d_nchw':
            pad_data = op.input_tensors[0]
            kernel = op.input_tensors[1]
            conv = op.output(0)

            ##### space definition begin #####
            n, f, y, x = s[conv].op.axis
            cfg.define_split("tile_f", f, num_outputs=4)
            cfg.define_split("tile_y", y, num_outputs=4)
            cfg.define_split("tile_x", x, num_outputs=4)
            cfg.define_knob("auto_unroll_max_step", [0, 256, 1500])

            target = tvm.target.current_target()
            if target.target_name in ['nvptx', 'rocm']:
                cfg.define_knob("unroll_explicit", [1])
            else:
                cfg.define_knob("unroll_explicit", [0, 1])

            # fallback support
            if cfg.is_fallback:
                ref_log = autotvm.tophub.load_reference_log(
                    target.target_name, target.model, 'depthwise_conv2d_nchw',
                    'direct')
                cfg.fallback_with_reference_log(ref_log)
                # TODO(lmzheng): A bug here, set unroll_explicit to False as workaround
                cfg['unroll_explicit'].val = 0
            ##### space definition end #####

            s[pad_data].compute_inline()
            if isinstance(kernel.op,
                          tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
                s[kernel].compute_inline()

            if conv.op in s.outputs:
                output = conv
                OL = s.cache_write(conv, 'local')
            else:
                output = s.outputs[0].output(0)
                s[conv].set_scope('local')
                OL = conv

            # create cache stage
            AA = s.cache_read(pad_data, 'shared', [OL])
            WW = s.cache_read(kernel, 'shared', [OL])
            AL = s.cache_read(AA, 'local', [OL])
            WL = s.cache_read(WW, 'local', [OL])

            # tile and bind spatial axes
            n, f, y, x = s[output].op.axis
            bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
            by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
            bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)

            kernel_scope, n = s[output].split(n, nparts=1)
            bf = s[output].fuse(n, bf)
            s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
            s[output].bind(by, tvm.thread_axis("blockIdx.y"))
            s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
            s[output].bind(vf, tvm.thread_axis("vthread"))
            s[output].bind(vy, tvm.thread_axis("vthread"))
            s[output].bind(vx, tvm.thread_axis("vthread"))
            s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
            s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
            s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
            s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
            s[OL].compute_at(s[output], tx)

            # cooperative fetching
            s[AA].compute_at(s[output], bx)
            s[WW].compute_at(s[output], bx)
            s[AL].compute_at(s[output], tx)
            s[WL].compute_at(s[output], tx)

            for load in [AA, WW]:
                fused = s[load].fuse(*list(s[load].op.axis))
                fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
                fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
                fused, tz = s[load].split(fused, cfg["tile_f"].size[2])
                s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
                s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
                s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

            s[output].pragma(kernel_scope, 'auto_unroll_max_step',
                             cfg['auto_unroll_max_step'].val)
            s[output].pragma(kernel_scope, 'unroll_explicit',
                             cfg['unroll_explicit'].val)

    traverse_inline(s, outs[0].op, _callback)
    return s
Exemplo n.º 40
0
def schedule_depthwise_conv2d_nhwc(outs):
    """Schedule for depthwise_conv2d nhwc forward.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of depthwise_conv2d
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for depthwise_conv2d nhwc.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _schedule(temp, Filter, DepthwiseConv2d):
        s[temp].compute_inline()
        FS = s.cache_read(Filter, "shared", [DepthwiseConv2d])
        if DepthwiseConv2d.op in s.outputs:
            Output = DepthwiseConv2d
            CL = s.cache_write(DepthwiseConv2d, "local")
        else:
            Output = outs[0].op.output(0)
            s[DepthwiseConv2d].set_scope("local")

        block_x = tvm.thread_axis("blockIdx.x")
        thread_x = tvm.thread_axis("threadIdx.x")

        b, h, w, c = s[Output].op.axis

        # num_thread here could be 728, which is larger than cuda.max_num_threads
        num_thread = tvm.ir_pass.Simplify(temp.shape[3]).value
        target = tvm.target.current_target()
        if target and (target.target_name not in ["cuda", "nvptx"]):
            num_thread = target.max_num_threads
        xoc, xic = s[Output].split(c, factor=num_thread)
        s[Output].reorder(xoc, b, h, w, xic)
        xo, yo, _, _ = s[Output].tile(h, w, x_factor=2, y_factor=2)
        fused = s[Output].fuse(yo, xo)
        fused = s[Output].fuse(fused, b)
        fused = s[Output].fuse(fused, xoc)

        s[Output].bind(fused, block_x)
        s[Output].bind(xic, thread_x)

        if DepthwiseConv2d.op in s.outputs:
            s[CL].compute_at(s[Output], xic)
        else:
            s[DepthwiseConv2d].compute_at(s[Output], xic)

        _, _, ci, fi = s[FS].op.axis
        s[FS].compute_at(s[Output], fused)
        fused = s[FS].fuse(fi, ci)
        s[FS].bind(fused, thread_x)

    scheduled_ops = []

    def traverse(OP):
        """Internal travserse function"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(OP.tag):
            if OP not in s.outputs:
                s[OP].compute_inline()
            for tensor in OP.input_tensors:
                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                    traverse(tensor.op)
        # schedule depthwise_conv2d
        if OP.tag == 'depthwise_conv2d_nhwc':
            PaddedInput = OP.input_tensors[0]
            Filter = OP.input_tensors[1]
            if isinstance(Filter.op,
                          tvm.tensor.ComputeOp) and 'dilate' in Filter.op.tag:
                s[Filter].compute_inline()
            DepthwiseConv2d = OP.output(0)
            _schedule(PaddedInput, Filter, DepthwiseConv2d)

        scheduled_ops.append(OP)

    traverse(outs[0].op)
    return s
Exemplo n.º 41
0
def conv(iw, ih, fw, fh, fi, fo, batch, dtype):
    img = tvm.placeholder((batch, fi, iw, ih), dtype=dtype, name='img')
    fil = tvm.placeholder((fi, fo, fw, fh), dtype=dtype, name='fil')

    conv = topi.nn.conv2d_nchw(img, fil, (1, 1), 'VALID')

    cfg = autotvm.template.DispatchContext.current.query(None, None)

    cfg.add_flop(iw * ih * fw * fh * fi * fo * batch * 2)

    s = tvm.create_schedule(conv.op)

    temp = conv.op.input_tensors[0]
    s[temp].compute_inline()

    shared_cache = []
    local_cache = []

    # Space definition
    for buf in conv.op.input_tensors:
        shared_cache.append(s.cache_read(buf, "shared", [conv]))
        local_cache.append(s.cache_read(shared_cache[-1], "local", [conv]))
    write_cache = s.cache_write(conv, "local")

    spatial_axes = [cfg.axis(x) for x in s[conv].op.axis]
    spatial_chs = [
        cfg.define_split("tile_" + x.name, x, num_outputs=4)
        for x in spatial_axes
    ]
    re_axes = cfg.define_reorder("re",
                                 reduce(list.__add__, spatial_chs),
                                 policy='interleave',
                                 spatial=spatial_chs,
                                 reduce=[])
    cfg.define_annotate('bind',
                        re_axes[:sum([len(ch) - 1 for ch in spatial_chs])],
                        policy='bind_gpu_virtual')

    reduce_axes = [cfg.axis(x) for x in s[write_cache].op.reduce_axis]
    reduce_chs = [
        cfg.define_split("tile_reduce_" + x.name, x, num_outputs=2)
        for x in reduce_axes
    ]
    cfg.define_annotate("cache_anchor",
                        reduce(list.__add__, reduce_chs),
                        policy='locate_cache',
                        num_anchor=2)

    # Apply on schedule
    spatial_axes = s[conv].op.axis
    spatial_chs = [
        cfg["tile_" + x.var.name].apply(s, conv, x) for x in spatial_axes
    ]
    spatial_lens = [cfg["tile_" + x.var.name].size for x in spatial_axes]

    re_axes = cfg["re"].apply(s, conv, reduce(list.__add__, spatial_chs))
    bind_axes = re_axes[:sum([len(ch) - 1 for ch in spatial_chs])]
    cfg['bind'].apply(s, conv, bind_axes)

    # Cache anchor
    s[write_cache].compute_at(s[conv], bind_axes[-1])

    local_axes = s[write_cache].op.axis
    reduce_axes = s[write_cache].op.reduce_axis
    reduce_chs = [
        cfg["tile_reduce_" + x.var.name].apply(s, write_cache, x)
        for x in reduce_axes
    ]
    s[write_cache].reorder(*(reduce(list.__add__, reduce_chs) +
                             list(local_axes)))
    cfg['cache_anchor'].apply(s,
                              write_cache,
                              reduce(list.__add__, reduce_chs),
                              source=[shared_cache, local_cache])

    re_lens = [reduce(list.__add__, spatial_lens)[x] for x in cfg["re"].perm]
    bind_lens = re_lens[:sum([len(ch) - 1 for ch in spatial_chs])]
    thread_info = []
    for ann, length in zip(cfg['bind'].anns, bind_lens):
        if 'threadIdx' in ann:
            thread_info.append((ann, length))
    thread_info.sort(key=lambda x: x[0])

    for i, cache in enumerate(shared_cache):
        axes = list(s[cache].op.axis)
        fused = s[cache].fuse(*axes)
        for name, length in reversed(thread_info):
            t, fused = s[cache].split(fused, nparts=length)
            s[cache].bind(t, tvm.thread_axis(name))

    return s, [img, fil, conv]
Exemplo n.º 42
0
import tvm
import numpy as np

# The same computation can be carried out in many different ways, and different ways can have very different performance.
# A schedule decides how the computation is carried out; it is a set of transformations applied to the loops of the computation in the program.
# A schedule is made up of a group of such transformation steps (ops).
# By default, the computation is executed serially in row-major order.
n = tvm.var('n')
m = tvm.var('m')
A = tvm.placeholder((m, n), name='A')
B = tvm.placeholder((m, n), name='B')
C = tvm.compute((m, n), lambda i, j: A[i, j] * B[i, j], name='C')

s = tvm.create_schedule([C.op])
# lower transforms the computation from its definition into a real callable function.
# With the argument `simple_mode=True`, it returns a readable C-like statement; we use it here to print the schedule result.
# print(tvm.lower(s, [A, B, C], simple_mode=True))

# A schedule consists of multiple stages; each stage represents one transformation step (op).
# Each stage provides a variety of scheduling methods.

# split
# split a given axis into two axes
A = tvm.placeholder((m, ), name='A')
B = tvm.compute((m, ), lambda i: A[i] * 2, name='B')
s = tvm.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=32)
# print(tvm.lower(s, [A, B], simple_mode=True))

s = tvm.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], nparts=32)
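# An illustrative addition (not in the original snippet): printing the lowered
# IR makes the difference between the two splits visible -- `factor` fixes the
# extent of the inner loop, while `nparts` fixes the extent of the outer loop.
print(tvm.lower(s, [A, B], simple_mode=True))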
Exemplo n.º 43
0
    def check(start, end, dstart, dend, dtype, floor_div=False):
        div = tvm.floordiv if floor_div else tvm.truncdiv
        mod = tvm.floormod if floor_div else tvm.truncmod

        # A are dividends, B are divisors. Note that we add 1 to include end in the range.
        A = tvm.placeholder((end - start + 1, ), name="A", dtype=dtype)
        B = tvm.placeholder((dend - dstart + 1, ), name="B", dtype=dtype)
        # We clip values with min and max so that simplifiers know the ranges of values
        clipa = lambda x: tvm.min(tvm.const(end, dtype),
                                  tvm.max(tvm.const(start, dtype), x))
        clipb = lambda x: tvm.min(tvm.const(dend, dtype),
                                  tvm.max(tvm.const(dstart, dtype), x))
        # If the range is just a single point, use the constant itself
        if start == end:
            clipa = lambda x: tvm.const(start, dtype)
        if dstart == dend:
            clipb = lambda x: tvm.const(dstart, dtype)
        # D are division results and M are modulo results
        [D, M] = tvm.compute(
            (end - start + 1, dend - dstart + 1), lambda i, j:
            (div(clipa(A[i]), clipb(B[j])), mod(clipa(A[i]), clipb(B[j]))))

        s = tvm.create_schedule([D.op, M.op])
        f = tvm.build(s, [A, B, D, M], "llvm")

        # Fill input arrays with values
        A_arr = tvm.nd.empty((end - start + 1, ), dtype)
        B_arr = tvm.nd.empty((dend - dstart + 1, ), dtype)
        A_arr.copyfrom(np.arange(start, end + 1, dtype=dtype))
        B_np = np.arange(dstart, dend + 1, dtype=dtype)
        # If the range of the divisor contains 0, replace it with 1 to avoid division by zero
        if dend >= 0 and dstart <= 0:
            B_np[-dstart] = 1
        B_arr.copyfrom(B_np)
        D_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype)
        M_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype)

        # Run the function and convert the results to numpy
        f(A_arr, B_arr, D_arr, M_arr)
        D_arr = D_arr.asnumpy()
        M_arr = M_arr.asnumpy()

        # This helper just prints additional info on failure
        def _show_info():
            print("dtype: {}".format(dtype))
            print("dividend range: [{}, {}]".format(start, end))
            print("divisor range: [{}, {}]".format(dstart, dend))
            lowered = tvm.lower(s, [A, B, D, M], simple_mode=True)
            print("Lowered code:")
            print(lowered)

        # Check that the computed values are correct
        for i in range(start, end + 1):
            for j in range(dstart, dend + 1):
                if j == 0:
                    continue

                if floor_div:
                    dref = i // j
                    mref = i % j
                else:
                    dref = int(float(i) / j)
                    mref = int(math.fmod(i, j))

                if D_arr[i - start, j - dstart] != dref:
                    _show_info()
                    raise AssertionError(
                        "Incorrect division result: {}({}, {}) is {} "
                        "but should be {}".format(div.__name__, i, j,
                                                  D_arr[i - start,
                                                        j - dstart], dref))
                if M_arr[i - start, j - dstart] != mref:
                    _show_info()
                    raise AssertionError(
                        "Incorrect modulo result: {}({}, {}) is {} "
                        "but should be {}".format(mod.__name__, i, j,
                                                  M_arr[i - start,
                                                        j - dstart], mref))
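    # A hedged usage sketch (not part of the original excerpt): the helper is
    # typically called over a few dividend/divisor ranges for both truncating
    # and flooring division; the ranges below are only illustrative.
    check(-12, 12, -6, 6, 'int32', floor_div=False)
    check(-12, 12, -6, 6, 'int32', floor_div=True)
    check(0, 255, 0, 6, 'uint8', floor_div=True)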
Exemplo n.º 44
0
def test_dwarf_debug_information():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.placeholder((n, ), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    s[C].parallel(xo)
    s[C].vectorize(xi)

    def check_llvm_object():
        if not tvm.runtime.enabled("llvm"):
            return
        if tvm.target.codegen.llvm_version_major() < 5:
            return
        if tvm.target.codegen.llvm_version_major() > 6:
            return
        # build two functions
        f2 = tvm.lower(s, [A, B, C], name="fadd1")
        f1 = tvm.lower(s, [A, B, C], name="fadd2")
        m = tvm.build([f1, f2], "llvm")
        temp = util.tempdir()
        o_path = temp.relpath("temp.o")
        m.save(o_path)
        import re
        import shutil
        import subprocess
        import sys

        # Try the dwarfdump utility (OS X)
        if shutil.which("dwarfdump"):
            output = subprocess.check_output(["dwarfdump", o_path])
            assert re.search(r"""DW_AT_name\\t\("fadd1"\)""", str(output))
            assert re.search(r"""DW_AT_name\\t\("fadd2"\)""", str(output))

        # Try gobjdump (OS X)
        if shutil.which("gobjdump"):
            output = subprocess.check_output(["gobjdump", "--dwarf", o_path])
            assert re.search(r"""DW_AT_name.*fadd1""", str(output))
            assert re.search(r"""DW_AT_name.*fadd2""", str(output))

        # Try objdump (Linux) - Darwin objdump has different DWARF syntax.
        if shutil.which("objdump") and sys.platform != 'darwin':
            output = subprocess.check_output(["objdump", "--dwarf", o_path])
            assert re.search(r"""DW_AT_name.*fadd1""", str(output))
            assert re.search(r"""DW_AT_name.*fadd2""", str(output))

    def check_llvm_ir():
        if not tvm.runtime.enabled("llvm"):
            return
        if tvm.target.codegen.llvm_version_major() < 5:
            return
        if tvm.target.codegen.llvm_version_major() > 6:
            return
        # build two functions
        f2 = tvm.lower(s, [A, B, C], name="fadd1")
        f1 = tvm.lower(s, [A, B, C], name="fadd2")
        m = tvm.build([f1, f2], target="llvm -target=aarch64-linux-gnu")
        ll = m.get_source("ll")

        # On non-Darwin OS, don't explicitly specify DWARF version.
        import re
        assert not re.search(r'"Dwarf Version', ll)
        assert re.search(r"""llvm.dbg.value""", ll)

        # Try Darwin, require DWARF-2
        m = tvm.build([f1, f2],
                      target="llvm -target=x86_64-apple-darwin-macho")
        ll = m.get_source("ll")
        assert re.search(r"""i32 4, !"Dwarf Version", i32 2""", ll)
        assert re.search(r"""llvm.dbg.value""", ll)

    check_llvm_object()
    check_llvm_ir()
Exemplo n.º 45
0
def schedule_pool(outs, layout):
    """Schedule for pool.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of pool
        in the format of an array of tensors.

    layout: str
        Data layout.

    Returns
    -------
    s: Schedule
        The computation schedule for pool.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])
    def _schedule(PaddedInput, Pool):
        if isinstance(PaddedInput.op, tvm.tensor.ComputeOp):
            s[PaddedInput].compute_inline()
        num_thread = tvm.target.Target.current(allow_none=False).max_num_threads
        if Pool.op in s.outputs:
            Out = Pool
            OL = s.cache_write(Pool, "local")
        else:
            Out = outs[0].op.output(0)
            s[Pool].set_scope("local")
        fused = s[Out].fuse(*s[Out].op.axis)
        bx, tx = s[Out].split(fused, factor=num_thread)
        s[Out].bind(bx, tvm.thread_axis("blockIdx.x"))
        s[Out].bind(tx, tvm.thread_axis("threadIdx.x"))
        if Pool.op in s.outputs:
            s[OL].compute_at(s[Out], tx)
        else:
            s[Pool].compute_at(s[Out], tx)

    scheduled_ops = []

    def traverse(OP):
        """Internal traverse function"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(OP.tag):
            if OP not in s.outputs:
                s[OP].compute_inline()
            for tensor in OP.input_tensors:
                if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops:
                    traverse(tensor.op)
        # schedule pool
        elif OP.tag.startswith('pool'):
            PaddedInput = OP.input_tensors[0]
            Pool = OP.output(0)
            _schedule(PaddedInput, Pool)
        else:
            raise RuntimeError("Unsupported operator: %s" % OP.tag)

        scheduled_ops.append(OP)

    traverse(outs[0].op)
    return s
Exemplo n.º 46
0
def schedule_conv1d_transpose_ncw_cuda(cfg, outs):
    """TOPI Schedule callback for conv1d_transpose operator.

    Parameters
    ----------
    cfg: ConfigEntity
        The parameters for this template

    outs: Array of Tensor
        The computation graph description of conv1d transpose
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for conv1d transpose.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if op.tag == 'conv1d_transpose_ncw':
            pad_data = op.input_tensors[0]
            kernel = op.input_tensors[1]
            conv = op.output(0)

            ##### space definition begin #####
            n, f, x = s[conv].op.axis
            rc = s[conv].op.reduce_axis[0]
            cfg.define_split("tile_n", cfg.axis(n), num_outputs=4)
            cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
            cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
            cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
            cfg.define_knob("auto_unroll_max_step", [64, 512, 1500])

            target = tvm.target.current_target()
            if target.target_name in ['nvptx', 'rocm']:
                cfg.define_knob("unroll_explicit", [1])
            else:
                cfg.define_knob("unroll_explicit", [0, 1])

            ##### space definition end #####

            if isinstance(kernel.op,
                          tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
                s[kernel].compute_inline()

            if conv.op in s.outputs:
                output = conv
                OL = s.cache_write(conv, 'local')
            else:
                output = s.outputs[0].output(0)
                s[conv].set_scope('local')
                OL = conv

            # create cache stage
            s[pad_data].set_scope('shared')
            AA = pad_data
            WW = s.cache_read(kernel, 'shared', [OL])

            # tile and bind spatial axes
            n, f, x = s[output].op.axis
            kernel_scope, n = s[output].split(n, nparts=1)
            bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
            bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
            bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)

            s[output].reorder(bn, bf, bx, vn, vf, vx, tn, tf, tx, ni, fi, xi)
            s[output].bind(bn, tvm.thread_axis("blockIdx.z"))
            s[output].bind(bf, tvm.thread_axis("blockIdx.y"))
            s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
            s[output].bind(vn, tvm.thread_axis("vthread"))
            s[output].bind(vf, tvm.thread_axis("vthread"))
            s[output].bind(vx, tvm.thread_axis("vthread"))

            s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
            s[OL].compute_at(s[output], tx)
            # number of threads
            n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2]
            n_tx = cfg["tile_x"].size[2]

            # tile reduction axes
            n, f, x = s[OL].op.axis
            rc, rx = s[OL].op.reduce_axis
            rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
            s[OL].reorder(rco, rcm, rx, rci, n, f, x)

            s[AA].compute_at(s[OL], rx)
            s[WW].compute_at(s[OL], rx)

            # cooperative fetching
            for load in [AA, WW]:
                n, f, x = s[load].op.axis
                fused = s[load].fuse(f, x)
                tz, fused = s[load].split(fused, nparts=n_tz)
                tx, fused = s[load].split(fused, nparts=n_tx)
                s[load].bind(tz, tvm.thread_axis("threadIdx.y"))
                s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

            s[output].pragma(kernel_scope, 'auto_unroll_max_step',
                             cfg['auto_unroll_max_step'].val)
            s[output].pragma(kernel_scope, 'unroll_explicit',
                             cfg['unroll_explicit'].val)

    traverse_inline(s, outs[0].op, _callback)

    return s
Exemplo n.º 47
0
import tvm
import numpy as np

m = tvm.var('m')
n = tvm.var('n')
X = tvm.placeholder((m, n), name='X')
s_state = tvm.placeholder((m, n))
s_init = tvm.compute((1, n), lambda _, i: X[0, i])
s_update = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])
s_scan = tvm.scan(s_init, s_update, s_state, inputs=[X])

# Schedule the Scan Cell
s = tvm.create_schedule(s_scan.op)
num_thread = 256
block_x = tvm.thread_axis('blockIdx.x')
thread_x = tvm.thread_axis('threadIdx.x')
xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread)
s[s_init].bind(xo, block_x)
s[s_init].bind(xi, thread_x)
xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread)
s[s_update].bind(xo, block_x)
s[s_update].bind(xi, thread_x)
print(tvm.lower(s, [X, s_scan], simple_mode=True))

# Build and Verify
f_scan = tvm.build(s, [X, s_scan], 'cuda', name='my_scan')
ctx = tvm.gpu(0)
n = 1024
m = 10
a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype)
a = tvm.nd.array(a_np, ctx=ctx)
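# The verification step is cut off in this excerpt; a minimal sketch of how it
# would typically be completed (the scan above is a cumulative sum along the
# time axis, so np.cumsum serves as the reference).
b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), ctx)
f_scan(a, b)
tvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))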
Exemplo n.º 48
0
def schedule_adaptive_pool(outs):
    """Schedule for adaptive_pool.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of adaptive_pool
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for adaptive_pool.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _schedule(Pool):
        num_thread = 8
        block_x = tvm.thread_axis("blockIdx.x")
        block_y = tvm.thread_axis("blockIdx.y")
        thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
        thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y")
        if Pool.op in s.outputs:
            Out = Pool
            OL = s.cache_write(Pool, "local")
        else:
            Out = outs[0].op.output(0)
            s[Pool].set_scope("local")
        by, ty = s[Out].split(s[Out].op.axis[0], factor=num_thread)
        bx, tx = s[Out].split(s[Out].op.axis[1], factor=num_thread)
        s[Out].reorder(by, bx, ty, tx)
        s[Out].bind(ty, thread_y)
        s[Out].bind(tx, thread_x)
        s[Out].bind(by, block_y)
        s[Out].bind(bx, block_x)
        if Pool.op in s.outputs:
            s[OL].compute_at(s[Out], tx)
        else:
            s[Pool].compute_at(s[Out], tx)

    scheduled_ops = []

    def traverse(OP):
        """Internal traverse function"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(OP.tag):
            if OP not in s.outputs:
                s[OP].compute_inline()
            for tensor in OP.input_tensors:
                if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops:
                    traverse(tensor.op)
        # schedule global_pool
        elif OP.tag.startswith('adaptive_pool'):
            Pool = OP.output(0)
            _schedule(Pool)
        else:
            raise RuntimeError("Unsupported operator: %s" % OP.tag)

        scheduled_ops.append(OP)

    traverse(outs[0].op)
    return s
Exemplo n.º 49
0
def schedule_conv2d(outs):
    """Create schedule for tensors"""
    s = tvm.create_schedule([x.op for x in outs])
    target = tvm.target.current_target(allow_none=False)

    def default_schedule(op):
        """NCHW conv2d schedule for non imagenet workloads"""
        conv = op.output(0)
        kernel = op.input_tensors[1]
        data = op.input_tensors[0]
        data_pad = None
        if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
            data_pad = data
            data = data_pad.op.input_tensors[0]

        n_pad, c_pad, h_pad, w_pad = data_pad.op.axis
        pad_fused = s[data_pad].fuse(n_pad, c_pad)
        s[data_pad].parallel(pad_fused)
        C = conv
        n, c, h, w = C.op.axis
        rc, ry, rx = C.op.reduce_axis
        fused = s[C].fuse(n, c)
        s[C].parallel(fused)
        wo, wi = s[C].split(w, factor=16)
        s[C].reorder(fused, rc, h, wo, ry, rx, wi)  # move rc to outer loop
        s[C].unroll(rx)
        s[C].unroll(ry)
        s[C].vectorize(wi)

    def traverse(op):
        """Traverse operators from computation graph"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(op.tag):
            if op not in s.outputs:
                s[op].compute_inline()
            else:  # inject custom schedule
                if len(op.axis) == 4 and 'avx' not in str(
                        target):  # schedule bias + bn + relu
                    n, c, h, w = op.axis
                    fused = s[op].fuse(n, c)
                    s[op].parallel(fused)
                    s[op].vectorize(w)
            for tensor in op.input_tensors:
                if tensor.op.input_tensors:
                    traverse(tensor.op)

        if 'conv2d_nchw' in op.tag:
            if 'avx' in str(target):
                try:
                    output = op.output(0)
                    conv_out = op.input_tensors[0]
                    kernel_vec = conv_out.op.input_tensors[1]
                    kernel = kernel_vec.op.input_tensors[0]
                    data_vec = conv_out.op.input_tensors[0]
                    data = data_vec.op.input_tensors[0]
                    data_pad = None
                    if isinstance(
                            data.op,
                            tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                        data_pad = data
                        data = data_pad.op.input_tensors[0]
                    padding = infer_pad(data, data_pad)
                    if data_pad is None:
                        stride = infer_stride(data, kernel, output)
                    else:
                        stride = infer_stride(data_pad, kernel, output)

                    wkl = _get_workload(data, kernel, stride, padding,
                                        output.dtype)
                    sch = _get_schedule(wkl)
                    _AVX_SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad,
                                                    data_vec, kernel,
                                                    kernel_vec, conv_out,
                                                    output, outs[0])
                except IndexError:
                    default_schedule(op)
            else:
                default_schedule(op)

    traverse(outs[0].op)
    return s
Exemplo n.º 50
0
def test_gemm():
    # graph
    nn = 1024
    n = tvm.var('n')
    n = tvm.convert(nn)
    m = n
    l = n
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((m, l), name='B')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m),
                    lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
                    name='CC')
    # schedule
    s = tvm.create_schedule(C.op)
    xtile, ytile = 32, 32
    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_y = tvm.thread_axis("threadIdx.y")

    CC = s.cache_write(C, "local")
    AA = s.cache_read(A, "shared", [CC])
    BB = s.cache_read(B, "shared", [CC])
    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].reorder(by, bx, yi, xi)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    ty, yi = s[C].split(yi, nparts=num_thread)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].reorder(ty, tx, yi, xi)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)

    s[CC].compute_at(s[C], tx)
    s[AA].compute_at(s[CC], k)
    s[BB].compute_at(s[CC], k)

    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)

    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)

    # lowering test
    s = s.normalize()

    # one line to build the function.
    def check_device(device):
        if not tvm.module.enabled(device):
            print("skip because %s is not enabled.." % device)
            return

        f = tvm.build(s, [A, B, C], device)
        ctx = tvm.context(device, 0)
        # launch the kernel.
        n = nn
        m = n
        l = n
        a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
        b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        ftimer = f.time_evaluator(f.entry_name, ctx, number=1)
        tcost = ftimer(a, b, c).mean
        print("%s: exec=%g sec/op" % (ctx, tcost))
        np.testing.assert_allclose(c.asnumpy(),
                                   np.dot(a_np, b_np.T),
                                   rtol=1e-5)

    check_device("nvptx -mcpu=sm_20")
    check_device("metal")
    check_device("opencl")
    check_device("cuda")
Exemplo n.º 51
0
def test_llvm_add_pipeline():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n, ), name='A')
    B = tvm.placeholder((n, ), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    s[C].parallel(xo)
    s[C].vectorize(xi)

    def verify_elf(path, e_machine):
        with open(path, "rb") as fi:
            arr = fi.read(20)
            assert struct.unpack('ccc', arr[1:4]) == (b'E', b'L', b'F')
            endian = struct.unpack('b', arr[0x5:0x6])[0]
            endian = '<' if endian == 1 else '>'
            assert struct.unpack(endian + 'h', arr[0x12:0x14])[0] == e_machine

    def build_i386():
        if not tvm.module.enabled("llvm"):
            print("Skip because llvm is not enabled..")
            return
        temp = util.tempdir()
        target = "llvm -target=i386-pc-linux-gnu"
        f = tvm.build(s, [A, B, C], target)
        path = temp.relpath("myadd.o")
        f.save(path)
        verify_elf(path, 0x03)

    def build_arm():
        target = "llvm -target=armv7-none-linux-gnueabihf"
        if not tvm.module.enabled(target):
            print("Skip because %s is not enabled.." % target)
            return
        temp = util.tempdir()
        f = tvm.build(s, [A, B, C], target)
        path = temp.relpath("myadd.o")
        f.save(path)
        verify_elf(path, 0x28)
        asm_path = temp.relpath("myadd.asm")
        f.save(asm_path)
        # Do a RPC verification, launch kernel on Arm Board if available.
        host = os.environ.get('TVM_RPC_ARM_HOST', None)
        remote = None
        if host:
            port = int(os.environ['TVM_RPC_ARM_PORT'])
            try:
                remote = rpc.connect(host, port)
            except tvm.TVMError as e:
                pass

        if remote:
            remote.upload(path)
            farm = remote.load_module("myadd.o")
            ctx = remote.cpu(0)
            n = nn
            a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
            b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
            c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
            farm(a, b, c)
            np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
            print("Verification finish on remote..")

    build_i386()
    build_arm()
Exemplo n.º 52
0
def test():
    env = nnpu.get_env()

    shape = (16, 16)
    a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host')
    a = tvm.compute(shape, lambda *i: a_host(*i), name='a')
    a_buf = tvm.compute(shape, lambda *i: a(*i), name='a_buf')

    vctr_shape = (16, )
    b_host = tvm.placeholder(vctr_shape, env.cfg['dtype_n'], 'b_host')
    b = tvm.compute(vctr_shape, lambda *i: b_host(*i), name='b')
    b_buf = tvm.compute(vctr_shape, lambda *i: b(*i), name='b_buf')

    dtype_w = env.cfg['dtype_w']

    out_shape = (16, )
    k = tvm.reduce_axis((0, 16), 'k')
    c_buf = tvm.compute(
        out_shape, lambda i: tvm.sum(
            a_buf[i, k].astype(dtype_w) * b_buf[k].astype(dtype_w), axis=k))

    bias_host = tvm.placeholder(out_shape, env.cfg['dtype_w'], 'bias_host')
    bias = tvm.compute(out_shape, lambda *i: bias_host(*i), 'bias')
    bias_buf = tvm.compute(out_shape, lambda *i: bias(*i), 'bias_buf')
    #c = tvm.compute(out_shape, lambda *i: c_buf(*i), name='c')
    #c_host = tvm.compute(out_shape, lambda *i: c(*i), name='c_host')

    out_buf = tvm.compute(out_shape, lambda i: c_buf[i] + bias_buf[i],
                          'out_buf')
    out = tvm.compute(out_shape, lambda *i: out_buf(*i), 'out')
    out_host = tvm.compute(out_shape, lambda *i: out(*i), 'out_host')

    s = tvm.create_schedule(out_host.op)

    # mark variable scopes
    s[a].set_scope(env.dram_scope)
    s[b].set_scope(env.dram_scope)
    s[bias].set_scope(env.dram_scope)
    s[out].set_scope(env.dram_scope)

    s[a_buf].set_scope(env.uni_scratchpad_scope)
    s[b_buf].set_scope(env.uni_scratchpad_scope)
    s[c_buf].set_scope(env.uni_scratchpad_scope)
    s[bias_buf].set_scope(env.uni_scratchpad_scope)
    s[out_buf].set_scope(env.uni_scratchpad_scope)

    #print(dir(s[b].op.body))

    # mark compiler pragmas
    s[a].pragma(s[a].op.axis[0], env.dma_copy_pragma)
    s[b].pragma(s[b].op.axis[0], env.dma_copy_pragma)
    s[bias].pragma(s[bias].op.axis[0], env.dma_copy_pragma)
    s[out_host].pragma(s[out_host].op.axis[0], env.dma_copy_pragma)

    s[a_buf].pragma(s[a_buf].op.axis[0], env.scratchpad_ls)
    s[b_buf].pragma(s[b_buf].op.axis[0], env.scratchpad_ls)
    s[bias_buf].pragma(s[bias_buf].op.axis[0], env.scratchpad_ls)
    s[out].pragma(s[out].op.axis[0], env.scratchpad_ls)

    #s[a_buf].compute_at(s[b_buf], b_buf.op.axis[0])

    # tensorize
    #s[b_buf].tensorize(s[b_buf].op.axis[1], env.intrins.get('VEXP', mode='inc'))
    s[c_buf].tensorize(
        s[c_buf].op.axis[0],
        env.intrins.get('GEMM', shape=(16, 16, 1), mode='inc', reduce=True))
    #outer, inner = out_buf.op.axis
    #s[out_buf].reorder(inner, outer)
    #print(outer)
    #print(tvm.lower(s, [a_host, b_host, bias_host, out_host], simple_mode=True))
    s[out_buf].tensorize(s[out_buf].op.axis[0],
                         env.intrins.get('VAddV', mode='w'))

    # build
    print(tvm.lower(s, [a_host, b_host, bias_host, out_host],
                    simple_mode=True))

    print(
        nnpu.lower(s, [a_host, b_host, bias_host, out_host], simple_mode=True))
    #exit()
    func = nnpu.build(s, [a_host, b_host, bias_host, out_host],
                      'nnpu',
                      'llvm',
                      name='nnpu_exp')

    print('function built: ')
    print('------------------- device module 1 asm code: ')
    print(func.imported_modules[0].get_source('asm'))
    #print(func.get_source())

    # prepare data
    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=shape, dtype=a_host.dtype, low=0, high=64)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    b_np = np.random.randint(size=vctr_shape,
                             dtype=b_host.dtype,
                             low=0,
                             high=64)
    #b_np = np.random.random(size=vctr_shape).astype(b_host.dtype)
    b_nd = tvm.nd.array(b_np, ctx)

    bias_np = np.random.randint(size=out_shape,
                                dtype=bias_host.dtype,
                                low=0,
                                high=10000)
    #bias_np = np.random.random(size=out_shape).astype(bias_host.dtype)
    bias_nd = tvm.nd.array(bias_np, ctx)

    out_nd = tvm.nd.array(np.zeros(out_shape).astype(out_host.dtype), ctx)

    # run
    func(a_nd, b_nd, bias_nd, out_nd)

    print('run finished')

    print('a=')
    print(a_np)
    print('b=')
    print(b_np)
    print('bias=')
    print(bias_np)
    print('out=')
    print(out_nd.asnumpy())

    print('numpy ground truth is: ')
    gt = np.dot(a_np.astype(dtype_w), b_np.astype(dtype_w)) + bias_np
    #gt = np.greater(np.dot(a_np.astype(dtype_w), b_np.astype(dtype_w)), bias_np)
    print(gt)

    np.testing.assert_allclose(out_nd.asnumpy(), gt)
Exemplo n.º 53
0
def test():
    env = nnpu.get_env()
    a = tvm.placeholder((32, ), env.cfg['dtype_w'], 'a')
    sph = ScheduleProcHelper()
    Imm = tvm.const(5, env.cfg['dtype_w'])
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    #c_buf = tvm.compute((32,), lambda i: tvm.select(a_buf[i]>Imm,a_buf[i],Imm), 'c_buf')
    c_buf = tvm.compute((32, ), lambda i: Imm + a_buf[i], 'c_buf')
    sph.MarkScope(c_buf)
    c_host, c_dram = nnpu.utils.CopyBufToH(c_buf, 'c', sph)

    sub_buf = tvm.compute((32, ), lambda i: a_buf[i] - Imm, 'sub_buf')
    sph.MarkScope(sub_buf)
    sub_host, sub_dram = nnpu.utils.CopyBufToH(sub_buf, 'sub', sph)

    mul_buf = tvm.compute((32, ), lambda i: a_buf[i] * Imm, 'mul_buf')
    sph.MarkScope(mul_buf)
    mul_host, mul_dram = nnpu.utils.CopyBufToH(mul_buf, 'mul', sph)

    div_buf = tvm.compute((32, ), lambda i: a_buf[i] / Imm, 'rdiv_buf')
    sph.MarkScope(div_buf)
    div_host, div_dram = nnpu.utils.CopyBufToH(div_buf, 'rdiv', sph)

    gtm_buf = tvm.compute((32, ), lambda i: tvm.max(a_buf[i], Imm), 'gtm_buf')
    sph.MarkScope(gtm_buf)
    gtm_host, gtm_dram = nnpu.utils.CopyBufToH(gtm_buf, 'gtm', sph)

    rsub_buf = tvm.compute((32, ), lambda i: Imm - a_buf[i], 'rsub_buf')
    sph.MarkScope(rsub_buf)
    rsub_host, rsub_dram = nnpu.utils.CopyBufToH(rsub_buf, 'rsub', sph)

    s = tvm.create_schedule([
        c_host.op, sub_host.op, mul_host.op, div_host.op, gtm_host.op,
        rsub_host.op
    ])
    sph.Transform(s)
    s[c_buf].tensorize(s[c_buf].op.axis[0],
                       env.intrins.get('VAddI', imm_value=Imm.value, mode='w'))
    s[sub_buf].tensorize(
        s[sub_buf].op.axis[0],
        env.intrins.get('VSubI', imm_value=Imm.value, mode='w'))
    s[mul_buf].tensorize(
        s[mul_buf].op.axis[0],
        env.intrins.get('VMulI', imm_value=Imm.value, mode='w'))
    s[div_buf].tensorize(
        s[div_buf].op.axis[0],
        env.intrins.get('VDivI', imm_value=Imm.value, mode='w'))
    s[gtm_buf].tensorize(
        s[gtm_buf].op.axis[0],
        env.intrins.get('VGTMI', imm_value=Imm.value, mode='w'))
    s[rsub_buf].tensorize(
        s[rsub_buf].op.axis[0],
        env.intrins.get('ISubV', imm_value=Imm.value, mode='w'))
    print(
        nnpu.lower(
            s, [a, c_host, sub_host, mul_host, div_host, gtm_host, rsub_host],
            simple_mode=True))
    func = nnpu.build(
        s, [a, c_host, sub_host, mul_host, div_host, gtm_host, rsub_host],
        'nnpu',
        'llvm',
        name='nnpu_vmuli')

    print('------------------- device module 1 IR: ')
    print(func.imported_modules[0].get_source('ir'))

    print('------------------- device module 1 uop code: ')
    print(func.imported_modules[0].get_source('uop'))

    ctx = tvm.nd.TVMContext(13, 0)

    a_np = np.random.randint(size=(32, ), dtype=a.dtype, low=3, high=122)
    #a_np = np.random.random(size=shape).astype(a_host.dtype)
    a_nd = tvm.nd.array(a_np, ctx)

    c_nd = tvm.nd.array(np.zeros((32, )).astype(c_host.dtype), ctx)
    sub_nd = tvm.nd.array(np.zeros((32, )).astype(c_host.dtype), ctx)
    mul_nd = tvm.nd.array(np.zeros((32, )).astype(c_host.dtype), ctx)
    div_nd = tvm.nd.array(np.zeros((32, )).astype(c_host.dtype), ctx)
    gtm_nd = tvm.nd.array(np.zeros((32, )).astype(c_host.dtype), ctx)
    rsub_nd = tvm.nd.array(np.zeros((32, )).astype(c_host.dtype), ctx)
    func(a_nd, c_nd, sub_nd, mul_nd, div_nd, gtm_nd, rsub_nd)
    print('a = ')
    print(a_nd.asnumpy())
    print('a + {0} = '.format(Imm.value))
    print(c_nd.asnumpy())
    print('numpy ground truth =')
    gt = a_np + Imm.value
    print(gt)
    np.testing.assert_allclose(c_nd.asnumpy(), gt)

    print('a - {0} = '.format(Imm.value))
    print(sub_nd.asnumpy())
    np.testing.assert_allclose(sub_nd.asnumpy(), a_np - Imm.value)

    print('a * {0} = '.format(Imm.value))
    print(mul_nd.asnumpy())
    np.testing.assert_allclose(mul_nd.asnumpy(), a_np * Imm.value)

    print('a > {0} ? a : {0} = '.format(Imm.value))
    print(gtm_nd.asnumpy())
    #np.testing.assert_allclose(gtm_nd.asnumpy(), np.maximum(a_np, Imm.value))
    print('{0} - a = '.format(Imm.value))
    print(rsub_nd.asnumpy())
    np.testing.assert_allclose(rsub_nd.asnumpy(), Imm.value - a_np)
    print('test passed')
Exemplo n.º 54
0
# parallel manner.  TVM asks the user to provide a description of the
# computation called a schedule.
#
# A schedule is a set of transformations of the computation that
# transform the loops of the computation in the program.
#
# After we construct the schedule, by default the schedule computes
# C serially in row-major order.
#
# .. code-block:: c
#
#   for (int i = 0; i < n; ++i) {
#     C[i] = A[i] + B[i];
#   }
#
s = tvm.create_schedule(C.op)

######################################################################
# We used the split construct to split the first axis of C.
# This splits the original iteration axis into the product of
# two iteration axes, which is equivalent to the following code.
#
# .. code-block:: c
#
#   for (int bx = 0; bx < ceil(n / 64); ++bx) {
#     for (int tx = 0; tx < 64; ++tx) {
#       int i = bx * 64 + tx;
#       if (i < n) {
#         C[i] = A[i] + B[i];
#       }
#     }
#   }
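######################################################################
# The split call itself falls outside this excerpt; a minimal sketch of the
# statement the comment above refers to, assuming `s` and `C` come from the
# vector-add declaration earlier in the tutorial (C[i] = A[i] + B[i]):
bx, tx = s[C].split(C.op.axis[0], factor=64)
# For a GPU target the two new axes are then typically bound to the block and
# thread indices, e.g.:
#   s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
#   s[C].bind(tx, tvm.thread_axis("threadIdx.x"))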
Exemplo n.º 55
0
def gemm_tuning(batch, N, L, M):
    bn = 32
    A = tvm.placeholder((batch, N, L), name='A', dtype='float32')
    B = tvm.placeholder((batch, L, M), name='B', dtype='float32')
    packedB = tvm.compute((batch, M // bn, L, bn),
                          lambda b, x, y, z: B[b, y, x * bn + z],
                          name='packedB')

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute(
        (batch, N, M),
        lambda b, x, y: tvm.sum(A[b, x, k] * packedB[b, y / bn, k, y % bn],
                                axis=k),
        name='C')
    s = tvm.create_schedule(C.op)

    ##### define space and schedule
    cfg = autotvm.get_config()

    bn = 32
    CC = s.cache_write(C, 'global')

    factor_range = [2, 4, 8, 16, 32, 64]
    cfg.define_knob('tile_factor_x', factor_range)
    cfg.define_knob('tile_factor_y', factor_range)
    bx = cfg['tile_factor_x'].val
    by = cfg['tile_factor_y'].val
    xo, yo, xi, yi = s[C].tile(C.op.axis[1], C.op.axis[2], bx, by)

    s[CC].compute_at(s[C], yo)
    b, xc, yc = s[CC].op.axis
    k, = s[CC].op.reduce_axis
    """cfg.define_split("split_k", k, num_outputs=2)
    ko, ki = cfg["split_k"].apply(s, CC, k)"""

    k_num_outputs_range = [2, 3, 4, 5, 6, 7, 8]
    cfg.define_knob('k_outputs', k_num_outputs_range)
    k_outputs = cfg['k_outputs'].val
    cfg.define_split("split_k", k, policy='all', num_outputs=k_outputs)
    k_list = cfg["split_k"].apply(s, CC, k)
    cfg.define_reorder("reorder_k", axes=[xc, yc] + k_list, policy='all')
    cfg["reorder_k"].apply(s, CC, [xc, yc] + k_list)
    """cfg.define_reorder("reorder_k", [ko, xc, ki, yc], policy='all')
    cfg["reorder_k"].apply(s, CC, s[CC].op.axis)"""
    # s[CC].reorder(ko, xc, ki, yc)

    k_unroll_id = list(range(k_outputs))
    # print(len(k_list))
    cfg.define_knob('k_unroll', k_unroll_id)
    k_id = cfg['k_unroll'].val
    # print(type(k_id))
    s[CC].unroll(k_list[k_id])
    # s[CC].unroll(ki)

    cfg.define_knob('vector_dim', [0, 1])
    vector_id = cfg['vector_dim'].val
    if vector_id == 0:
        s[CC].vectorize(yc)
    else:
        s[CC].vectorize(xc)
    # s[CC].vectorize(yc)

    parallel_list = [xo, yo, xi, yi]
    cfg.define_knob('parallel_C', list(range(len(parallel_list))))
    parallel_C_id = cfg['parallel_C'].val
    # print(len(parallel_list))
    s[C].parallel(parallel_list[parallel_C_id])
    # s[C].parallel(xo)
    return s, [A, B, C]
Exemplo n.º 56
0
def single_lstm():
    num_gate = 4
    hidden_size = tvm.var('hidden_size')
    batch_size = tvm.var('batch_size')
    input_size = tvm.var('input_size')

    # A single LSTM block operations without unrolling
    # '*' linear transformation
    # '(*)' elementwise multiplication
    # F_t = sigmoid( W_f * x_t + R_f * h_t-1 + b_f )
    # I_t = sigmoid( W_i * x_t + R_i * h_t-1 + b_i )
    # O_t = sigmoid( W_o * x_t + R_o * h_t-1 + b_o )
    # C'_t = tanh( W_c * x_t + R_c * h_t-1 + b_c )
    # C_t = F_t (*) C_t-1 + I_t (*) C'_t
    # h_t = O_t (*) tanh( C_t )

    # Global transition matrix

    # input X[0..t-1]
    X = tvm.placeholder((batch_size, input_size), name="X")
    Prev_h = tvm.placeholder((batch_size, hidden_size), name="Prev_h")
    Prev_c = tvm.placeholder((batch_size, hidden_size), name="Prev_c")

    # Parameters
    # Weight matrices [W_i, W_f, W_o, W_c]: 4 * hidden_size * input_size
    # Bias: 4 * hidden_size
    Wi2h = tvm.placeholder((num_gate, hidden_size, input_size), name="Wi2h")
    Bi2h = tvm.placeholder((num_gate, hidden_size), name="Bi2h")

    # Weight matrices [R_i, R_f, R_o, R_c]: 4 * hidden_size * hidden_size
    # Only handle hidden transition, saves space.
    Wh2h = tvm.placeholder((num_gate, hidden_size, hidden_size), name="Wh2h")
    Bh2h = tvm.placeholder((num_gate, hidden_size), name="Bh2h")

    # LSTM transition
    # [W_i, W_f, W_o, W_c] * X_t: 4 * num_hidden
    l = tvm.reduce_axis((0, input_size), name="li2h")
    i2h = tvm.compute((batch_size, num_gate, hidden_size),
                      lambda i, x, j: tvm.sum(X[i, l] * Wi2h[x, j, l], axis=l),
                      name="i2h")

    # [R_i, R_f, R_o, R_c] * h_t-1: 4 * hidden_size
    # R: hidden_size * hidden_size, h: hidden_size * 1
    k = tvm.reduce_axis((0, hidden_size), name="ki2h")
    h2h = tvm.compute(
        (batch_size, num_gate, hidden_size),
        lambda i, x, j: tvm.sum(Prev_h[i, k] * Wh2h[x, j, k], axis=k),
        name="h2h")

    gates = tvm.compute(
        (batch_size, num_gate, hidden_size),
        lambda i, j, k: i2h[i, j, k] + h2h[i, j, k] + Bi2h[j, k] + Bh2h[j, k],
        name="gates")
    gshape = (batch_size, hidden_size)
    in_gate = tvm.compute(gshape,
                          lambda i, j: tvm.sigmoid(gates[i, 0, j]),
                          name="in_gate")
    forget_gate = tvm.compute(gshape,
                              lambda i, j: tvm.sigmoid(gates[i, 1, j]),
                              name="forget_gate")
    out_gate = tvm.compute(gshape,
                           lambda i, j: tvm.sigmoid(gates[i, 2, j]),
                           name="out_gate")
    in_transform = tvm.compute(gshape,
                               lambda i, j: tvm.tanh(gates[i, 3, j]),
                               name="in_transform")

    # C_t = F_t (*) C_t-1 + I_t (*) C'_t
    state_c = tvm.compute((batch_size, hidden_size),
                          lambda i, j: forget_gate[i, j] * Prev_c[i, j] +
                          in_gate[i, j] * in_transform[i, j],
                          name="state_c")
    # h_t = O_t (*) tanh( C_t )
    # state_h = tvm.compute((batch_size, hidden_size),
    #    lambda i, j: out_gate[i, j] * tvm.tanh(state_c[i, j]), name="state_h")
    out_c, out_h = tvm.compute(
        (batch_size, hidden_size),
        lambda i, j: (state_c[i, j], out_gate[i, j] * tvm.tanh(state_c[i, j])),
        name="outputs_c_h")
    # schedule
    s = tvm.create_schedule(out_h.op)
    print(
        tvm.lower(s, [X, Prev_h, Prev_c, Wi2h, Bi2h, Wh2h, Bh2h, out_c, out_h],
                  simple_mode=True))
    lstm = tvm.build(s,
                     [X, Prev_h, Prev_c, Wi2h, Bi2h, Wh2h, Bh2h, out_c, out_h],
                     name="single_lstm")
    print(lstm)

    lstm.save("remy_single_lstm.o")
    print(lstm.imported_modules)
    cc.create_shared("remy_single_lstm.so", ["remy_single_lstm.o"])
Exemplo n.º 57
0
def schedule_bitserial_dense(cfg, outs):
    """Schedule for binary_dense.

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of bitserial dense operator.
        in the format of an array of tensors.

    Returns
    -------
    s: Schedule
        The computation schedule for bitserial_dense.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _schedule(cfg, s, data_vec, weight_vec, output, unipolar):

        z, k, _, y, x = s[weight_vec].op.axis
        s[weight_vec].parallel(z)
        s[weight_vec].vectorize(x)

        x, y = s[output].op.axis
        wb, db, k = s[output].op.reduce_axis
        _, DB, _ = get_const_tuple(data_vec.shape)
        _, _, WB, _, _ = get_const_tuple(weight_vec.shape)

        yo, yi = cfg["tile_y"].apply(s, output, y)
        xo, xi = cfg["tile_x"].apply(s, output, x)
        ko, ki = cfg["tile_k"].apply(s, output, k)

        cfg["reorder_0"].apply(s, output, [yo, xo, ko, xi, wb, db, yi, ki])

        fused = s[output].fuse(xo, yo)
        s[output].parallel(fused)

        nfactor = cfg['tile_y'].size[-1]
        kfactor = cfg['tile_k'].size[-1]
        if nfactor % 8 == 0:
            pc = _intrin_popcount(nfactor, kfactor, WB, DB, unipolar)
            s[output].tensorize(wb, pc)

        return s

    def traverse(op):
        """Internal travserse function"""
        # inline all one-to-one-mapping operators except the last stage (output)
        if tag.is_broadcast(op.tag) or 'elemwise' in op.tag:
            if op not in s.outputs:
                s[op].compute_inline()
            for tensor in op.input_tensors:
                if isinstance(tensor.op, tvm.tensor.ComputeOp):
                    traverse(tensor.op)

        elif op.tag in ('bitserial_dense', 'bitserial_dense_unipolar'):
            output = op.output(0)
            weight_vec = op.input_tensors[0]

            data_vec = op.input_tensors[1]
            data = data_vec.op.input_tensors[0]
            if "QuantizeInput" in data.op.name:
                data = data.op.input_tensors[0]
            unipolar = (output.op.tag == 'bitserial_dense_unipolar')
            _schedule(cfg, s, data_vec, weight_vec, output, unipolar)
        else:
            raise RuntimeError("Unsupported operator: %s" % op.tag)

    traverse(outs[0].op)
    return s
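A hedged wiring sketch: a cfg-driven schedule like the one above is usually exposed to autotvm through a topi registration. The decorator arguments and the wrapper name below are assumptions; the original registration is not visible in this excerpt.
# Assumed registration only; adjust the generic entry point and target keys as needed.
from tvm import autotvm
from topi import generic

@autotvm.register_topi_schedule(generic.schedule_bitserial_dense, ['cpu'], 'direct')
def _schedule_bitserial_dense_cpu(cfg, outs):
    # Delegate to the cfg-driven schedule defined above.
    return schedule_bitserial_dense(cfg, outs)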
Exemplo n.º 58
0
def show_lowered(outputs, inputs):
    sout = tvm.create_schedule([o.op for o in outputs])
    mout = tvm.lower(sout, outputs + inputs, simple_mode=True)
    print(mout)
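A hedged usage example for show_lowered: build a tiny two-stage elementwise pipeline and print its lowered IR. The shapes and tensor names are illustrative only.
import tvm

n = tvm.var('n')
A = tvm.placeholder((n,), name='A')
B = tvm.compute((n,), lambda i: A[i] + 1.0, name='B')
C = tvm.compute((n,), lambda i: B[i] * 2.0, name='C')
show_lowered([C], [A])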
Exemplo n.º 59
0
File: peak.py Project: zheng-da/tvm
def measure_compute_mad(total_item, item_per_thread, base_type, bits, lanes,
                        target, target_host, remote, ctx, n_times):
    """ measure peak compute speed by computing mad for a type

    The IR for measurement is

    for each thread
        for i in 1..item_per_thread
            x = mad(x, x, y)
            y = mad(y, y, x)

    Parameters
    ----------
    total_item: int
        number of elements in input array
    item_per_thread: int
        number of operations each thread does
    base_type: str
        can be "int", "float"
    bits: int
        can be 16, 32
    lanes: int
       lane of the vector type, can be 1, 2, 4, 8, 16
    target: :any:`tvm.target.Target`
        the target and option of the compilation.
    target_host : str or :any:`tvm.target.Target`
        host compilation target
    remote: tvm.contrib.rpc.RPCSession
        if it is not None, use remote rpc session
    ctx: TVMContext
        the context of array
    n_times: int
        number of runs for taking mean

    Returns
    -------
    GOPS: float
        giga operations per second
    """

    n = total_item

    if bits >= 64 or lanes >= 16:
        n //= 2

    max_threads = target.max_num_threads

    base_type = str(base_type) + str(bits)
    dtype = base_type if lanes == 1 else base_type + "x" + str(lanes)

    def extern(ins, outs):
        # pylint: disable=unused-argument
        """construct measurement function by building IR directly"""
        ib = tvm.ir_builder.create()

        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")

        ib.scope_attr(bx, "thread_extent", n // max_threads)
        ib.scope_attr(tx, "thread_extent", max_threads)

        idx = bx.var * max_threads + tx.var

        a = ib.allocate(dtype, (1), name='a', scope='local')
        b = ib.allocate(dtype, (1), name='b', scope='local')

        a[0] = outs[0].vload(idx, dtype)
        b[0] = outs[0].vload(idx, dtype)

        if base_type.find('float') != -1:
            mad_func = lambda x, y: (x * x + y)
        else:
            mad_func = lambda x, y: y * y + x

        for _ in range(item_per_thread // 4 // lanes):
            a[0] = mad_func(a[0], b[0])
            b[0] = mad_func(b[0], a[0])

        ib.emit(outs[0].vstore(idx, b[0]))
        return ib.get()

    y = tvm.extern((n,), [], extern, name="y", dtype=dtype)
    s = tvm.create_schedule(y.op)

    try:
        func = tvm.build(s, [y], target, target_host=target_host)
        func = _convert_to_remote(func, remote)
        time_f = func.time_evaluator(func.entry_name, ctx, number=n_times)
        y = tvm.nd.empty((n,), dtype=dtype, ctx=ctx)
        time = time_f(y).mean
    except tvm._ffi.base.TVMError:
        # build error (occurs when the device does not support half precision)
        return -1

    return 1.0 * (n * item_per_thread) / 1e9 / time
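A hedged usage sketch for measure_compute_mad: drive it against the local GPU through an in-process RPC session. The CUDA target, workload sizes, and repeat counts are illustrative assumptions.
# Assumed driver only; a real measurement would typically use rpc.connect(...).
import tvm
from tvm import rpc

def demo_measure_mad():
    target = tvm.target.cuda()
    remote = rpc.LocalSession()   # stand-in for a real remote device session
    ctx = remote.gpu(0)
    gops = measure_compute_mad(total_item=1 << 20, item_per_thread=1024,
                               base_type="float", bits=32, lanes=1,
                               target=target, target_host="llvm",
                               remote=remote, ctx=ctx, n_times=10)
    print("mad throughput: %.2f GOPS" % gops)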
Exemplo n.º 60
0
import numpy as np
import tvm

N = tvm.var('N') # Data set size
V = tvm.var('V') # Feature number
C = tvm.var('C') # Center number

data = tvm.placeholder((N, V), name='data')
center = tvm.placeholder((C, V), name='center')

# === Start computation
# Compute distances
rv = tvm.reduce_axis((0, V), name='rv')
dis = tvm.compute(
    (N, C),
    lambda n, c: tvm.sum(
        (data[n, rv] - center[c, rv]).astype('float64') *
        (data[n, rv] - center[c, rv]).astype('float64'),
        axis=rv),
    name='dis')

rc = tvm.reduce_axis((0, C), name='rc')
mse_n = tvm.compute((N,), lambda n: tvm.sum(dis[n, rc], axis=rc), name='mse_n')
rn = tvm.reduce_axis((0, N), name='rn')
mse = tvm.compute((1,), lambda i: tvm.sum(mse_n[rn], axis=rn), name='mse')

# === End computation

# Scheduling
s = tvm.create_schedule(mse.op)

# Compilation
calc = tvm.build(s, [data, center, mse])
assert calc
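A hedged usage sketch: run the compiled kernel on random inputs and check it against a NumPy reference. The sizes below are illustrative assumptions.
# Assumed sizes for illustration only.
num_points, num_features, num_centers = 1000, 16, 8
np_data = np.random.rand(num_points, num_features).astype('float32')
np_center = np.random.rand(num_centers, num_features).astype('float32')

ctx = tvm.cpu(0)
tvm_mse = tvm.nd.array(np.zeros((1,), dtype='float64'), ctx)
calc(tvm.nd.array(np_data, ctx), tvm.nd.array(np_center, ctx), tvm_mse)

# NumPy reference: total squared distance over all (point, center) pairs.
np_mse = ((np_data[:, None, :] - np_center[None, :, :]).astype('float64') ** 2).sum()
np.testing.assert_allclose(tvm_mse.asnumpy()[0], np_mse, rtol=1e-5)
print('total squared distance:', tvm_mse.asnumpy()[0])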