def intrin_func(ins, outs):
    """Build the body/reset/update packed calls for the GEMV intrinsic.

    Parameters
    ----------
    ins : sequence of buffers
        ``ins[0]`` and ``ins[1]`` are accessed through read pointers.
    outs : sequence of buffers
        ``outs[0]`` is accessed through a write pointer.

    Returns
    -------
    tuple
        ``(body, reset, update)`` where ``body`` calls ``"gemv"``,
        ``reset`` calls ``"fill_zero"`` and ``update`` calls ``"gemv_add"``.
        ``m``, ``n`` and ``l`` are taken from the enclosing scope.
    """
    lhs_ptr = ins[0].access_ptr("r")
    rhs_ptr = ins[1].access_ptr("r")
    dst_ptr = outs[0].access_ptr("w")

    # "gemv" and "gemv_add" take exactly the same argument list.
    gemv_args = (lhs_ptr, rhs_ptr, dst_ptr, m, n, l)

    compute_stmt = tvm.call_packed("gemv", *gemv_args)
    reset_stmt = tvm.call_packed("fill_zero", dst_ptr, m, n)
    update_stmt = tvm.call_packed("gemv_add", *gemv_args)
    return compute_stmt, reset_stmt, update_stmt
def intrin_func(ins, outs):
    """Emit the packed GEMV calls used as the intrinsic's body and update.

    ``ins`` must unpack into exactly two buffers; ``outs[0]`` is the
    destination. Both calls receive read pointers to the two inputs, a write
    pointer to the output, ``n`` from the enclosing scope and the first
    stride of the first input.

    Returns
    -------
    tuple
        ``(body, None, update)``; no reset statement is supplied.
    """
    lhs, rhs = ins
    dst = outs[0]

    lhs_ptr = lhs.access_ptr("r")
    rhs_ptr = rhs.access_ptr("r")
    dst_ptr = dst.access_ptr("w")

    # Identical arguments for both packed functions.
    shared_args = (lhs_ptr, rhs_ptr, dst_ptr, n, lhs.strides[0])

    compute_stmt = tvm.call_packed("gemv", *shared_args)
    accumulate_stmt = tvm.call_packed("gemv_add", *shared_args)
    return compute_stmt, None, accumulate_stmt
def intrin_func(ins, outs):
    """Return ``(body, None, update)`` packed calls for a strided GEMV.

    The two entries of ``ins`` are read through ``access_ptr("r")`` and
    ``outs[0]`` is written through ``access_ptr("w")``. ``n`` comes from the
    enclosing scope; the leading stride of the first input is forwarded as
    the last argument.
    """
    first_in, second_in = ins
    target = outs[0]

    first_ptr = first_in.access_ptr("r")
    second_ptr = second_in.access_ptr("r")
    target_ptr = target.access_ptr("w")

    def emit(func_name):
        # Both packed functions share one signature.
        return tvm.call_packed(
            func_name, first_ptr, second_ptr, target_ptr, n,
            first_in.strides[0])

    body_stmt = emit("gemv")
    update_stmt = emit("gemv_add")
    return body_stmt, None, update_stmt
def test_sort_np():
    """Compare the packed ``tvm.contrib.sort.argsort`` with ``np.argsort``.

    Random 6-D data is sorted in ascending order along axis 4 and the
    resulting indices are checked against NumPy's argsort on the same axis.
    """
    full_shape = (1, 2, 3, 4, 5, 6)
    sort_axis = 4
    # full_shape with the sorted axis dropped.
    counts_shape = (1, 2, 3, 4, 6)
    descending = False

    data = tvm.placeholder(full_shape, name='data')
    sort_num = tvm.placeholder(counts_shape, name="sort_num", dtype="int32")

    def emit_argsort(ins, outs):
        return tvm.call_packed(
            "tvm.contrib.sort.argsort",
            ins[0], ins[1], outs[0], sort_axis, descending)

    out = tvm.extern(
        data.shape, [data, sort_num], emit_argsort,
        dtype='int32', name="sort_tensor")

    device = tvm.cpu(0)
    schedule = tvm.create_schedule(out.op)
    sort_fn = tvm.build(schedule, [data, sort_num, out], "llvm")

    random_values = np.random.uniform(size=full_shape)
    expected = np.argsort(random_values, axis=sort_axis)
    # Every slice is given the full length of the sorted axis.
    counts = np.full(counts_shape, full_shape[sort_axis])

    data_nd = tvm.nd.array(np.array(random_values).astype(data.dtype), device)
    counts_nd = tvm.nd.array(np.array(counts).astype(sort_num.dtype), device)
    result_nd = tvm.nd.array(np.zeros(data_nd.shape, dtype=out.dtype), device)

    sort_fn(data_nd, counts_nd, result_nd)
    tvm.testing.assert_allclose(result_nd.asnumpy(), expected, rtol=1e-5)
def test_sort():
    """Check the packed argsort (descending, axis 1) on hand-written data.

    Each ``(batch, column)`` pair carries its own count in ``sort_num``; the
    expected indices are listed explicitly in ``expected_index``.
    """
    batch, length, width = 2, 5, 3
    sort_axis = 1
    descending = True

    data = tvm.placeholder((batch, length, width), name='data')
    sort_num = tvm.placeholder((batch, width), name="sort_num", dtype="int32")

    def emit_argsort(ins, outs):
        return tvm.call_packed(
            "tvm.contrib.sort.argsort",
            ins[0], ins[1], outs[0], sort_axis, descending)

    out = tvm.extern(
        data.shape, [data, sort_num], emit_argsort,
        dtype='int32', name="sort_tensor")

    raw_values = [
        [[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5], [1.5, 0, 0]],
        [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]],
    ]
    counts = [[1, 2, 3], [4, 5, 5]]
    expected_index = [
        [[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
        [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]],
    ]

    device = tvm.cpu(0)
    schedule = tvm.create_schedule(out.op)
    sort_fn = tvm.build(schedule, [data, sort_num, out], "llvm")

    data_nd = tvm.nd.array(np.array(raw_values).astype(data.dtype), device)
    counts_nd = tvm.nd.array(np.array(counts).astype(sort_num.dtype), device)
    result_nd = tvm.nd.array(np.zeros(data_nd.shape, dtype=out.dtype), device)

    sort_fn(data_nd, counts_nd, result_nd)
    tvm.testing.assert_allclose(
        result_nd.asnumpy(),
        np.array(expected_index).astype(out.dtype),
        rtol=1e-5)
def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"):
    """Select the top ``k`` entries of a tensor along one axis.

    The work is delegated to the packed function ``tvm.contrib.sort.topk``.

    Parameters
    ----------
    data : tvm.Tensor
        The input tensor.

    k : int, optional
        Number of top elements to select. When ``k < 1`` the output keeps the
        full extent of ``axis``.

    axis : int, optional
        Axis along which to sort the input tensor.

    ret_type : str, optional
        One of ``"both"``, ``"values"`` or ``"indices"``.
        "both": return both top k data and indices.
        "values": return top k data only.
        "indices": return top k indices only.

    is_ascend : boolean, optional
        Whether to sort in ascending or descending order.

    dtype : string, optional
        The data type of the indices output.

    Returns
    -------
    out : tvm.Tensor or List[tvm.Tensor]
        The result of ``tvm.extern`` with one output per requested buffer.
    """
    assert ret_type in ["both", "values", "indices"]

    input_buffer = api.decl_buffer(
        data.shape, data.dtype, "data_buf", data_alignment=8)

    result_shape = list(get_const_tuple(data.shape))
    if k >= 1:
        result_shape[axis] = k

    # Values come first, indices second, matching the packed call's order.
    result_buffers = []
    if ret_type in ["both", "values"]:
        value_buffer = api.decl_buffer(
            result_shape, data.dtype, "value_buf", data_alignment=8)
        result_buffers.append(value_buffer)
    if ret_type in ["both", "indices"]:
        index_buffer = api.decl_buffer(
            result_shape, dtype, "indices_buf", data_alignment=8)
        result_buffers.append(index_buffer)

    result_shapes = [result_shape] * len(result_buffers)

    def emit_topk(ins, outs):
        return tvm.call_packed(
            "tvm.contrib.sort.topk", ins[0], *outs,
            k, axis, ret_type, is_ascend)

    return tvm.extern(
        result_shapes,
        [data],
        emit_topk,
        in_buffers=[input_buffer],
        out_buffers=result_buffers,
        name="topk_cpu",
        tag="topk_cpu")
def test_sort():
    """Run the packed argsort in descending order along axis 1.

    The inputs, per-column counts and expected indices are all literal
    tables; the built function's output must match the expected table.
    """
    num_batches = 2
    num_rows = 5
    num_cols = 3
    axis = 1
    is_descend = True

    data = tvm.placeholder((num_batches, num_rows, num_cols), name='data')
    sort_num = tvm.placeholder(
        (num_batches, num_cols), name="sort_num", dtype="int32")

    out = tvm.extern(
        data.shape,
        [data, sort_num],
        lambda ins, outs: tvm.call_packed(
            "tvm.contrib.sort.argsort",
            ins[0], ins[1], outs[0], axis, is_descend),
        dtype='int32',
        name="sort_tensor")

    values = [[[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5],
               [1.5, 0, 0]],
              [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12],
               [13, 14, 15]]]
    valid_counts = [[1, 2, 3], [4, 5, 5]]
    reference = [[[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
                 [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]]]

    ctx = tvm.cpu(0)
    sched = tvm.create_schedule(out.op)
    compiled = tvm.build(sched, [data, sort_num, out], "llvm")

    values_nd = tvm.nd.array(np.array(values).astype(data.dtype), ctx)
    counts_nd = tvm.nd.array(
        np.array(valid_counts).astype(sort_num.dtype), ctx)
    indices_nd = tvm.nd.array(np.zeros(values_nd.shape, dtype=out.dtype), ctx)

    compiled(values_nd, counts_nd, indices_nd)

    reference_np = np.array(reference).astype(out.dtype)
    tvm.testing.assert_allclose(indices_nd.asnumpy(), reference_np, rtol=1e-5)
def test_sort_np():
    """Validate the packed argsort against NumPy on a random 6-D tensor.

    Sorting is ascending along axis 4; ``sort_num`` is filled with the full
    length of that axis for every slice.
    """
    shape = (1, 2, 3, 4, 5, 6)
    axis = 4
    shape_without_axis = (1, 2, 3, 4, 6)
    is_descend = False

    data = tvm.placeholder(shape, name='data')
    sort_num = tvm.placeholder(
        shape_without_axis, name="sort_num", dtype="int32")

    out = tvm.extern(
        data.shape,
        [data, sort_num],
        lambda ins, outs: tvm.call_packed(
            "tvm.contrib.sort.argsort",
            ins[0], ins[1], outs[0], axis, is_descend),
        dtype='int32',
        name="sort_tensor")

    ctx = tvm.cpu(0)
    sched = tvm.create_schedule(out.op)
    compiled = tvm.build(sched, [data, sort_num, out], "llvm")

    samples = np.random.uniform(size=shape)
    reference = np.argsort(samples, axis=axis)
    full_counts = np.full(shape_without_axis, shape[axis])

    samples_nd = tvm.nd.array(np.array(samples).astype(data.dtype), ctx)
    counts_nd = tvm.nd.array(
        np.array(full_counts).astype(sort_num.dtype), ctx)
    indices_nd = tvm.nd.array(
        np.zeros(samples_nd.shape, dtype=out.dtype), ctx)

    compiled(samples_nd, counts_nd, indices_nd)
    tvm.testing.assert_allclose(indices_nd.asnumpy(), reference, rtol=1e-5)
def intrin_func(ins, outs, sp):
    """Check the intrinsic's arguments and emit the ``"hw_func"`` call.

    Asserts that ``ins[0]`` is a ``tvm.schedule.Buffer`` whose first extent
    equals ``n`` and that ``sp[0]``/``sp[1]`` equal ``v``/``w`` (``n``, ``v``
    and ``w`` come from the enclosing scope). The packed call receives the
    data handles of ``ins[0]`` and ``outs[0]`` followed by ``sp[0]`` and
    ``sp[1]``.
    """
    source = ins[0]
    assert isinstance(source, tvm.schedule.Buffer)
    assert source.shape[0] == n
    assert sp[0] == v
    assert sp[1] == w

    call_args = (source.data, outs[0].data, sp[0], sp[1])
    return tvm.call_packed("hw_func", *call_args)
def test_min_repeat_ms():
    """Check that ``min_repeat_ms`` makes ``time_evaluator`` call more often.

    A registered packed function sleeps 100 ms and appends one character to a
    log file on every call, so the length of the file's first line counts the
    calls made so far.
    """
    workdir = tempdir()
    log_path = workdir.relpath("log")

    @tvm.register_func
    def my_debug(filename):
        """Sleep for 100 ms, then append one character to ``filename``."""
        time.sleep(0.1)
        with open(filename, "a") as fout:
            fout.write("c")

    def calls_so_far():
        # One character is written per call, all on the first line.
        with open(log_path, "r") as fin:
            return len(fin.readline())

    X = tvm.compute((), lambda: tvm.call_packed("my_debug", log_path))
    schedule = tvm.create_schedule(X.op)
    func = tvm.build(schedule, [X])

    scalar = tvm.nd.empty((), dtype="int32")

    timer = func.time_evaluator(
        func.entry_name, tvm.cpu(), number=1, repeat=1)
    timer(scalar)
    # NOTE(review): two calls for number=1/repeat=1 presumably means one
    # extra warm-up call by the evaluator -- confirm.
    assert calls_so_far() == 2

    timer = func.time_evaluator(
        func.entry_name, tvm.cpu(), number=1, repeat=1, min_repeat_ms=1000)
    timer(scalar)
    # make sure we get more than 10 calls
    assert calls_so_far() > 10 + 2
def test_stack_vm_basic():
    """Pass a buffer's symbolic extent to a Python callback via a packed call.

    The callback asserts that the value it receives equals the length of the
    concrete array the function is run with.
    """
    a = tvm.nd.array(np.zeros(10, dtype='float32'))

    @tvm.register_func
    def tvm_call_back_get_shape(shape0):
        print(shape0)
        assert shape0 == a.shape[0]

    length = tvm.var('n')
    buf = tvm.decl_buffer((length, ), tvm.float32)

    shape_call = tvm.call_packed("tvm_call_back_get_shape", buf.shape[0])
    stmt = tvm.make.Evaluate(shape_call)

    lowered = tvm.ir_pass.MakeAPI(stmt, "print_shape", [buf], 0, True)
    lowered = tvm.ir_pass.LowerTVMBuiltin(lowered)
    run_jit(lowered, lambda f: f(a))
def test_stack_vm_basic():
    """Run a one-statement function that reports a buffer extent.

    The statement is a packed call to ``tvm_call_back_get_shape`` with the
    first extent of a symbolically-shaped buffer; the callback compares it
    with the length of the 10-element array supplied at run time.
    """
    a = tvm.nd.array(np.zeros(10, dtype='float32'))

    @tvm.register_func
    def tvm_call_back_get_shape(shape0):
        print(shape0)
        assert shape0 == a.shape[0]

    def invoke(compiled):
        return compiled(a)

    extent = tvm.var('n')
    symbolic_buf = tvm.decl_buffer((extent, ), tvm.float32)
    body = tvm.make.Evaluate(
        tvm.call_packed("tvm_call_back_get_shape", symbolic_buf.shape[0]))

    api_func = tvm.ir_pass.MakeAPI(
        body, "print_shape", [symbolic_buf], 0, True)
    api_func = tvm.ir_pass.LowerTVMBuiltin(api_func)
    run_jit(api_func, invoke)
def lesson1():
    """Tutorial step: mix an extern CBLAS matmul with a schedulable bias add."""
    ######################################################################
    # Use Extern Tensor Function
    # --------------------------
    # :any:`tvm.extern` adds an extern array function call. Its first
    # argument declares the shape of the output tensor and its second
    # argument lists the inputs.
    #
    # The third argument is a function describing how to compute the result.
    # It receives a list of symbolic placeholders for the inputs and a list
    # of symbolic placeholders for the outputs, and returns the statement to
    # execute.
    #
    # Here that statement simply calls a registered TVM function, which
    # invokes a CBLAS call. TVM does not control the internals of the extern
    # array function and treats it as a black box. Schedulable TVM
    # computations can still be mixed in; below a bias term is added to the
    # result.
    #
    rows = 1024
    inner = 128
    cols = 235

    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((rows, inner), name='A')
    B = tvm.placeholder((inner, cols), name='B')

    def cblas_matmul(ins, outs):
        return tvm.call_packed(
            "tvm.contrib.cblas.matmul", ins[0], ins[1], outs[0], False, False)

    C = tvm.extern((rows, cols), [A, B], cblas_matmul, name="C")
    D = tvm.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
    schedule = tvm.create_schedule(D.op)

    ######################################################################
    # Verify the Result
    # -----------------
    # Build the function and compare its output with NumPy.
    #
    ctx = tvm.cpu(0)
    compiled = tvm.build(schedule, [A, B, D, bias], "llvm")

    a_nd = tvm.nd.array(
        np.random.uniform(size=(rows, inner)).astype(A.dtype), ctx)
    b_nd = tvm.nd.array(
        np.random.uniform(size=(inner, cols)).astype(B.dtype), ctx)
    d_nd = tvm.nd.array(np.zeros((rows, cols), dtype=D.dtype), ctx)
    bias_value = 10.0

    compiled(a_nd, b_nd, d_nd, bias_value)

    reference = np.dot(a_nd.asnumpy(), b_nd.asnumpy()) + 10
    np.testing.assert_allclose(d_nd.asnumpy(), reference, rtol=1e-5)
def trace1():
    """Call a Python debug hook from generated code and print the output.

    A packed function ``my_debug`` is registered, a compute op is defined
    whose every element is the result of calling it with the input buffer,
    and the lowered IR and the final output array are printed.
    """
    @tvm.register_func
    def my_debug(x):
        print("array=", x.asnumpy())
        return 0

    x = tvm.placeholder((4, ), name="x", dtype="int32")
    x_buf = tvm.decl_buffer(x.shape, dtype=x.dtype)
    y = tvm.compute(x.shape, lambda i: tvm.call_packed("my_debug", x_buf))
    schedule = tvm.create_schedule(y.op)

    # The placeholder is bound to the explicit buffer handed to my_debug.
    bindings = {x: x_buf}
    print(tvm.lower(schedule, [x, y], binds=bindings, simple_mode=True))
    compiled = tvm.build(schedule, [x_buf, y], binds={x: x_buf})

    x_nd = tvm.nd.array(np.ones((4, ), dtype=x.dtype))
    y_nd = tvm.nd.array(np.zeros((4, ), dtype=y.dtype))
    compiled(x_nd, y_nd)
    print(y_nd)
def test_stack_vm_loop():
    """Build a loop that turns a zero buffer into a ramp and run it.

    The IR writes ``A[i + 1] = A[i] + 1`` for ``i`` in ``[0, n - 1)`` and
    emits a packed call to ``tvm_stack_vm_print`` with the loop variable on
    each iteration. Starting from zeros, the buffer must end up equal to
    ``np.arange(len)``.
    """
    dtype = 'int64'
    n = tvm.var('n')
    Ab = tvm.decl_buffer((n, ), dtype)

    ib = tvm.ir_builder.create()
    A = ib.buffer_ptr(Ab)
    # The loop variable is created by for_range itself, so no separate
    # tvm.var('i') is declared beforehand.
    with ib.for_range(0, n - 1, "i") as i:
        A[i + 1] = A[i] + 1
        ib.emit(tvm.call_packed("tvm_stack_vm_print", i))

    stmt = ib.get()
    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
    a = tvm.nd.array(np.zeros(10, dtype=dtype))

    def check(f):
        f(a)
        np.testing.assert_equal(a.asnumpy(), np.arange(a.shape[0]))

    run_jit(fapi, check)
def test_static_init():
    """Pass a ``tvm_static_handle`` to a Python callback and run via LLVM.

    The IR consists of a single packed call to ``test_static_callback`` with
    the static handle and the buffer. The callback asserts that the handle
    arrives as a ``ctypes.c_void_p`` and returns it.
    """
    dtype = 'int64'
    n = tvm.var('n')
    Ab = tvm.decl_buffer((n, ), dtype)

    ib = tvm.ir_builder.create()
    handle = tvm.call_intrin("handle", "tvm_static_handle")
    # The callback is referenced by name here and registered just below.
    ib.emit(tvm.call_packed("test_static_callback", handle, Ab))

    @tvm.register_func("test_static_callback")
    def test_cb(sh, A):
        assert isinstance(sh, ctypes.c_void_p)
        return sh

    stmt = ib.get()
    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
    f = tvm.codegen.build_module(fapi, "llvm")
    a = tvm.nd.array(np.zeros(10, dtype=dtype))
    f(a)
def test_static_init():
    """Check that a static handle reaches a registered Python callback.

    A packed call to ``test_static_callback`` is emitted with a
    ``tvm_static_handle`` intrinsic and the buffer as arguments, the function
    is built with the ``llvm`` target and run on a 10-element array. The
    callback asserts the handle is a ``ctypes.c_void_p``.
    """
    dtype = 'int64'
    n = tvm.var('n')
    Ab = tvm.decl_buffer((n, ), dtype)

    ib = tvm.ir_builder.create()
    handle = tvm.call_intrin("handle", "tvm_static_handle")
    # Only the callback's name is needed at this point; the function itself
    # is registered right after the statement is emitted.
    ib.emit(
        tvm.call_packed("test_static_callback", handle, Ab))

    @tvm.register_func("test_static_callback")
    def test_cb(sh, A):
        assert isinstance(sh, ctypes.c_void_p)
        return sh

    stmt = ib.get()
    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
    f = tvm.codegen.build_module(fapi, "llvm")
    a = tvm.nd.array(np.zeros(10, dtype=dtype))
    f(a)
def test_min_repeat_ms():
    """Verify that ``min_repeat_ms`` raises the number of timed calls.

    The registered ``my_debug`` function sleeps 100 ms and appends one
    character to a log file per call; the first line's length is therefore
    the number of calls made so far.
    """
    scratch = tempdir()
    log_file = scratch.relpath("log")

    @tvm.register_func
    def my_debug(filename):
        """Sleep for 100 ms, then append one character to the named file."""
        time.sleep(0.1)
        # The argument is unwrapped through ctypes before being used as a
        # path.
        path = ctypes.c_char_p(filename.value).value
        with open(path, "a") as fout:
            fout.write("c")

    def count_calls():
        with open(log_file, "r") as fin:
            first_line = fin.readline()
        return len(first_line)

    X = tvm.compute((), lambda: tvm.call_packed("my_debug", log_file))
    sched = tvm.create_schedule(X.op)
    func = tvm.build(sched, [X])

    out_nd = tvm.nd.empty((), dtype="int32")

    evaluator = func.time_evaluator(
        func.entry_name, tvm.cpu(), number=1, repeat=1)
    evaluator(out_nd)
    # NOTE(review): the expected count of 2 presumably includes one warm-up
    # call made by the evaluator -- confirm.
    assert count_calls() == 2

    evaluator = func.time_evaluator(
        func.entry_name, tvm.cpu(), number=1, repeat=1, min_repeat_ms=1000)
    evaluator(out_nd)
    # make sure we get more than 10 calls
    assert count_calls() > 10 + 2
def test_stack_vm_loop():
    """Build a ramp-filling loop over a ``size_var``-shaped buffer and run it.

    For ``i`` in ``[0, n - 1)`` the IR stores ``A[i] + 1`` into ``A[i + 1]``
    and emits a packed call to ``tvm_stack_vm_print`` with the loop variable.
    Run on a zero-filled array, the result must equal ``np.arange(len)``.
    """
    dtype = 'int64'
    n = tvm.size_var('n')
    Ab = tvm.decl_buffer((n, ), dtype)

    ib = tvm.ir_builder.create()
    A = ib.buffer_ptr(Ab)
    # for_range binds its own loop variable; a separately declared
    # size_var('i') would be overwritten here without ever being used.
    with ib.for_range(0, n - 1, "i") as i:
        A[i + 1] = A[i] + 1
        ib.emit(tvm.call_packed("tvm_stack_vm_print", i))

    stmt = ib.get()
    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
    a = tvm.nd.array(np.zeros(10, dtype=dtype))

    def check(f):
        f(a)
        np.testing.assert_equal(a.asnumpy(), np.arange(a.shape[0]))

    run_jit(fapi, check)
def main():
    """Build and run an extern CBLAS matmul followed by a bias add.

    ``C`` is produced by the packed function ``tvm.contrib.cblas.matmul`` and
    ``D`` adds the scalar ``bias`` to every element of ``C``. The compiled
    function is run on random inputs and its output is printed and compared
    with ``np.dot(a, b) + 10``.
    """
    ctx = tvm.cpu(0)
    n = 1024
    l = 128
    m = 235
    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = tvm.extern(
        (n, m), [A, B],
        lambda ins, outs: tvm.call_packed(
            "tvm.contrib.cblas.matmul", ins[0], ins[1], outs[0], False, False),
        name="C")
    D = tvm.compute(C.shape, lambda i, j: C(i, j) + bias, name="D")
    s = tvm.create_schedule(D.op)
    f = tvm.build(s, [A, B, D, bias], "llvm")

    a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
    d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
    bb = 10.0

    # Run the compiled function; without this call `d` would still hold the
    # zeros it was initialised with and the comparison below could not pass.
    f(a, b, d, bb)

    print(d.asnumpy())
    tvm.testing.assert_allclose(
        d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)
def extern_generator(ins, outs):
    """Return the statement for the extern op: a single packed call.

    The packed function ``my_extern_array_func2`` is called with the first
    input and the first output.
    """
    source = ins[0]
    target = outs[0]
    return tvm.call_packed("my_extern_array_func2", source, target)
def intrin_func(ins, outs):
    """Emit a packed call to ``"op"`` with the single input and first output.

    ``ins`` must contain exactly one element; unpacking fails otherwise.
    """
    (source,) = ins
    target = outs[0]
    stmt = tvm.call_packed("op", source, target)
    return stmt
def intrin_func(ins, outs):
    """Emit an argument-less packed call to ``"multivadd"``.

    Neither ``ins`` nor ``outs`` is used.
    """
    stmt = tvm.call_packed("multivadd")
    return stmt
def intrin_func(ins, outs):
    """Emit a packed call to ``"op"`` with the first input and first output."""
    source, target = ins[0], outs[0]
    return tvm.call_packed("op", source, target)
def intrin_func(ins, outs):
    """Emit a packed call to ``"vadd"`` with both inputs and the first output.

    ``ins`` must unpack into exactly two elements.
    """
    lhs, rhs = ins
    result = outs[0]
    call_args = (lhs, rhs, result)
    return tvm.call_packed("vadd", *call_args)
def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1):
    """Non-maximum suppression operator for object detection.

    The scores (index 1 of the last dimension) are first argsorted per batch
    through the packed function ``tvm.contrib.sort.argsort``; the sorted
    indices are then handed to ``nms_ir`` together with the data and the
    valid counts.

    Parameters
    ----------
    data: tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    nms_threshold : float
        Non-maximum suppression threshold.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].

    Example
    --------
    .. code-block:: python

        # An example to use nms
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
        nms_threshold = 0.7
        force_suppress = True
        nms_topk = -1
        out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk)
        # `size=` is required: a bare positional tuple would be taken as `low`.
        np_data = np.random.uniform(size=dshape).astype("float32")
        np_valid_count = np.array([4]).astype("int32")
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "llvm")
        ctx = tvm.cpu()
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]

    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype,
                                      "valid_count_buf", data_alignment=4)
    data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)

    # Extract the score column so it can be sorted on its own.
    score_axis = 1
    score_shape = (batch_size, num_anchors)
    score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis])
    score_tensor_buf = api.decl_buffer(score_tensor.shape, data.dtype,
                                       "score_tensor_buf", data_alignment=8)

    sort_tensor_dtype = "int32"
    sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype,
                                      "sort_tensor_buf", data_alignment=8)

    # score_axis (1) doubles as the sort axis of the 2-D score tensor; the
    # trailing True is the packed function's last (is_descend) argument.
    sort_tensor = tvm.extern(
        score_shape,
        [score_tensor, valid_count],
        lambda ins, outs: tvm.call_packed(
            "tvm.contrib.sort.argsort", ins[0], ins[1],
            outs[0], score_axis, True),
        dtype=sort_tensor_dtype,
        in_buffers=[score_tensor_buf, valid_count_buf],
        out_buffers=sort_tensor_buf,
        name="nms_sort")

    out = tvm.extern(
        data.shape,
        [data, sort_tensor, valid_count],
        lambda ins, outs: nms_ir(
            ins[0], ins[1], ins[2], outs[0], nms_threshold,
            force_suppress, nms_topk),
        dtype="float32",
        in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
        tag="nms")

    return out
def intrin_func(ins, outs):
    """Validate the input buffer and emit the ``"vadd"`` packed call.

    Asserts that ``ins[0]`` is a ``tvm.schedule.Buffer`` whose first extent
    has the constant value ``n`` (from the enclosing scope). The call receives
    the data handles of ``ins[0]`` and ``outs[0]`` plus that first extent.
    """
    source = ins[0]
    assert isinstance(source, tvm.schedule.Buffer)
    assert source.shape[0].value == n

    extent = source.shape[0]
    return tvm.call_packed("vadd", source.data, outs[0].data, extent)
# The compute function takes list of symbolic placeholder for the inputs, # list of symbolic placeholder for the outputs and returns the executing statement. # # In this case we simply call a registered TVM function, which invokes a CBLAS call. # TVM does not control internal of the extern array function and treats it as blackbox. # We can further mix schedulable TVM calls that add a bias term to the result. # n = 1024 l = 128 m = 235 bias = tvm.var('bias', dtype=tvm.float32) A = tvm.placeholder((n, l), name='A') B = tvm.placeholder((l, m), name='B') C = tvm.extern((n, m), [A, B], lambda ins, outs: tvm.call_packed( "tvm.contrib.cblas.matmul", ins[0], ins[1], outs[0], False, False), name="C") D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D") s = tvm.create_schedule(D.op) ###################################################################### # Verify the Result # ----------------- # We can verify that the result matches what we expected. # ctx = tvm.cpu(0) f = tvm.build(s, [A, B, D, bias], "llvm") a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) bb = 10.0
def extern_generator(ins, outs):
    """Produce the extern op's statement as one packed call.

    ``my_extern_array_func1`` receives the first input followed by the first
    output.
    """
    call_args = (ins[0], outs[0])
    return tvm.call_packed("my_extern_array_func1", *call_args)
# f = tvm.build(s, [A, B, bias,D], 'llvm') a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx=ctx) b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx=ctx) d = tvm.nd.array(np.zeros(shape=(n, m), dtype=D.dtype), ctx=ctx) bb = 10.0 f(a, b, d, bb) np.testing.assert_allclose(d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5) print(d.shape) @tvm.register_func('tvm.contrib.my_tvm_add_one') def my_tvm_add_one(x, y): print('my tvm add one signatures :%s, %s' % (type(x), type(y))) tvm.nd.array(x.asnumpy() + 1).copyto(y) A = tvm.placeholder((n, ), name='A') B = tvm.extern(A.shape, [A], lambda ins, outs: tvm.call_packed('tvm.contrib.my_tvm_add_one', ins[0], outs[0]), name='C') s = tvm.create_schedule(B.op) f = tvm.build(s, [A, B], 'llvm') a = tvm.nd.array(np.random.uniform(size=(n, )).astype(A.dtype), ctx=ctx) b = tvm.nd.array(np.random.uniform(size=(n, )).astype(B.dtype), ctx=ctx) f(a, b) np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1, rtol=1e-5) print(b.shape)
def argsort(data, valid_count, axis=-1, is_ascend=1, dtype="float32", flag=0):
    """Performs sorting along the given axis and returns an array
    of indices having the same shape as an input array that index
    data in sorted order.

    Depending on ``flag`` the work is delegated to the packed function
    ``tvm.contrib.sort.argsort_nms`` (``flag`` true, ``valid_count`` is passed
    along) or ``tvm.contrib.sort.argsort`` (``flag`` false, ``valid_count`` is
    not used).

    Parameters
    ----------
    data : tvm.Tensor
        The input tensor.

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes only for ssd.
        Only read when ``flag`` is true.

    axis : optional, int
        Axis along which to sort the input tensor. Defaults to -1.

    is_ascend : optional, boolean
        Whether to sort in ascending or descending order.

    dtype : optional, string
        DType of the output indices. Only honoured when ``flag`` is false;
        with ``flag`` true the output is always ``int32``.

    flag : optional, boolean
        Whether valid_count is valid.

    Returns
    -------
    out : tvm.Tensor
        Sorted index tensor.

    Example
    --------
    .. code-block:: python

        # An example to use argsort
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
        axis = 0
        is_ascend = False
        flag = False
        # `flag` must be passed by keyword: the fifth positional slot is `dtype`.
        out = argsort(data, valid_count, axis, is_ascend, flag=flag)
        # `size=` is required: a bare positional tuple would be taken as `low`.
        np_data = np.random.uniform(size=dshape).astype("float32")
        np_valid_count = np.array([4]).astype("int32")
        s = topi.generic.schedule_argsort(out)
        f = tvm.build(s, [data, valid_count, out], "llvm")
        ctx = tvm.cpu()
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
    if flag:
        # NMS variant: valid_count is an extra input and indices are int32.
        valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype,
                                          "valid_count_buf", data_alignment=4)
        out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=8)
        out = tvm.extern(
            data.shape,
            [data, valid_count],
            lambda ins, outs: tvm.call_packed(
                "tvm.contrib.sort.argsort_nms", ins[0], ins[1],
                outs[0], axis, is_ascend),
            dtype="int32",
            in_buffers=[data_buf, valid_count_buf],
            out_buffers=out_buf,
            name="argsort_nms_cpu",
            tag="argsort_nms_cpu")
    else:
        # Plain variant: only the data is passed and `dtype` sets the output type.
        out_buf = api.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
        out = tvm.extern(
            data.shape,
            [data],
            lambda ins, outs: tvm.call_packed(
                "tvm.contrib.sort.argsort", ins[0],
                outs[0], axis, is_ascend),
            dtype=dtype,
            in_buffers=[data_buf],
            out_buffers=out_buf,
            name="argsort_cpu",
            tag="argsort_cpu")
    return out
def extern_func(ins, outs):
    """Emit the ``"myadd"`` packed call for one input and two outputs.

    Asserts that ``ins[0]`` is a ``tvm.schedule.Buffer``. The call receives
    the data handles of ``ins[0]``, ``outs[0]`` and ``outs[1]`` followed by
    ``m`` from the enclosing scope.
    """
    source = ins[0]
    assert isinstance(source, tvm.schedule.Buffer)

    first_out = outs[0]
    second_out = outs[1]
    return tvm.call_packed(
        "myadd", source.data, first_out.data, second_out.data, m)
# The compute function takes list of symbolic placeholder for the inputs, # list of symbolic placeholder for the outputs and returns the executing statement. # # In this case we simply call a registered tvm function, which invokes a CBLAS call. # TVM does not control internal of the extern array function and treats it as blackbox. # We can further mix schedulable TVM calls that add a bias term to the result. # n = 1024 l = 128 m = 235 bias = tvm.var('bias', dtype=tvm.float32) A = tvm.placeholder((n, l), name='A') B = tvm.placeholder((l, m), name='B') C = tvm.extern( (n, m), [A, B], lambda ins, outs: tvm.call_packed("tvm.contrib.cblas.matmul", ins[0], ins[ 1], outs[0], False, False), name="C") D = tvm.compute(C.shape, lambda i, j: C[i, j] + bias, name="D") s = tvm.create_schedule(D.op) ###################################################################### # Verify the Result # ----------------- # We can verify that the result matches what we expected. # ctx = tvm.cpu(0) f = tvm.build(s, [A, B, D, bias], "llvm") a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx) d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx) bb = 10.0
def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1):
    """Non-maximum suppression operator for object detection.

    Scores are taken from index 1 of the last dimension, argsorted through
    the packed function ``tvm.contrib.sort.argsort``, and the sorted indices
    are passed to ``nms_ir`` along with the data and the valid counts.

    Parameters
    ----------
    data: tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    nms_threshold : float
        Non-maximum suppression threshold.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].

    Example
    --------
    .. code-block:: python

        # An example to use nms
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
        nms_threshold = 0.7
        force_suppress = True
        nms_topk = -1
        out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk)
        np_data = np.random.uniform(size=dshape).astype("float32")
        np_valid_count = np.array([4]).astype("int32")
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "llvm")
        ctx = tvm.cpu()
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    score_axis = 1
    sort_tensor_dtype = "int32"
    valid_count_dtype = "int32"

    batch_size, num_anchors = data.shape[0], data.shape[1]
    score_shape = (batch_size, num_anchors)

    valid_count_buf = api.decl_buffer(
        valid_count.shape, valid_count_dtype,
        "valid_count_buf", data_alignment=4)
    data_buf = api.decl_buffer(
        data.shape, data.dtype, "data_buf", data_alignment=8)

    # Pull the score column out so it can be sorted on its own.
    score_tensor = tvm.compute(
        score_shape, lambda i, j: data[i, j, score_axis])
    score_tensor_buf = api.decl_buffer(
        score_tensor.shape, data.dtype,
        "score_tensor_buf", data_alignment=8)
    sort_tensor_buf = api.decl_buffer(
        score_shape, sort_tensor_dtype,
        "sort_tensor_buf", data_alignment=8)

    def emit_sort(ins, outs):
        return tvm.call_packed(
            "tvm.contrib.sort.argsort",
            ins[0], ins[1], outs[0], score_axis, True)

    def emit_nms(ins, outs):
        return nms_ir(
            ins[0], ins[1], ins[2], outs[0],
            nms_threshold, force_suppress, nms_topk)

    sort_tensor = tvm.extern(
        score_shape,
        [score_tensor, valid_count],
        emit_sort,
        dtype=sort_tensor_dtype,
        in_buffers=[score_tensor_buf, valid_count_buf],
        out_buffers=sort_tensor_buf,
        name="nms_sort")

    return tvm.extern(
        data.shape,
        [data, sort_tensor, valid_count],
        emit_nms,
        dtype="float32",
        in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
        tag="nms")
def non_max_suppression(data, valid_count, max_output_size=-1,
                        iou_threshold=0.5, force_suppress=False, top_k=-1,
                        id_index=0, return_indices=True, invalid_to_bottom=False):
    """Non-maximum suppression operator for object detection.

    Scores (index 1 of the last dimension) are argsorted through the packed
    function ``tvm.contrib.sort.argsort``; the sorted indices are then passed
    to ``hybrid_nms``, which yields both a box tensor and a box-index tensor.

    Parameters
    ----------
    data : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    max_output_size : optional, int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : optional, float
        Non-maximum suppression threshold.

    force_suppress : optional, boolean
        Whether to suppress all detections regardless of class_id.

    top_k : optional, int
        Keep maximum top k detections before nms, -1 for no limit.

    id_index : optional, int
        index of the class categories, -1 to disable.

    return_indices : optional, boolean
        Whether to return box indices in input data. Defaults to True.

    invalid_to_bottom : optional, boolean
        Whether to move all valid bounding boxes to the top.
        Only applied when ``return_indices`` is False.

    Returns
    -------
    out : tvm.Tensor
        When ``return_indices`` is False: 3-D tensor with shape
        [batch_size, num_anchors, 6]. When ``return_indices`` is True (the
        default): the box-index tensor produced by ``hybrid_nms``.

    Example
    --------
    .. code-block:: python

        # An example to use non_max_suppression
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
        iou_threshold = 0.7
        force_suppress = True
        top_k = -1
        # return_indices=False selects the box tensor rather than the indices.
        out = non_max_suppression(data, valid_count, iou_threshold=iou_threshold,
                                  force_suppress=force_suppress, top_k=top_k,
                                  return_indices=False)
        # `size=` is required: a bare positional tuple would be taken as `low`.
        np_data = np.random.uniform(size=dshape).astype("float32")
        np_valid_count = np.array([4]).astype("int32")
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "llvm")
        ctx = tvm.cpu()
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]

    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype,
                                      "valid_count_buf", data_alignment=4)

    # Extract the score column so it can be sorted on its own.
    score_axis = 1
    score_shape = (batch_size, num_anchors)
    score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis])
    score_tensor_buf = api.decl_buffer(score_tensor.shape, data.dtype,
                                       "score_tensor_buf", data_alignment=8)

    sort_tensor_dtype = "int32"
    sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype,
                                      "sort_tensor_buf", data_alignment=8)

    # score_axis (1) doubles as the sort axis of the 2-D score tensor; the
    # trailing True is the packed function's last (is_descend) argument.
    sort_tensor = tvm.extern(
        score_shape,
        [score_tensor, valid_count],
        lambda ins, outs: tvm.call_packed(
            "tvm.contrib.sort.argsort", ins[0], ins[1],
            outs[0], score_axis, True),
        dtype=sort_tensor_dtype,
        in_buffers=[score_tensor_buf, valid_count_buf],
        out_buffers=sort_tensor_buf,
        name="nms_sort")

    # Scalar options are wrapped as typed constants for hybrid_nms.
    out, box_indices = hybrid_nms(data, sort_tensor, valid_count,
                                  tvm.const(max_output_size, dtype="int32"),
                                  tvm.const(iou_threshold, dtype="float32"),
                                  tvm.const(force_suppress, dtype="bool"),
                                  tvm.const(top_k, dtype="int32"),
                                  tvm.const(id_index, dtype="int32"))

    # Rearranging only matters when the boxes themselves are returned.
    if not return_indices and invalid_to_bottom:
        out = hybrid_rearrange_out(out)

    return box_indices if return_indices else out
import tvm
import numpy as np


@tvm.register_func
def my_debug(x):
    """Print the array received from the generated code and return 0."""
    print("array=", x.asnumpy())
    return 0


x = tvm.placeholder((4, ), name="x", dtype="int32")
# Explicit buffer for x so that it can be handed to the packed call.
xbuffer = tvm.decl_buffer(x.shape, dtype=x.dtype)

# Every element of y is the result of calling my_debug with the input buffer.
y = tvm.compute(x.shape, lambda i: tvm.call_packed("my_debug", xbuffer))
s = tvm.create_schedule(y.op)

# Bind the placeholder to the explicit buffer when lowering and building.
print(tvm.lower(s, [x, y], binds={x: xbuffer}, simple_mode=True))

f = tvm.build(s, [xbuffer, y], binds={x: xbuffer})
xnd = tvm.nd.array(np.ones((4, ), dtype=x.dtype))
ynd = tvm.nd.array(np.zeros((4, ), dtype=y.dtype))
f(xnd, ynd)