def _alter_conv3d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current _, outs = relay.backend.compile_engine.select_implementation( relay.op.get("nn.conv3d"), attrs, tinfos, out_type, target) workload = autotvm.task.get_workload(outs) if workload is None: # The best implementation is not an AutoTVM template, # we then assume it's not necessary to alter this op. return None cfg = dispatch_ctx.query(target, workload) if cfg.is_fallback: # if is fallback, clear query cache and return None autotvm.task.clear_fallback_cache(target, workload) return None topi_tmpl = workload[0] new_attrs = {k: attrs[k] for k in attrs.keys()} strides = attrs.get_int_tuple("strides") padding = attrs.get_int_tuple("padding") dilation = attrs.get_int_tuple("dilation") groups = attrs.get_int('groups') data_layout = attrs["data_layout"] kernel_layout = attrs["kernel_layout"] data, kernel = tinfos out_dtype = out_type.dtype if topi_tmpl == "conv3d_ncdhw_winograd.cuda": if dilation != (1, 1, 1): logger.warning("Does not support weight pre-transform for dilated 3D convolution.") return None assert data_layout == "NCDHW" and kernel_layout == "OIDHW" N, CI, D, H, W = get_const_tuple(data.shape) CO, _, KD, KH, KW = get_const_tuple(kernel.shape) # Pre-compute weight transformation in winograd tile_size = _infer_tile_size(tinfos[0], tinfos[1]) weight = relay.nn.contrib_conv3d_winograd_weight_transform(inputs[1], tile_size=tile_size) new_attrs['tile_size'] = tile_size new_attrs['channels'] = CO # Store the same config for the altered operators (workload) new_data = data # Check if depth is transformed or not if 2 < KD < 8 and KD == KH: new_weight = te.placeholder( (KD + tile_size - 1, KH + tile_size - 1, KW + tile_size - 1, CO, CI), dtype=kernel.dtype) else: new_weight = te.placeholder( (KH + tile_size - 1, KW + tile_size - 1, KD, CO, CI), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_weight, strides, padding, dilation, out_dtype], "conv3d_ncdhw_winograd_without_weight_transform.cuda") dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_conv3d_winograd_without_weight_transform( inputs[0], weight, **new_attrs) return None
def verify_conv2d_NCHWc_int8( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_height = in_width = in_size A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype="int8") W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype="int8") bias = te.placeholder( (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype="int8") a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) # convert to NCHWc _, _, out_height, out_width = c_np.shape c_np = c_np.reshape( (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width)).transpose(0, 1, 3, 4, 2) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return if target == "cuda" and not tvm.contrib.nvcc.have_int8( dev.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % target) with tvm.target.Target(target): C = topi.cuda.conv2d_NCHWc_int8(A, W, (stride, stride), padding, (dilation, dilation), "NCHW", dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.cuda.schedule_conv2d_NCHWc_int8([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) for target in ["cuda"]: check_target(target)
def verify_conv2d_nchw_int8( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_height = in_width = in_size A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype="int8") W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype="int8") bias = te.placeholder((num_filter, 1, 1), name="bias", dtype="int8") a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def verify_workload_padding(): _, _, out_height, out_width = get_const_tuple(c_np.shape) wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) # for testing functionality, # we choose arbitrary int32_lanes and num_int8_elements can divide the channel, # regardless of the performance. int32_lanes, num_int8_elements = num_filter, in_channel # check if tile_ow candidates are the factors of the right output weight. cfg = autotvm.get_config() fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements) ow_tile = np.prod(cfg["tile_ow"].size) tvm.testing.assert_allclose(ow_tile, out_width) def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return if target == "cuda" and not tvm.contrib.nvcc.have_int8( dev.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % target) with tvm.target.Target(target): C = topi.cuda.conv2d_nchw_int8(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.cuda.schedule_conv2d_nchw_int8([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) verify_workload_padding() for target in ["cuda"]: check_target(target)
import tvm from tvm import te from tensorizer.intrinsics import INTRINSICS import numpy as np n, m, k = 128, 768, 3072 a = te.placeholder((n, k), 'float16') b = te.placeholder((k, m), 'float16') block_k = 4 rv = te.reduce_axis((0, k // block_k), ) def compute(xo, yo, z, xi, yi): x = xo * 16 + xi y = yo * 16 + yi lhs = a[x, z * (k // block_k) + rv].astype('float32') rhs = b[rv + z * (k // block_k), y].astype('float32') return te.sum(lhs * rhs, axis=[rv]) c = te.compute((n // 16, m // 16, block_k, 16, 16), compute) blkX = tvm.te.thread_axis('blockIdx.x') blkY = tvm.te.thread_axis('blockIdx.y') thrY = tvm.te.thread_axis('threadIdx.y') thrX = tvm.te.thread_axis('threadIdx.x') sch = te.create_schedule(c.op)
def test_dwarf_debug_information(): nn = 1024 n = tvm.runtime.convert(nn) A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) s[C].parallel(xo) s[C].vectorize(xi) def check_llvm_object(): if not tvm.runtime.enabled("llvm"): return if tvm.target.codegen.llvm_version_major() < 5: return if tvm.target.codegen.llvm_version_major() > 6: return # build two functions f2 = tvm.lower(s, [A, B, C], name="fadd1") f1 = tvm.lower(s, [A, B, C], name="fadd2") m = tvm.build([f1, f2], "llvm") temp = util.tempdir() o_path = temp.relpath("temp.o") m.save(o_path) import re import shutil import subprocess import sys # Try the dwarfdump utility (OS X) if shutil.which("dwarfdump"): output = subprocess.check_output(["dwarfdump", o_path]) assert re.search(r"""DW_AT_name\\t\("fadd1"\)""", str(output)) assert re.search(r"""DW_AT_name\\t\("fadd2"\)""", str(output)) # Try gobjdump (OS X) if shutil.which("gobjdump"): output = subprocess.check_output(["gobjdump", "--dwarf", o_path]) assert re.search(r"""DW_AT_name.*fadd1""", str(output)) assert re.search(r"""DW_AT_name.*fadd2""", str(output)) # Try objdump (Linux) - Darwin objdump has different DWARF syntax. if shutil.which("objdump") and sys.platform != 'darwin': output = subprocess.check_output(["objdump", "--dwarf", o_path]) assert re.search(r"""DW_AT_name.*fadd1""", str(output)) assert re.search(r"""DW_AT_name.*fadd2""", str(output)) def check_llvm_ir(): if not tvm.runtime.enabled("llvm"): return if tvm.target.codegen.llvm_version_major() < 5: return if tvm.target.codegen.llvm_version_major() > 6: return # build two functions f2 = tvm.lower(s, [A, B, C], name="fadd1") f1 = tvm.lower(s, [A, B, C], name="fadd2") m = tvm.build([f1, f2], target="llvm -target=aarch64-linux-gnu") ll = m.get_source("ll") # On non-Darwin OS, don't explicitly specify DWARF version. import re assert not re.search(r""""Dwarf Version""""", ll) assert re.search(r"""llvm.dbg.value""", ll) # Try Darwin, require DWARF-2 m = tvm.build([f1, f2], target="llvm -target=x86_64-apple-darwin-macho") ll = m.get_source("ll") assert re.search(r"""i32 4, !"Dwarf Version", i32 2""", ll) assert re.search(r"""llvm.dbg.value""", ll) check_llvm_object() check_llvm_ir()
def make_matrix_mul(shapeA, transposeA, shapeB, transposeB, tgt, tgt_host, func_name, dtype="float32"): """TODO: Your code here""" """Hint: use tvm.reduce_axis, tvm.sum""" """Hint: treat 4 cases of transposeA, transposeB separately""" """Hint: for tvm schedule, use split, reorder, vectorize, parallel""" """Hint: debug tvm schedule using tvm.lower""" A=te.placeholder(shapeA,dtype=dtype,name="A") B=te.placeholder(shapeB,dtype=dtype, name="B") def transpose(mat): return te.compute((mat.shape[1],mat.shape[0]),lambda i,j:mat[j][i]) AA=A if not transposeA else transpose(A) BB=B if not transposeB else transpose(B) k=te.reduce_axis((0,AA.shape[1]),name="k") C=te.compute((AA.shape[0],BB.shape[1]),lambda i,j:te.sum(AA[i][k]*BB[k][j],axis =k)) s=te.create_schedule(C.op) if tgt=="llvm": xo,yo,xi,yi=s[C].tile(C.op.axis[0],C.op.axis[1],32,32) k,=s[C].op.reduce_axis ko,ki=s[C].split(k,factor=4) s[C].reorder(xo,yo,ko,xi,yi,ki) # s[C].parallel(ki) if tgt=="cuda": if transposeA: xx1,xx2=s[AA].split(AA.op.axis[0],factor=32) s[AA].bind(xx1,te.thread_axis("blockIdx.x")) s[AA].bind(xx2,te.thread_axis("threadIdx.x")) if transposeB: yy1,yy2=s[BB].split(BB.op.axis[0],factor=32) s[BB].bind(yy1, te.thread_axis("blockIdx.y")) s[BB].bind(yy2, te.thread_axis("threadIdx.y")) x1,x2=s[C].split(C.op.axis[0],factor =32) y1,y2=s[C].split(C.op.axis[1],factor=32) # s[C].reorder(x1,y1,x2,y2) s[C].bind(x1,te.thread_axis("blockIdx.x")) s[C].bind(y1,te.thread_axis("blockIdx.y")) s[C].bind(x2,te.thread_axis("threadIdx.x")) s[C].bind(y2,te.thread_axis("threadIdx.y")) # bn = 32 # CC = s.cache_write(C, 'global') # xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) # s[CC].compute_at(s[C], yo) # xc, yc = s[CC].op.axis # k, = s[CC].op.reduce_axis # ko, ki = s[CC].split(k, factor=4) # s[CC].reorder(ko, xc, ki, yc) # s[CC].unroll(ki) # s[CC].vectorize(yc) # s[C].parallel(xo) # print(tvm.lower(s,[A,B,C],simple_mode=True)) f=tvm.build(s,[A,B,C],tgt,tgt_host,name=func_name) return f
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name, dtype="float32"): """TODO: Your code here""" """Hint: output shape should be (1,)""" A_=te.placeholder(shape,dtype=dtype,name="A_") A=te.placeholder(shape,dtype=dtype,name="A") #desined by myself k = te.reduce_axis((0, A.shape[1]), name="k") A_max = te.compute((A.shape[0],), lambda i: te.max(A[i, k], axis=k)) A_ex = te.compute(shape, lambda i, j: te.exp(A[i, j] - A_max[i])) k1 = te.reduce_axis((0, A.shape[1]), name="k1") A_ex_sum = te.compute((A.shape[0],), lambda i: te.sum(A_ex[i, k1], axis=k1)) A_logsoftmax = te.compute(shape, lambda i, j: te.log(A_ex[i, j] / A_ex_sum[i])) k2=te.reduce_axis((0,shape[1]),name="k2") A_logsoftmax_sum=te.compute((shape[0],0),lambda i:te.sum(A_logsoftmax[i,k2]*A_[i,k2],axis=k2)) k3=te.reduce_axis((0,shape[0]),name="k3") B=te.compute((1,),lambda i: te.sum(-A_logsoftmax_sum[k3],axis = k3)) B1=te.compute((1,), lambda i: B[i] / shape[0]) s=te.create_schedule(B1.op) if tgt=="cuda": #I'dont know why it can't work? s = te.create_schedule(B1.op) num_thread = 64 block_x = te.thread_axis("blockIdx.x") thread_x = te.thread_axis((0, num_thread), "threadIdx.x") s[A_ex].bind(A_ex.op.axis[0], block_x) s[A_max].bind(A_max.op.axis[0], block_x) k_ex_sum = A_ex_sum.op.reduce_axis[0] ko, ki = s[A_ex_sum].split(k_ex_sum, factor=num_thread) EF = s.rfactor(A_ex_sum, ki) s[A_ex_sum].bind(s[A_ex_sum].op.axis[0], block_x) s[A_ex_sum].bind(s[A_ex_sum].op.reduce_axis[0], thread_x) s[EF].compute_at(s[A_ex_sum], s[A_ex_sum].op.reduce_axis[0]) s[A_ex_sum].set_store_predicate(thread_x.var.equal(0)) tx, xi = s[A_logsoftmax].split(A_logsoftmax.op.axis[1], nparts=num_thread) s[A_logsoftmax].bind(A_logsoftmax.op.axis[0], block_x) s[A_logsoftmax].bind(tx, thread_x) k_logsoftmax_sum = A_logsoftmax_sum.op.reduce_axis[0] klso, klsi = s[A_logsoftmax_sum].split(k_logsoftmax_sum, factor=num_thread) lsEF = s.rfactor(A_logsoftmax_sum, klsi) s[A_logsoftmax_sum].bind(s[A_logsoftmax_sum].op.axis[0], block_x) s[A_logsoftmax_sum].bind(s[A_logsoftmax_sum].op.reduce_axis[0], thread_x) s[lsEF].compute_at(s[A_logsoftmax_sum], s[A_logsoftmax_sum].op.reduce_axis[0]) s[A_logsoftmax_sum].set_store_predicate(thread_x.var.equal(0)) k_B=B.op.reduce_axis[0] kbo,kbi=s[B].split(k_B,factor=num_thread) bEF=s.rfactor(B,kbi) s[B].bind(s[B].op.reduce_axis[0],thread_x) s[bEF].compute_at(s[B],s[B].op.reduce_axis[0]) s[B].set_store_predicate(block_x.var.equal(0)) s[B1].set_store_predicate(block_x.var.equal(0)) print(tvm.lower(s, [A, A_,B1], simple_mode=True)) f=tvm.build(s,[A,A_,B1],tgt,tgt_host,name=func_name) return f
def verify_resize3d( batch, in_channel, in_depth, in_height, in_width, out_depth, out_height, out_width, layout="NCDHW", coordinate_transformation_mode="half_pixel", method="trilinear", ): if layout == "NCDHW": A = te.placeholder((batch, in_channel, in_depth, in_height, in_width), name="A", dtype="float32") dtype = A.dtype out_shape = (batch, in_channel, out_depth, out_height, out_width) a_np = np.random.uniform(size=(batch, in_channel, in_depth, in_height, in_width)).astype(dtype) elif layout == "NDHWC": A = te.placeholder((batch, in_depth, in_height, in_width, in_channel), name="A", dtype="float32") dtype = A.dtype out_shape = (batch, out_depth, out_height, out_width, in_channel) a_np = np.random.uniform(size=(batch, in_depth, in_height, in_width, in_channel)).astype(dtype) else: raise NotImplementedError("Layout not supported {} ".format(layout)) B = topi.image.resize3d( A, (out_depth, out_height, out_width), layout=layout, coordinate_transformation_mode=coordinate_transformation_mode, method=method, ) if method == "trilinear": b_np = tvm.topi.testing.trilinear_resize3d_python( a_np, (out_depth, out_height, out_width), layout, coordinate_transformation_mode) else: scale_d = out_depth / in_depth scale_h = out_height / in_height scale_w = out_width / in_width b_np = tvm.topi.testing.upsampling3d_python( a_np, (scale_d, scale_h, scale_w), layout) def check_target(target, dev): print("Running on target: %s" % target) with tvm.target.Target(target): s = tvm.topi.testing.get_injective_schedule(target)(B) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) for target, dev in tvm.testing.enabled_targets(): check_target(target, dev)
def verify_crop_and_resize( image_shape, np_boxes, np_box_indices, np_crop_size, layout="NHWC", method="bilinear", extrapolation_value=0.0, ): images = te.placeholder(image_shape, name="images", dtype="float32") np_images = np.random.uniform(size=image_shape).astype("float32") boxes = te.placeholder(np_boxes.shape, name="boxes", dtype="float32") box_ind = te.placeholder(np_box_indices.shape, name="box_ind", dtype="int32") batch = len(np_box_indices) target_height, target_width = np_crop_size[0], np_crop_size[1] if layout == "NHWC": channel = image_shape[3] out_shape = (batch, target_height, target_width, channel) elif layout == "NCHW": channel = image_shape[1] out_shape = (batch, channel, target_height, target_width) else: raise NotImplementedError( "Layout {} is not supported.".format(layout)) out = topi.image.crop_and_resize( images, boxes, box_ind, np_crop_size, layout=layout, method=method, extrapolation_value=extrapolation_value, ) baseline_np = tvm.topi.testing.crop_and_resize_python( np_images, np_boxes, np_box_indices, np_crop_size, layout, method, extrapolation_value) def check_target(target, dev): print("Running on target: %s" % target) with tvm.target.Target(target): s = tvm.topi.testing.get_injective_schedule(target)(out) tvm_images = tvm.nd.array(np_images, dev) tvm_boxes = tvm.nd.array(np_boxes, dev) tvm_indices = tvm.nd.array(np_box_indices, dev) tvm_out = tvm.nd.array(np.zeros(out_shape, dtype="float32"), dev) f = tvm.build(s, [images, boxes, box_ind, out], target, name="crop_and_resize") f(tvm_images, tvm_boxes, tvm_indices, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), baseline_np, rtol=1e-3, atol=1e-3) for target, dev in tvm.testing.enabled_targets(): check_target(target, dev)
def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum", dtype="float32"): # Build the logic and compile the function A = te.placeholder(shape=in_shape, name="A", dtype=dtype) A1 = topi.sqrt(topi.exp(A)) out_dtype = dtype if type == "sum": B = topi.sum(A1, axis=axis, keepdims=keepdims) elif type == "all": B = topi.all(A, axis=axis, keepdims=keepdims) elif type == "any": B = topi.any(A, axis=axis, keepdims=keepdims) elif type == "max": B = topi.max(A1, axis=axis, keepdims=keepdims) elif type == "min": B = topi.min(A1, axis=axis, keepdims=keepdims) elif type == "argmax": B = topi.argmax(A1, axis=axis, keepdims=keepdims) out_dtype = "int32" elif type == "argmin": B = topi.argmin(A1, axis=axis, keepdims=keepdims) out_dtype = "int32" else: raise NotImplementedError def check_device(device, dev): print("Running on target: %s" % device) with tvm.target.Target(device): s = tvm.topi.testing.get_reduce_schedule(device)(B) foo = tvm.build(s, [A, B], device, name=type) # Test if dtype == "bool": in_npy_map = in_npy = np.random.choice([True, False], size=in_shape) else: in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype) in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype) if type == "sum": out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims) elif type == "all" and dtype == "bool": out_npy = in_npy_map.all(axis=axis, keepdims=keepdims) elif type == "any" and dtype == "bool": out_npy = in_npy_map.any(axis=axis, keepdims=keepdims) elif type == "max": out_npy = in_npy_map.max(axis=axis, keepdims=keepdims) elif type == "min": out_npy = in_npy_map.min(axis=axis, keepdims=keepdims) elif type == "argmax": out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims) elif type == "argmin": out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims) else: raise NotImplementedError data_tvm = tvm.nd.array(in_npy, device=dev) out_tvm = tvm.nd.empty(shape=out_npy.shape, device=dev, dtype=out_dtype) for _ in range(1): foo(data_tvm, out_tvm) if type == "argmax" or type == "argmin": out_tvm_indices = out_tvm.numpy() if keepdims: out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis) if axis is None: out_tvm_val = in_npy_map.ravel()[out_tvm_indices] else: other_indices = tuple( np.indices(in_shape[0:axis] + in_shape[(axis + 1):])) sel_indices = other_indices[0:axis] + ( out_tvm_indices, ) + other_indices[axis:] out_tvm_val = in_npy_map[sel_indices] if type == "argmax": tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1e-3, 1e-3) elif type == "argmin": tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1e-3, 1e-3) else: tvm.testing.assert_allclose(out_tvm.numpy(), out_npy, 1e-3, 1e-3) for device, dev in tvm.testing.enabled_targets(): check_device(device, dev)
def dot_16x1x16_uint8_int8_int32_skylake(): """ Int8 dot product by every 4 elements using AVX512 Skylake instructions. This function takes two arrays of uint8 and int8 datatype -- data[4] and kernel[16][4] -- and computes a dot product of data[4] with every 4 elements of kernels, resulting in output[16] of int32 datatype. The pseudo code is as follows. .. code-block:: c void dot_16x1x16_uint8_int8_int32(uint8 data[4], int8 kernel[16][4], int32 output[16]){ for (int i = 0; i < 16; i++){ output[i] = 0; for (int k = 0; k < 4; k++){ output[i] += data[k] * kernel[i][k] } } } Physically, the kernel array sits in an AVX512 vector register and the data[4] is broadcasted to another AVX512 vector register. This function returns a TensorIntrin that can be used to tensorize a schedule. Returns ------- intrin : TensorIntrin The Skylake int8 TensorIntrin that can be used in tensorizing schedule """ int32_lanes = 16 # 16 int32 lanes in AVX512 num_int8_elements = 4 # 4 int8 elements in int32 data = te.placeholder((num_int8_elements, ), dtype='uint8', name='data') kernel = te.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel') k = te.reduce_axis((0, num_int8_elements), name='k') C = te.compute( (int32_lanes, ), lambda i: te.sum( data[k].astype('int32') * kernel[i, k].astype('int32'), axis=k), name="C") a_buffer = tvm.tir.decl_buffer(data.shape, dtype='uint8', name="a_buffer", offset_factor=1, strides=[1]) b_buffer = tvm.tir.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", offset_factor=1, strides=[te.var('ldw'), 1]) def _intrin_func(ins, outs): def _instr(index): ib = tvm.tir.ir_builder.create() if index == 1: ib.emit(outs[0].vstore(0, tvm.tir.const(0, 'int32x16'))) return ib.get() a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.tir.call_intrin('int32', 'tir.reinterpret', a_int8) vec_ai32 = re_int32.astype('int32x16') vec_a = tvm.tir.call_intrin('int8x64', 'tir.reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0], "int8x64") vec_one = tvm.tir.const(1, "int16x32") pair_reduction = tvm.tir.call_llvm_pure_intrin( 'int16x32', 'llvm.x86.avx512.pmaddubs.w.512', tvm.tir.const(0, 'uint32'), vec_a, vec_b) quad_reduction = tvm.tir.call_llvm_pure_intrin( 'int32x16', 'llvm.x86.avx512.pmaddw.d.512', tvm.tir.const(0, 'uint32'), pair_reduction, vec_one) if index == 0: ib.emit(outs[0].vstore(0, quad_reduction)) else: ib.emit(outs[0].vstore( 0, quad_reduction + outs[0].vload([0], 'int32x16'))) return ib.get() # body, reset, update return _instr(0), _instr(1), _instr(2) buffer_params = {"offset_factor": 1} return te.decl_tensor_intrin(C.op, _intrin_func, binds={ data: a_buffer, kernel: b_buffer }, default_buffer_params=buffer_params)
def verify_conv2d_nchw( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, devices=['cuda', 'llvm -device=arm_cpu', 'opencl -device=mali']): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_height = in_width = in_size A = te.placeholder((batch, in_channel, in_height, in_width), name='A') W = te.placeholder((num_filter, in_channel, kernel, kernel), name='W') bias = te.placeholder((num_filter, 1, 1), name='bias') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): fcompute, fschedule = tvm.topi.testing.dispatch( device, _conv2d_nchw_winograd_implement) C = fcompute(A, W, stride, padding, dilation, dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) func(a, w, b, c) else: func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) func(a, w, c) rtol = 1e-3 tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol) for device in devices: check_device(device)
visitor is implemented. - How a Schedule is lowered to either an IRModule class or a LLVM module. Otherwise, take a look at ``python/tvm/build_module.py`` to get some basics. """ import tvm from tvm import te import numpy as np ###################################################################### # We first write a very simple vector add and build it with the default schedule. Then, we use # our customized lowering pass to manipulate the IR directly instead of using schedule primitives. # n = tvm.tir.const(128, "int32") a = te.placeholder((n, ), name="a") b = te.placeholder((n, ), name="b") c = te.compute((n, ), lambda i: a[i] + b[i], name='c') sch = te.create_schedule(c.op) ir = tvm.lower(sch, [a, b, c]) print(ir) ###################################################################### # Writing a Pass # -------------- # Essentially, an "IR transformation pass" is a function which maps a statement to a new statement. # Thus, we define this vectorize function and implement it step by step. # ######################################################################
def test_convolution_inference(): BATCH = 8 IH = 48 IW = 48 IC = 16 OC = 16 K = 3 PAD = 1 STRIDE = 1 OH = (IH + 2 * PAD - K) + 1 OW = (IW + 2 * PAD - K) + 1 dshape = (BATCH, IC, IH, IW) kshape = (OC, IC, K, K) bshape = (OC, ) oshape = (BATCH, OC, OH, OW) data = te.placeholder(dshape, name="data") kernel = te.placeholder(kshape, name="kernel") bias = te.placeholder(bshape, name="bias") def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias=True): if not tvm.get_global_func( "tvm.contrib.nnpack.fully_connected_inference", True): pytest.skip("extern function is not available") if not nnpack.is_available(): pytest.skip("nnpack is not available") ctx = tvm.cpu(0) output = nnpack.convolution_inference( data, kernel, bias if with_bias else None, [PAD, PAD, PAD, PAD], [STRIDE, STRIDE], algorithm=algorithm, ) s = te.create_schedule(output.op) f = tvm.build(s, [data, kernel, bias, output], target) na = np.random.uniform(size=dshape).astype(data.dtype) nb = np.random.uniform(size=kshape).astype(kernel.dtype) nc = np.zeros(bshape, dtype=bias.dtype) ta = tvm.nd.array(na, ctx) tb = tvm.nd.array(nb, ctx) tc = tvm.nd.array(nc, ctx) td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx) f(ta, tb, tc, td) nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape(1, bshape[0], 1, 1) tvm.testing.assert_allclose(td.asnumpy(), nd.reshape(BATCH, IC, IH, IW), rtol=1e-5) for algorithm in [ nnpack.ConvolutionAlgorithm.AUTO, nnpack.ConvolutionAlgorithm.FFT_8x8, nnpack.ConvolutionAlgorithm.FFT_16x16, nnpack.ConvolutionAlgorithm.WT_8x8, nnpack.ConvolutionAlgorithm.IMPLICIT_GEMM, nnpack.ConvolutionAlgorithm.WT_8x8_FP16, ]: for with_bias in [True, False]: verify(algorithm=algorithm, with_bias=with_bias)
def test_dependent_output_shape(): n, m, x = te.size_var('n'), te.size_var('m'), te.size_var('x') A = te.placeholder((n, m)) B = te.compute((m, n//x), lambda i, j: A[i,j] , name='B') s = te.create_schedule(B.op) mod = tvm.build(s, [A, B, x])
def verify_resize( batch, in_channel, in_height, in_width, out_height, out_width, layout="NCHW", coord_trans="align_corners", method="bilinear", ): if layout == "NCHW": A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype="float32") dtype = A.dtype out_shape = (batch, in_channel, out_height, out_width) a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width)).astype(dtype) elif layout == "NHWC": A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="float32") dtype = A.dtype out_shape = (batch, out_height, out_width, in_channel) a_np = np.random.uniform(size=(batch, in_height, in_width, in_channel)).astype(dtype) else: raise NotImplementedError("Layout not supported {} ".format(layout)) B = topi.image.resize( A, (out_height, out_width), layout=layout, coordinate_transformation_mode=coord_trans, method=method, ) if method == "bilinear": b_np = tvm.topi.testing.bilinear_resize_python(a_np, (out_height, out_width), layout, coord_trans) else: # TODO: Nearest neighbor case doesn't do anything with coordinate transform mode, and also # nearest_neighbors and align_corners combination in topi doesn't match the output of this # function. scale_h = out_height / in_height scale_w = out_width / in_width b_np = tvm.topi.testing.upsampling_python(a_np, (scale_h, scale_w), layout) def check_target(target, dev): print("Running on target: %s" % target) with tvm.target.Target(target): s = tvm.topi.testing.get_injective_schedule(target)(B) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev) f = tvm.build(s, [A, B], target) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3) for target, dev in tvm.testing.enabled_targets(): check_target(target, dev)
import tvm from tvm import te n = 1024 dtype = "float32" A = te.placeholder((n, n), dtype=dtype, name='A') k = te.reduce_axis((0, n), name='k') B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name='B') s = te.create_schedule(B.op) print(tvm.lower(s, [A, B], simple_mode=True)) print("---------cutting line---------") AA = s.cache_read(A, "shared", [B]) print(tvm.lower(s, [A, B], simple_mode=True))
def verify_broadcast_binary_ele(lhs_shape, rhs_shape, ftopi, fnumpy, lhs_min=-100, lhs_max=100, rhs_min=-100, rhs_max=100, dtype="float32"): # Build the logic and compile the function A = (te.var("A", dtype=dtype) if lhs_shape is None else te.placeholder(shape=lhs_shape, name="A", dtype=dtype)) B = (te.var("B", dtype=dtype) if rhs_shape is None else te.placeholder(shape=rhs_shape, name="B", dtype=dtype)) C = ftopi(A, B) if isinstance(A, tvm.tir.PrimExpr) and isinstance(B, tvm.tir.PrimExpr): assert(isinstance(C, tvm.tir.PrimExpr)) return def gen_operand(shape, low, high, ctx): if shape is None: npy = float(np.random.uniform(low=low, high=high)) if dtype.startswith('int'): npy = int(npy) nd = npy else: npy = np.random.uniform(low=low, high=high, size=shape).astype(dtype) nd = tvm.nd.array(npy, ctx) return npy, nd def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.create(device): s = topi.testing.get_broadcast_schedule(device)(C) foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + ftopi.__name__) lhs_npy, lhs_nd = gen_operand(lhs_shape, lhs_min, lhs_max, ctx) rhs_npy, rhs_nd = gen_operand(rhs_shape, rhs_min, rhs_max, ctx) out_npy = fnumpy(lhs_npy, rhs_npy) if fnumpy == np.floor_divide: # avoid check too close to X.5 and X.0 # FIXME: floor_divide(94.90735, 0.6731018) behaves as floor(div(94.90735, 0.6731018)) # However the result is somehow incorrect - need to further investigate. # And looks like numpy's floor_div(a,b) is implemented different from floor(div(a,b)) mask = np.logical_or(np.abs(np.abs(np.fmod(lhs_npy / rhs_npy, 1)) - 0.5) < 1e-6, np.abs(np.fmod(lhs_npy / rhs_npy, 1)) < 1e-6) if mask.any(): lhs_npy = lhs_npy + mask * 1e-3 * rhs_npy lhs_npy = lhs_npy.astype(dtype) lhs_nd = tvm.nd.array(lhs_npy, ctx) if lhs_shape is not None else lhs_npy.item() out_npy = fnumpy(lhs_npy, rhs_npy) out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx) foo(lhs_nd, rhs_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4) for target in get_all_backend(): check_device(target) check_device("sdaccel")
def make_matrix_softmax(shape, tgt, tgt_host, func_name, dtype="float32"): """TODO: Your code here""" """Hint: use tvm.reduce_axis, tvm.sum, tvm.max, tvm.exp""" """Hint: do not reuse the same reduction axis j.""" """Hint: implement the following version for better stability e_x = np.exp(x - np.max(x)) softmax(x)= e_x / e_x.sum() """ A=te.placeholder(shape,dtype = dtype, name="A") ''' #desined by myself k=te.reduce_axis((0,A.shape[1]),name="k") A_max=te.compute((A.shape[0],),lambda i:te.max(A[i,k],axis=k)) A_ex=te.compute(shape,lambda i,j:te.exp(A[i,j]-A_max[i])) k1=te.reduce_axis((0,A.shape[1]),name="k1") A_ex_sum=te.compute((A.shape[0],),lambda i:te.sum(A_ex[i,k1],axis = k1)) B=te.compute(shape,lambda i,j:A_ex[i,j]/A_ex_sum[i]) s=te.create_schedule(B.op) if tgt=="cuda": s[B].bind(B.op.axis[1],te.thread_axis("threadIdx.x")) s[A_ex_sum].bind(k1,te.thread_axis("threadIdx.x")) s[A_ex].bind(A_ex.op.axis[1],te.thread_axis("threadIdx.x")) s[A_max].bind(k,te.thread_axis("threadIdx.x")) # print (tvm.lower(s,[A,B],simple_mode=True)) ''' #use topi B=topi.nn.softmax(A,axis=1) if tgt=="llvm": s = te.create_schedule(B.op) elif tgt=="cuda": # s=topi.cuda.schedule_softmax(B) s=te.create_schedule(B.op) softmax = B expsum = softmax.op.input_tensors[1] exp = softmax.op.input_tensors[0] max_elem = s[exp].op.input_tensors[1] num_thread = 64 block_x = te.thread_axis("blockIdx.x") thread_x = te.thread_axis((0, num_thread), "threadIdx.x") s[exp].bind(exp.op.axis[0], block_x) s[max_elem].bind(max_elem.op.axis[0], block_x) k = expsum.op.reduce_axis[0] ko, ki = s[expsum].split(k, factor=num_thread) EF = s.rfactor(expsum, ki) s[expsum].bind(s[expsum].op.axis[0], block_x) s[expsum].bind(s[expsum].op.reduce_axis[0], thread_x) s[EF].compute_at(s[expsum], s[expsum].op.reduce_axis[0]) s[expsum].set_store_predicate(thread_x.var.equal(0)) tx, xi = s[softmax].split(softmax.op.axis[1], nparts=num_thread) s[softmax].bind(softmax.op.axis[0], block_x) s[softmax].bind(tx, thread_x) print(tvm.lower(s, [A, B], simple_mode=True)) else: s=None f=tvm.build(s,[A,B],tgt,tgt_host,name=func_name) return f
def schedule_nhwc_tensorcore_cuda(cfg, s, Conv): """Schedule tensorcore template""" kh, kw, ic = s[Conv].op.reduce_axis out_dtype = Conv.dtype trans_paddata, kernel = s[Conv].op.input_tensors in_dtype = trans_paddata.dtype batch, _, _, _ = get_const_tuple(Conv.shape) _, _, _, out_channels = get_const_tuple(kernel.shape) paddata = s[trans_paddata].op.input_tensors # inline the pad and dtype transform s[trans_paddata].compute_inline() s[kernel].compute_inline() s[paddata[0]].compute_inline() # Designate the memory hierarchy AS = s.cache_read(trans_paddata, "shared", [Conv]) WS = s.cache_read(kernel, "shared", [Conv]) AF = s.cache_read(AS, "wmma.matrix_a", [Conv]) WF = s.cache_read(WS, "wmma.matrix_b", [Conv]) ConvF = s.cache_write(Conv, "wmma.accumulator") if Conv.op in s.outputs: output = Conv ConvS = s.cache_read(ConvF, "shared", [Conv]) OL = ConvS else: output = s.outputs[0].output(0) s[Conv].set_scope("shared") OL = Conv # Schedule for autotvm cfg.define_knob("block_row_warps", [1, 2, 4]) cfg.define_knob("block_col_warps", [1, 2, 4]) cfg.define_knob("warp_row_tiles", [1, 2, 4]) cfg.define_knob("warp_col_tiles", [1, 2, 4]) cfg.define_knob("chunk", [1, 2, 4, 8]) cfg.define_knob("offset", [0, 8]) cfg.define_knob("vector_width", [1, 2, 4, 8]) if batch % 16 == 0 and out_channels % 16 == 0: cfg.define_knob("wmma_m", [16, 8, 32]) elif batch % 8 == 0 and out_channels % 32 == 0: cfg.define_knob("wmma_m", [8, 16, 32]) elif batch % 32 == 0 and out_channels % 8 == 0: cfg.define_knob("wmma_m", [32, 16, 8]) # fallback support target = tvm.target.Target.current() if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( target.kind.name, target.model, "conv2d_nhwc_tensorcore.cuda") cfg.fallback_with_reference_log(ref_log) block_row_warps = cfg["block_row_warps"].val block_col_warps = cfg["block_col_warps"].val warp_row_tiles = cfg["warp_row_tiles"].val warp_col_tiles = cfg["warp_col_tiles"].val chunk = cfg["chunk"].val offset = cfg["offset"].val wmma_m = cfg["wmma_m"].val vector_width = cfg["vector_width"].val wmma_k = 16 if wmma_m == 16: wmma_n = 16 elif wmma_m == 8: wmma_n = 32 elif wmma_m == 32: wmma_n = 8 warp_size = 32 block_x = te.thread_axis("blockIdx.x") block_y = te.thread_axis("blockIdx.y") block_z = te.thread_axis("blockIdx.z") thread_x = te.thread_axis("threadIdx.x") thread_y = te.thread_axis("threadIdx.y") thread_z = te.thread_axis("threadIdx.z") # Define the intrin strides def get_strides(extents): return [np.prod(extents[i:]).tolist() for i in range(len(extents))] AS_align = chunk * wmma_k + offset WS_align = warp_col_tiles * block_col_warps * wmma_n + offset block_factor_n = wmma_m * warp_row_tiles * block_row_warps block_factor_o = wmma_n * warp_col_tiles * block_col_warps CS_align = block_factor_o + offset AS_strides = get_strides([1, 1, AS_align, 1]) AL_strides = get_strides([1, 1, wmma_k, 1]) WS_strides = get_strides([WS_align, 1]) WL_strides = get_strides([wmma_n * warp_col_tiles, 1]) CL_strides = get_strides([1, 1, wmma_n * warp_col_tiles, 1]) CS_strides = get_strides([1, 1, CS_align, 1]) # Schedule for output nc, hc, wc, oc = output.op.axis block_k = s[output].fuse(hc, wc) s[output].bind(block_k, block_z) block_i, nc = s[output].split(nc, factor=block_factor_n) block_j, oc = s[output].split(oc, factor=block_factor_o) s[output].reorder(block_k, block_i, block_j, nc, oc) t = s[output].fuse(nc, oc) t, ti = s[output].split(t, factor=vector_width) t, tx = s[output].split(t, factor=warp_size) t, ty = s[output].split(t, factor=block_row_warps) t, tz = s[output].split(t, factor=block_col_warps) s[output].bind(block_i, block_x) s[output].bind(block_j, block_y) s[output].bind(tz, thread_z) s[output].bind(ty, thread_y) s[output].bind(tx, thread_x) s[output].vectorize(ti) # Schedule wmma store s[OL].compute_at(s[output], block_j) nc, hc, wc, oc = OL.op.axis s[OL].reorder(hc, wc, nc, oc) s[OL].storage_align(wc, CS_align - 1, CS_align) oc, ooc = s[OL].split(oc, factor=wmma_n) oc, oci = s[OL].split(oc, factor=warp_col_tiles) _, oc = s[OL].split(oc, factor=block_col_warps) nc, nnc = s[OL].split(nc, factor=wmma_m) nc, nci = s[OL].split(nc, factor=warp_row_tiles) _, nc = s[OL].split(nc, factor=block_row_warps) s[OL].reorder(nc, oc, nci, oci, nnc, ooc) s[OL].bind(nc, thread_y) s[OL].bind(oc, thread_z) # Schedule wmma computation s[ConvF].compute_at(s[OL], oc) n, h, w, o = ConvF.op.axis n, nnf = s[ConvF].split(n, factor=wmma_m) o, oof = s[ConvF].split(o, factor=wmma_n) ic, ii = s[ConvF].split(ic, factor=wmma_k) ko, ki = s[ConvF].split(ic, factor=chunk) s[ConvF].reorder(kh, kw, ko, ki, n, o, nnf, oof, ii) s[AF].compute_at(s[ConvF], ki) s[WF].compute_at(s[ConvF], ki) # Schedule wmma load n, h, w, i = AF.op.axis n, nn = s[AF].split(n, factor=wmma_m) i, ii = s[AF].split(i, factor=wmma_k) s[AF].reorder(n, i, nn, ii) kh, kw, i, o = WF.op.axis i, ii = s[WF].split(i, factor=wmma_k) o, oo = s[WF].split(o, factor=wmma_n) s[WF].reorder(o, i, oo) s[WF].reorder(i, o, ii, oo) s[WS].compute_at(s[ConvF], ko) s[AS].compute_at(s[ConvF], ko) # Schedule for data's share memory n, h, w, i = AS.op.axis s[AS].reorder(h, w, n, i) s[AS].storage_align(w, AS_align - 1, AS_align) t = s[AS].fuse(n, i) t, ti = s[AS].split(t, factor=vector_width) t, tx = s[AS].split(t, factor=warp_size) t, ty = s[AS].split(t, factor=block_row_warps) _, tz = s[AS].split(t, factor=block_col_warps) s[AS].bind(ty, thread_y) s[AS].bind(tz, thread_z) s[AS].bind(tx, thread_x) s[AS].vectorize(ti) # Schedule for kernel's share memory kh, kw, ic, o = WS.op.axis t = s[WS].fuse(ic, o) s[WS].storage_align(ic, WS_align - 1, WS_align) t, ti = s[WS].split(t, factor=vector_width) t, tx = s[WS].split(t, factor=warp_size) t, ty = s[WS].split(t, factor=block_row_warps) _, tz = s[WS].split(t, factor=block_col_warps) s[WS].bind(ty, thread_y) s[WS].bind(tz, thread_z) s[WS].bind(tx, thread_x) s[WS].vectorize(ti) shape = (wmma_m, wmma_n, wmma_k) # tensorize the wmma process AS_shape = (wmma_m, 1, 1, wmma_k) AL_shape = (wmma_m, 1, 1, wmma_k) WS_shape = (wmma_k, wmma_n) WL_shape = (wmma_k, wmma_n) CL_shape = (wmma_m, 1, 1, wmma_n) CS_shape = (wmma_m, 1, 1, wmma_n) AL_gemm = te.placeholder(AL_shape, name="A", dtype=in_dtype) WL_gemm = te.placeholder(WL_shape, name="B", dtype=in_dtype) k_gemm = te.reduce_axis((0, wmma_k), name="k") CL_compute = te.compute( CL_shape, lambda ii, t0, t1, jj: te.sum( AL_gemm[ii, t0, t1, k_gemm].astype(out_dtype) * WL_gemm[k_gemm, jj] .astype(out_dtype), axis=k_gemm, ), name="C", ) s[AF].tensorize( nn, intrin_wmma_load_matrix_A(AL_strides, AS_strides, shape, "row_major", AS_shape, AL_shape, in_dtype), ) s[WF].tensorize( ii, intrin_wmma_load_matrix_W(WL_strides, WS_strides, shape, "row_major", WS_shape, WL_shape, in_dtype), ) s[OL].tensorize( nnc, intrin_wmma_store_matrix(CS_strides, CL_strides, shape, out_dtype, CL_shape, CS_shape)) s[ConvF].tensorize( nnf, intrin_wmma_gemm(AL_gemm, WL_gemm, CL_compute, AL_strides, WL_strides, CL_strides, shape), ) N, OH, OW, CO = get_const_tuple(output.shape) KH, KW, CI, _ = get_const_tuple(kernel.shape) cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
def verify_conv1d_integration(): batch_size = 1 num_channel = 1 num_filter = 1 # Note: TVM doesn't have a separate op for 1D convolution, so we use conv2d instead. # We set height=1 to indicate that convolution is really 1D. stride = (1, 1) dilate = (1, 1) padding = (0, 0) kernel_size = (1, 3) input_window_size = (1, 10) inc_input_size = (1, 2) context_size = (1, 4) inc_output_size = (1, 2) output_window_size = (1, 8) num_iteration = 20 buffer_axis = 3 kernel_shape = (num_filter, num_channel, kernel_size[0], kernel_size[1]) input_window_shape = (batch_size, num_channel, input_window_size[0], input_window_size[1]) inc_input_shape = (batch_size, num_channel, inc_input_size[0], inc_input_size[1]) inc_output_shape = (batch_size, num_filter, inc_output_size[0], inc_output_size[1]) context_shape = (batch_size, num_channel, context_size[0], context_size[1]) output_window_shape = (batch_size, num_filter, output_window_size[0], output_window_size[1]) # Rule: Convolution of Tensor[context_shape] and Tensor[kernel_shape] # produces Tensor[inc_input_shape] dtype = "float32" inc_input = te.placeholder(inc_input_shape, name="inc_input", dtype=dtype) input_window = te.placeholder(input_window_shape, name="input_window", dtype=dtype) context = te.placeholder(context_shape, name="context", dtype=dtype) kernel = te.placeholder(kernel_shape, name="kernel", dtype=dtype) inc_output = te.placeholder(inc_input_shape, name="inc_output", dtype=dtype) output_window = te.placeholder(output_window_shape, name="output_window", dtype=dtype) # Use memoize, pickle the test data for next time use @memoize("topi.tests.test_fifo_buffer_conv1d_integration") def get_data(): # Generate [num_iteration] slices of input inc_input_np = np.random.uniform( size=tuple([num_iteration] + list(inc_input_shape))).astype(dtype) input_window_np = np.zeros(input_window_shape, dtype=dtype) kernel_np = np.random.uniform(size=kernel_shape).astype(dtype) context_np = np.zeros(context_shape, dtype=dtype) output_window_np = np.zeros(output_window_shape, dtype=dtype) return (inc_input_np, input_window_np, kernel_np, context_np, output_window_np) # Get the test data inc_input_np, input_window_np, kernel_np, context_np, output_window_np = get_data( ) def check_device(device, ctx): print(" Running on target: {}".format(device)) conv2d_nchw, schedule_conv2d_nchw = tvm.topi.testing.get_conv2d_nchw_implement( device) with tvm.target.Target(device): out = topi.nn.fifo_buffer(inc_input, context, axis=buffer_axis) s = tvm.topi.testing.get_injective_schedule(device)([out]) update_context = tvm.build(s, [inc_input, context, out], device, name="update_context") out = conv2d_nchw(context, kernel, stride, padding, dilate, dtype) s = schedule_conv2d_nchw([out]) conv2d_inc = tvm.build(s, [context, kernel, out], device, name="conv2d_inc") out = topi.nn.fifo_buffer(inc_output, output_window, axis=buffer_axis) s = tvm.topi.testing.get_injective_schedule(device)([out]) update_output_window = tvm.build(s, [inc_output, output_window, out], device, name="update_output_window") out = topi.nn.fifo_buffer(inc_input, input_window, axis=buffer_axis) s = tvm.topi.testing.get_injective_schedule(device)([out]) update_input_window = tvm.build(s, [inc_input, input_window, out], device, name="update_input_window") out = conv2d_nchw(input_window, kernel, stride, padding, dilate, dtype) s = schedule_conv2d_nchw([out]) conv2d = tvm.build(s, [input_window, kernel, out], device, name="conv2d") input_window_tvm = tvm.nd.array(input_window_np, ctx=ctx) new_input_window_tvm = tvm.nd.empty(shape=input_window_shape, ctx=ctx, dtype=dtype) kernel_tvm = tvm.nd.array(kernel_np, ctx=ctx) context_tvm = tvm.nd.array(context_np, ctx=ctx) new_context_tvm = tvm.nd.empty(shape=context_shape, ctx=ctx, dtype=dtype) inc_output_tvm = tvm.nd.empty(shape=inc_output_shape, ctx=ctx, dtype=dtype) output_window_tvm = tvm.nd.array(output_window_np, ctx=ctx) new_output_window_tvm = tvm.nd.empty(shape=output_window_shape, ctx=ctx, dtype=dtype) output_window_ref_tvm = tvm.nd.empty(shape=output_window_shape, ctx=ctx, dtype=dtype) for i in range(num_iteration): # Take i-th slice of inc_input_np inc_input_tvm = tvm.nd.array(inc_input_np[i], ctx=ctx) # Compute new output window incrementally, using the FIFO buffer op update_context(inc_input_tvm, context_tvm, new_context_tvm) conv2d_inc(new_context_tvm, kernel_tvm, inc_output_tvm) update_output_window(inc_output_tvm, output_window_tvm, new_output_window_tvm) context_tvm = new_context_tvm output_window_tvm = new_output_window_tvm # Compute full input window, so that we have a baseline update_input_window(inc_input_tvm, input_window_tvm, new_input_window_tvm) input_window_tvm = new_input_window_tvm conv2d(input_window_tvm, kernel_tvm, output_window_ref_tvm) # Incrementally updating the output window should be equivalent to computing it from # scratch using the input window tvm.testing.assert_allclose(output_window_tvm.asnumpy(), output_window_ref_tvm.asnumpy()) for device, ctx in tvm.testing.enabled_targets(): check_device(device, ctx)
def test_tile_nd(): input = te.placeholder((12, 12), dtype="uint8", name="input") out = topi.nn.relu(input) sch = te.create_schedule([out.op]) outer_iters, inner_iters = tile_nd(sch, out, (3, 4)) assert tuple(sch[out].leaf_iter_vars) == (*outer_iters, *inner_iters)
def check(start, end, dstart, dend, dtype, floor_div=False): div = tvm.te.floordiv if floor_div else tvm.tir.truncdiv mod = tvm.te.floormod if floor_div else tvm.tir.truncmod # A are dividends, B are divisors. Note that we add 1 to make include end in the range. A = te.placeholder((end - start + 1,), name="A", dtype=dtype) B = te.placeholder((dend - dstart + 1,), name="B", dtype=dtype) # We clip values with min and max so that simplifiers know the ranges of values clipa = lambda x: tvm.te.min(tvm.tir.const(end, dtype), tvm.te.max(tvm.tir.const(start, dtype), x)) clipb = lambda x: tvm.te.min(tvm.tir.const(dend, dtype), tvm.te.max(tvm.tir.const(dstart, dtype), x)) # If the range is just a single point, use the constant itself if start == end: clipa = lambda x: tvm.tir.const(start, dtype) if dstart == dend: clipb = lambda x: tvm.tir.const(dstart, dtype) # D are division results and M are modulo results [D, M] = te.compute((end - start + 1, dend - dstart + 1), lambda i, j: (div(clipa(A[i]), clipb(B[j])), mod(clipa(A[i]), clipb(B[j])))) s = te.create_schedule([D.op, M.op]) f = tvm.build(s, [A, B, D, M], "llvm") # Fill input arrays with values A_arr = tvm.nd.empty((end - start + 1,), dtype) B_arr = tvm.nd.empty((dend - dstart + 1,), dtype) A_arr.copyfrom(np.arange(start, end + 1, dtype=dtype)) B_np = np.arange(dstart, dend + 1, dtype=dtype) # If the range of the divisor contains 0, replace it with 1 to avoid division by zero if dend >= 0 and dstart <= 0: B_np[-dstart] = 1 B_arr.copyfrom(B_np) D_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype) M_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype) # Run the function and convert the results to numpy f(A_arr, B_arr, D_arr, M_arr) D_arr = D_arr.asnumpy() M_arr = M_arr.asnumpy() # This helper just prints additional info on failure def _show_info(): print("dtype: {}".format(dtype)) print("dividend range: [{}, {}]".format(start, end)) print("divisor range: [{}, {}]".format(dstart, dend)) lowered = tvm.lower(s, [A, B, D, M], simple_mode=True) print("Lowered code:") print(lowered) # Check that the computed values are correct for i in range(start, end + 1): for j in range(dstart, dend + 1): if j == 0: continue if floor_div: dref = i // j mref = i % j else: dref = int(float(i) / j) mref = int(math.fmod(i, j)) if D_arr[i - start, j - dstart] != dref: _show_info() raise AssertionError("Incorrect division result: {}({}, {}) is {} " "but should be {}".format(div.__name__, i, j, D_arr[i - start, j - dstart], dref)) if M_arr[i - start, j - dstart] != mref: _show_info() raise AssertionError("Incorrect modulo result: {}({}, {}) is {} " "but should be {}".format(mod.__name__, i, j, M_arr[i - start, j - dstart], mref))
def test_schedule_pragmas_for_const(): input = te.placeholder((12, 12), dtype="uint8", name="input") const = te.compute((), lambda: 2) add = topi.add(input, const) sch = te.create_schedule([add.op]) schedule_pragmas(sch)
def verify_conv2d_NHWC_gemm_int8( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_height = in_width = in_size A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int8") W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") bias = te.placeholder((num_filter, ), name="bias", dtype="int8") a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding).astype(dtype) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return print("Running on target: %s" % target) with tvm.target.Target(target): C = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved( A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) check_target("llvm")
def verify_non_max_suppression( np_data, np_valid_count, np_indices, np_result, np_indices_result, max_output_size, iou_threshold, force_suppress, top_k, coord_start, score_index, id_index, ): dshape = np_data.shape batch, num_anchors, _ = dshape indices_dshape = (batch, num_anchors) data = te.placeholder(dshape, name="data") valid_count = te.placeholder((batch, ), dtype="int32", name="valid_count") indices = te.placeholder((batch, num_anchors), dtype="int32", name="indices") def check_device(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return print("Running on target: %s" % target) with tvm.target.Target(target): fcompute, fschedule = tvm.topi.testing.dispatch( target, _nms_implement) out = fcompute( data, valid_count, indices, max_output_size, iou_threshold, force_suppress, top_k, coord_start=coord_start, score_index=score_index, id_index=id_index, return_indices=False, ) indices_out = fcompute( data, valid_count, indices, max_output_size, iou_threshold, force_suppress, top_k, coord_start=coord_start, score_index=score_index, id_index=id_index, return_indices=True, ) s = fschedule(out) indices_s = fschedule(indices_out) tvm_data = tvm.nd.array(np_data, dev) tvm_valid_count = tvm.nd.array(np_valid_count, dev) tvm_indices = tvm.nd.array(np_indices, dev) tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev) f = tvm.build(s, [data, valid_count, indices, out], target) f(tvm_data, tvm_valid_count, tvm_indices, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), dev) f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], target) f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) for target in ["llvm", "cuda", "opencl", "nvptx"]: check_device(target)
def compile_conv2d_NHWC_gemm_int8_arm( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_height = in_width = in_size A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int8") W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") bias = te.placeholder((num_filter, ), name="bias", dtype="int8") dtype = "int32" devices = [ ( "llvm --device arm_cpu --mtriple aarch64-linux-gnu", topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, ), ( "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, ), ( "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", topi.arm_cpu.compute_conv2d_NHWC_quantized_native, topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, ), # TODO(giuseros) Need LLVM-11 in order to compile with +i8mm extension # ( # "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm", # topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, # topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, # ), ] for device_tuple in devices: target = device_tuple[0] compute = device_tuple[1] schedule = device_tuple[2] dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return print("Compiling on arm AArch64 target: %s" % target) with tvm.target.Target(target): assert is_aarch64_arm(), "AArch64 target not recognized" C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = schedule([C]) if add_bias: tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], target, name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), )
def verify_multibox_prior(dshape, sizes=(1, ), ratios=(1, ), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): data = te.placeholder(dshape, name="data") dtype = data.dtype input_data = np.random.uniform(size=dshape).astype(dtype) in_height = data.shape[2].value in_width = data.shape[3].value num_sizes = len(sizes) num_ratios = len(ratios) size_ratio_concat = sizes + ratios steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width offset_h = offsets[0] offset_w = offsets[1] oshape = (1, in_height * in_width * (num_sizes + num_ratios - 1), 4) np_out = np.zeros(oshape).astype(dtype) for i in range(in_height): center_h = (i + offset_h) * steps_h for j in range(in_width): center_w = (j + offset_w) * steps_w for k in range(num_sizes + num_ratios - 1): w = (size_ratio_concat[k] * in_height / in_width / 2.0 if k < num_sizes else size_ratio_concat[0] * in_height / in_width * math.sqrt(size_ratio_concat[k + 1]) / 2.0) h = (size_ratio_concat[k] / 2.0 if k < num_sizes else size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0) count = (i * in_width * (num_sizes + num_ratios - 1) + j * (num_sizes + num_ratios - 1) + k) np_out[0][count][0] = center_w - w np_out[0][count][1] = center_h - h np_out[0][count][2] = center_w + w np_out[0][count][3] = center_h + h if clip: np_out = np.clip(np_out, 0, 1) def check_device(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return print("Running on target: %s" % target) fcompute, fschedule = tvm.topi.testing.dispatch( target, _multibox_prior_implement) with tvm.target.Target(target): out = fcompute(data, sizes, ratios, steps, offsets, clip) s = fschedule(out) tvm_input_data = tvm.nd.array(input_data, dev) tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), dev) f = tvm.build(s, [data, out], target) f(tvm_input_data, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-3) for target in ["llvm", "opencl", "cuda"]: check_device(target)
import tvm from tvm import te from tensorizer.intrinsics import INTRINSICS import numpy as np n, m, k = 64, 192, 1024 a = te.placeholder((n, k), 'float16') b = te.placeholder((m // 32, k // 32, 32, 32), 'float16') block_k = 2 rv = te.reduce_axis((0, k), ) def compute(x, y): lhs = a[x, rv].astype('float32') rhs = b[y // 32, rv // 32, rv % 32, y % 32].astype('float32') return te.sum(lhs * rhs, axis=[rv]) c = te.compute((n, m), compute) blkY = tvm.te.thread_axis('blockIdx.y') blkX = tvm.te.thread_axis('blockIdx.x') thrZ = tvm.te.thread_axis('threadIdx.z') thrY = tvm.te.thread_axis('threadIdx.y') thrX = tvm.te.thread_axis('threadIdx.x') sch = te.create_schedule(c.op)
def test_llvm_add_pipeline(): nn = 1024 n = tvm.runtime.convert(nn) A = te.placeholder((n, ), name="A") B = te.placeholder((n, ), name="B") C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) s[C].parallel(xo) s[C].vectorize(xi) def verify_elf(path, e_machine): with open(path, "rb") as fi: arr = fi.read(20) assert struct.unpack("ccc", arr[1:4]) == (b"E", b"L", b"F") endian = struct.unpack("b", arr[0x5:0x6])[0] endian = "<" if endian == 1 else ">" assert struct.unpack(endian + "h", arr[0x12:0x14])[0] == e_machine def build_i386(): temp = util.tempdir() target = "llvm -mtriple=i386-pc-linux-gnu" f = tvm.build(s, [A, B, C], target) path = temp.relpath("myadd.o") f.save(path) verify_elf(path, 0x03) def build_arm(): target = "llvm -mtriple=armv7-none-linux-gnueabihf" if not tvm.runtime.enabled(target): print("Skip because %s is not enabled.." % target) return temp = util.tempdir() f = tvm.build(s, [A, B, C], target) path = temp.relpath("myadd.o") f.save(path) verify_elf(path, 0x28) asm_path = temp.relpath("myadd.asm") f.save(asm_path) # Do a RPC verification, launch kernel on Arm Board if available. host = os.environ.get("TVM_RPC_ARM_HOST", None) remote = None if host: port = int(os.environ["TVM_RPC_ARM_PORT"]) try: remote = rpc.connect(host, port) except tvm.error.TVMError as e: pass if remote: remote.upload(path) farm = remote.load_module("myadd.o") ctx = remote.cpu(0) n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) farm(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) print("Verification finish on remote..") build_i386() build_arm()