def tvm_conv(batch, channel, out_channel, height, width, k_h, k_w, stride, pad,
             target, devid=0, number=10):
    A = tvm.te.placeholder((batch, channel, height, width), dtype="float32")
    W = tvm.te.placeholder((out_channel, channel, k_h, k_w), dtype="float32")
    Output = conv2d_nchw(A, W, stride=stride, padding=pad)
    s = tvm.te.create_schedule(Output.op)
    bufs = [A, W, Output]
    return evaluate(s, bufs, target, devid, number)
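# Example invocation (a minimal sketch: the concrete layer shape below is an
# arbitrary choice, and the "ms" unit assumes the same convention as the
# `evaluate` helper used above):
if __name__ == "__main__":
    cost = tvm_conv(batch=1, channel=3, out_channel=64, height=224, width=224,
                    k_h=3, k_w=3, stride=1, pad=1, target="llvm")
    print("unscheduled conv2d baseline:", cost, "ms")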
def try_yolo_conv(config, parameter, fsch):
    # get the compute
    # (1, 3, 448, 448, 64, 3, 7, 7, 1, 2, 3, 1, 1)
    batch, CI, H, W, CO, _, kh, kw, _, st, pad, dilation, group = config
    inputs = tvm.placeholder((batch, CI, H, W), dtype="float32")
    weight = tvm.placeholder((CO, CI, kh, kw), dtype="float32")
    outputs = conv2d_nchw(inputs, weight, stride=st, padding=pad,
                          dilation=dilation, groups=group)

    s = tvm.create_schedule(outputs.op)
    fsch(s, outputs, inputs, weight, parameter)

    arg_bufs = [inputs, weight, outputs]
    stmt = tvm.lower(s, arg_bufs, simple_mode=True)
    # print(stmt)
    dev_id = 2
    ctx = tvm.nd.context("cuda", dev_id)
    max_dims = ctx.max_thread_dimensions
    kwargs = {
        "max_shared_memory_per_block": ctx.max_shared_memory_per_block,
        "max_threads_per_block": ctx.max_threads_per_block,
        "max_thread_x": max_dims[0],
        "max_thread_y": max_dims[1],
        "max_thread_z": max_dims[2]
    }
    verify = tvm.ir_pass.VerifyGPUCode(stmt, kwargs)
    # print("config is:\n %s" % (str(config)))
    if verify:
        print("Valid kernel")
        time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, 10)
        print("Yolo conv use", time_cost, "ms\n")
    else:
        print("Invalid kernel")
        time_cost = float("inf")
    return time_cost
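# Usage sketch for the checker above. The schedule function below is a
# hypothetical stand-in: FlexTensor normally supplies `fsch` and `parameter`
# from its tuner, so this naive binding only illustrates the expected call
# shape. Running it needs a CUDA device (dev_id is hard-coded to 2 inside
# try_yolo_conv).
def _naive_cuda_schedule(s, outputs, inputs, weight, parameter):
    # Fuse all spatial output axes and bind them to blocks/threads so the
    # kernel at least launches; a real schedule would also tile and cache.
    fused = s[outputs].fuse(*s[outputs].op.axis)
    bx, tx = s[outputs].split(fused, factor=256)
    s[outputs].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[outputs].bind(tx, tvm.thread_axis("threadIdx.x"))


if __name__ == "__main__":
    yolo_conv1 = (1, 3, 448, 448, 64, 3, 7, 7, 1, 2, 3, 1, 1)
    try_yolo_conv(yolo_conv1, None, _naive_cuda_schedule)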
def conv2d(N, C, H, W, K, kernel_size, stride=1, padding=0, dilation=1, groups=1):
    Img = tvm.te.placeholder((N, C, H, W))
    # The weight tensor gets its own name so it does not shadow the width
    # argument W.
    Weight = tvm.te.placeholder((K, C // groups, kernel_size, kernel_size))
    Output = conv2d_nchw(Img, Weight, stride=stride, padding=padding,
                         dilation=dilation, groups=groups)
    return [Output.op], [Img, Weight, Output]
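# Example (a sketch; the layer shape is an arbitrary ResNet-like choice):
if __name__ == "__main__":
    ops, bufs = conv2d(N=1, C=64, H=56, W=56, K=64,
                       kernel_size=3, stride=1, padding=1)
    print(ops)
    print([t.shape for t in bufs])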
def conv2d_batching(N, H, W, CO, CI, KH, KW, stride, padding):
    data = tvm.placeholder((N, CI, H, W), name='data', dtype="float32")
    kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel', dtype="float32")
    conv = conv2d_nchw(data, kernel, stride=stride, padding=padding)
    s = tvm.create_schedule([conv.op])

    ##### space definition begin #####
    n, f, y, x = s[conv].op.axis
    rc, ry, rx = s[conv].op.reduce_axis
    # Fuse the spatial output axes before cache_write so the local cache is
    # laid out over (n, f, yx).
    fused = s[conv].fuse(y, x)

    cfg = autotvm.get_config()
    cfg.define_split("tile_n", n, num_outputs=4)
    cfg.define_split("tile_f", f, num_outputs=4)
    cfg.define_split("tile_rc", rc, num_outputs=3)
    cfg.define_split("tile_ry", ry, num_outputs=3)
    cfg.define_split("tile_rx", rx, num_outputs=3)
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    cfg.define_knob("unroll_explicit", [0, 1])
    ##### space definition end #####

    # inline padding
    pad_data = s[conv].op.input_tensors[0]
    s[pad_data].compute_inline()
    data, raw_data = pad_data, data

    output = conv
    OL = s.cache_write(conv, 'local')

    # create cache stage
    AA = s.cache_read(data, 'shared', [OL])
    WW = s.cache_read(kernel, 'shared', [OL])
    AL = s.cache_read(AA, 'local', [OL])
    WL = s.cache_read(WW, 'local', [OL])

    # tile and bind spatial axes
    n, f, y, x = s[output].op.axis
    yx = s[output].fuse(y, x)
    bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    kernel_scope = yx

    s[output].bind(yx, tvm.thread_axis("blockIdx.z"))
    s[output].bind(bn, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bf, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vn, tvm.thread_axis("vthread"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(tn, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(yx, bn, bf, vn, vf, tn, tf, ni, fi)
    s[OL].compute_at(s[output], tf)

    # tile reduction axes
    n, f, yx = s[OL].op.axis
    rc, ry, rx = s[OL].op.reduce_axis
    rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
    ryo, rym, ryi = cfg['tile_ry'].apply(s, OL, ry)
    rxo, rxm, rxi = cfg['tile_rx'].apply(s, OL, rx)
    s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, yx)

    s[AA].compute_at(s[OL], rxo)
    s[WW].compute_at(s[OL], rxo)
    s[AL].compute_at(s[OL], rxm)
    s[WL].compute_at(s[OL], rxm)

    # cooperative fetching
    for load in [AA, WW]:
        n, f, y, x = s[load].op.axis
        fused = s[load].fuse(n, f, y, x)
        ty, fused = s[load].split(fused, nparts=cfg["tile_n"].size[2])
        tx, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

    # tune unroll
    s[output].pragma(kernel_scope, 'auto_unroll_max_step',
                     cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit',
                     cfg['unroll_explicit'].val)

    return s, [raw_data, kernel, conv]
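# How a template like this is typically driven (a sketch only, assuming the
# old-style TVM/autotvm API used above; the layer shape, trial count, and log
# file name are illustrative, not taken from the original code):
if __name__ == "__main__":
    # register the schedule function as an autotvm template
    template = autotvm.template(conv2d_batching)
    task = autotvm.task.create(template,
                               args=(16, 14, 14, 512, 256, 3, 3, 1, 1),
                               target="cuda")
    print(task.config_space)

    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4))

    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(n_trial=20,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file("conv2d_batching.log")])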
    data.set_op_relation(op)
    return data


if __name__ == "__main__":
    from flextensor.nn import conv2d_nchw

    N = 16
    C = 256
    H = 14
    W = 14
    K = 512
    k = 3
    Img = tvm.te.placeholder([N, C, H, W], dtype="float32")
    Kernel = tvm.te.placeholder([K, C, k, k], dtype="float32")
    Outputs = conv2d_nchw(Img, Kernel, None, 1, 1, 1, 1)

    a = tvm.te.placeholder([4, 4])
    b = tvm.te.compute([4, 4],
                       lambda i, j: a[(3 * i + j) * 4 - 12 * i - 4 * j + i, j])

    data = build(Outputs.op)
    var1 = IndexNode(Outputs.op, 0, "spatial")
    var2 = Outputs.op.axis[0].var
    var3 = Outputs.op.body[0].source[0].a.args[0]
    print(var1, var2, var3)
    tmp = {}
    tmp[var1] = 999
    print(tmp[var2], tmp[var3])
    print(str(data))
def main():
    batch = 2
    dtype = "float64"
    img = tvm.te.placeholder([batch, 1, 32, 32], dtype=dtype, name="img")
    label = tvm.te.placeholder([batch, 10], dtype=dtype, name="label")
    weight_1 = tvm.te.placeholder([6, 1, 5, 5], dtype=dtype, name="w1")
    weight_2 = tvm.te.placeholder([16, 6, 5, 5], dtype=dtype, name="w2")
    weight_3 = tvm.te.placeholder([120, 16, 5, 5], dtype=dtype, name="w3")
    weight_4 = tvm.te.placeholder([120, 84], dtype=dtype, name="w4")
    weight_5 = tvm.te.placeholder([84, 10], dtype=dtype, name="w5")

    act = tanh  # ReLU

    # forward pass: LeNet-style network ending in softmax + MSE loss
    t1 = conv2d_nchw(img, weight_1, None, 1, 0, 1, 1)
    t2 = act(t1)
    t3 = avgpool(t2)
    t4 = conv2d_nchw(t3, weight_2, None, 1, 0, 1, 1)
    t5 = act(t4)
    t6 = avgpool(t5)
    t7 = conv2d_nchw(t6, weight_3, None, 1, 0, 1, 1)
    t8 = act(t7)
    # t9 = avgpool(t8)
    t10 = flatten_gemm(t8, weight_4)
    t11 = gemm(t10, weight_5)
    t12 = softmax(t11)
    # t13 = sum_all(t12)
    t13 = mse_loss(t12, label)

    # reverse-mode gradients of the loss w.r.t. all weights
    d1, d2, d3, d4, d5 = tvm.te.mygradient(
        t13, [weight_1, weight_2, weight_3, weight_4, weight_5])

    s = tvm.te.create_schedule([t13.op, d1.op, d2.op, d3.op, d4.op, d5.op])

    func = tvm.build(s, [
        img, label, weight_1, weight_2, weight_3, weight_4, weight_5,
        t13, d1, d2, d3, d4, d5
    ], target="llvm")

    free_vars = [weight_1, weight_2, weight_3, weight_4, weight_5]
    gradients = [d1, d2, d3, d4, d5]

    params = []
    for var in free_vars:
        shape = to_tuple(var.shape)
        var_np = np.random.uniform(-100, 100, shape).astype(dtype)
        params.append(var_np)

    img_np = np.random.uniform(-10, 10, to_tuple(img.shape)).astype(dtype)
    label_np = np.random.uniform(-10, 10, to_tuple(label.shape)).astype(dtype)
    ret_np = np.zeros(to_tuple(t13.shape)).astype(dtype)

    inits = []
    for var in gradients:
        shape = to_tuple(var.shape)
        var_np = np.zeros(shape).astype(dtype)
        inits.append(var_np)

    ctx = tvm.device("llvm")
    img_tvm = tvm.nd.array(img_np, ctx)
    label_tvm = tvm.nd.array(label_np, ctx)
    ret_tvm = tvm.nd.array(ret_np, ctx)
    free_vars_tvm = [tvm.nd.array(x, ctx) for x in params]
    gradients_tvm = [tvm.nd.array(x, ctx) for x in inits]

    func(img_tvm, label_tvm, *free_vars_tvm, ret_tvm, *gradients_tvm)

    # compare the loss value and gradients against a PyTorch reference
    ret_torch, grad_torch = pytorch_result(img_np, label_np, params)

    print(ret_tvm)
    print(ret_torch)
    tvm.testing.assert_allclose(ret_tvm.asnumpy(), ret_torch.detach().numpy(),
                                atol=1e-3, rtol=1e-5)
    for i in range(len(gradients_tvm)):
        print("grad_torch", i, grad_torch[i].detach().T.numpy())
        if i > 2:
            # the dense-layer weights are stored transposed relative to PyTorch
            tvm.testing.assert_allclose(gradients_tvm[i].asnumpy(),
                                        grad_torch[i].detach().T.numpy(),
                                        atol=1e-3, rtol=1e-5)
        else:
            tvm.testing.assert_allclose(gradients_tvm[i].asnumpy(),
                                        grad_torch[i].detach().numpy(),
                                        atol=1e-3, rtol=1e-5)
    print("Compare to PyTorch success!")
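# Entry point (assuming this snippet is run as a standalone script):
if __name__ == "__main__":
    main()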