def pool_dsp_schedule(outs, layout):
    """Schedule function for v7e-m DSP instructions of pooling.

    Parameters
    ----------
    outs : te.tensor.Tensor or list of te.tensor.Tensor
        Output tensor(s) of the pooling compute.
    layout : str
        Data layout of the pooling op: "NWC"/"NHWC" select the maxpool
        micro-kernel schedulers, "NCW"/"NCHW" the avgpool ones.

    Returns
    -------
    s : te.schedule.Schedule
        The computed schedule.
    """
    # Accept a single tensor as well as a list, matching the other schedule
    # entry points in this file (e.g. schedule_conv2d_winograd_impl).
    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
    s = te.create_schedule([x.op for x in outs])

    def _callback(op):
        in_dtype = op.input_tensors[0].dtype
        if "pool_max" in op.tag:
            # The DSP maxpool micro-kernel only handles int8 inputs; fall
            # back to the default schedule (with a warning) otherwise.
            if in_dtype != "int8":
                logger.warning("Does not have micro-kernel for %s maxpool.", in_dtype)
            elif layout == "NWC":
                schedule_maxpool_1d_nwc(s, op)
            elif layout == "NHWC":
                schedule_maxpool_2d_nhwc(s, op)
        elif "pool_sum" in op.tag:
            # avgpool is computed as pool_sum; its micro-kernel only
            # handles int16 inputs.
            if in_dtype != "int16":
                logger.warning("Does not have micro-kernel for %s avgpool.", in_dtype)
            elif layout == "NCW":
                schedule_avgpool_1d_ncw(s, op)
            elif layout == "NCHW":
                schedule_avgpool_2d_nchw(s, op)

    traverse_inline(s, outs[-1].op, _callback)
    return s
def schedule_conv2d_winograd_impl(cfg, outs, tag, pre_computed=False):
    """Build a schedule that applies the winograd conv2d scheduler to every
    op in the dataflow graph whose tag matches *tag*."""
    # Normalize a single output tensor into a list.
    if isinstance(outs, te.tensor.Tensor):
        outs = [outs]
    schedule = te.create_schedule([tensor.op for tensor in outs])

    def _schedule_tagged(op):
        # Only the op carrying the requested winograd tag is scheduled;
        # everything else is left for traverse_inline to handle.
        if op.tag == tag:
            schedule_conv2d_winograd(cfg, schedule, op.output(0), pre_computed=pre_computed)

    traverse_inline(schedule, outs[0].op, _schedule_tagged)
    return schedule
def conv2d_nhwc_dsp_schedule(cfg, outs):
    """Schedule function for v7e-m DSP instructions of conv2d.

    Tiles the output-width and channel axes per the AutoTVM config *cfg*,
    then tensorizes the inner tile with the MxKxN GEMM micro-kernel and
    injects its C implementation via an ``import_c`` pragma.
    """
    sched = te.create_schedule([x.op for x in outs])

    def _callback(op):
        if "conv2d_nhwc" not in op.tag:
            return

        # extract tensors
        output = op.output(0)
        conv = op
        data_vec = conv.input_tensors[0]
        kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
        last = outs[0]  # pylint: disable=unused-variable

        # Dig the width index expression out of the compute body to recover
        # the horizontal stride: if the index is a Mul, its constant factor
        # is the stride; otherwise the stride is 1.
        # NOTE(review): this walk assumes a fixed compute-body structure
        # produced by the matching conv2d compute definition — confirm if
        # that definition changes.
        source_index_w = output.op.body[0].source[0].a.value.indices[2].a
        stride_w = source_index_w.b.value if isinstance(source_index_w, Mul) else 1

        # tile reduction axes
        n, oh, ow, co = sched[conv].op.axis
        kh, kw, ci = sched[conv].op.reduce_axis

        # Micro-kernel dimensions come from the innermost tile sizes.
        M = cfg["tile_ow"].size[-1]
        K = cfg["tile_ci"].size[-1]
        N = cfg["tile_co"].size[-1]

        owo, owi = cfg["tile_ow"].apply(sched, conv, ow)
        cio, cii = cfg["tile_ci"].apply(sched, conv, ci)
        coo, coi = cfg["tile_co"].apply(sched, conv, co)
        cfg["reorder_0_simd"].apply(sched, conv, [n, oh, owo, owi, coo, coi, kh, kw, cio, cii])

        # Replace the inner (owi, coi, cii) loop nest with the DSP GEMM
        # intrinsic and attach its generated C source to the kernel.
        gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype, stride_w)
        sched[output].tensorize(owi, gemm)
        sched[output].pragma(n, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id))

        # this is the scope to attach global config inside this kernel
        kernel_scope = n

        # tune unroll
        sched[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
        sched[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)

    traverse_inline(sched, outs[-1].op, _callback)
    return sched
def dense_dsp_schedule(outs):
    """Schedule function for v7e-m DSP instructions of dense."""
    s = te.create_schedule([tensor.op for tensor in outs])

    def _schedule_dense(op):
        if "dense" not in op.tag:
            return

        # The dense op's output and its two inputs (data and weight).
        out = op.output(0)
        data = op.input_tensors[0]
        weight = op.input_tensors[1]

        # GEMM dimensions: data is (M, K), weight is (N, K).
        rows, depth = data.shape
        cols, _ = weight.shape

        # Split the first spatial axis with nparts=1 so the whole
        # computation sits under a single inner axis that can be handed
        # to the GEMM intrinsic.
        first_axis, _ = s[op].op.axis
        outer, inner = s[op].split(first_axis, nparts=1)

        # Tensorize with the MxKxN micro-kernel and attach its C source.
        gemm, uniq_id = intrin_gemm_MxKxN(rows, depth, cols, data.dtype, out.dtype)
        s[out].tensorize(inner, gemm)
        s[out].pragma(outer, "import_c", gemm_MxKxN_impl(rows, depth, cols, uniq_id))

    traverse_inline(s, outs[-1].op, _schedule_dense)
    return s
def conv2d_direct_nhwc_schedule(cfg, outs):
    """Schedule function for directly-scheduled conv2d on NHWC layout.

    Tiles the output channel/height/width axes per the AutoTVM config
    *cfg*, applies the configured reorder and unroll/vectorize
    annotations, and inlines the padding stage.
    """
    # BUGFIX: use te.create_schedule, consistent with every other schedule
    # function in this file; tvm.create_schedule is the legacy pre-tvm.te
    # alias and is absent from modern TVM.
    sched = te.create_schedule([x.op for x in outs])

    def _callback(op):
        if "conv2d_nhwc" not in op.tag:
            return

        ### extract tensors ###
        output = op.output(0)
        conv = op
        data_vec = conv.input_tensors[0]
        kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
        last = outs[0]  # pylint: disable=unused-variable

        # tile reduction axes
        n, oh, ow, co = sched[conv].op.axis
        kh, kw, ci = sched[conv].op.reduce_axis

        # NOTE we can't inline data padding in the SIMD path, because it
        # introduces conditionals in the inner loop.
        data_pad = data_vec.op
        sched[data_pad].compute_inline()

        # Tile spatial/channel axes and apply the tuned loop order.
        co, vc = cfg["tile_co"].apply(sched, conv, co)
        oh, vh = cfg["tile_oh"].apply(sched, conv, oh)
        ow, vw = cfg["tile_ow"].apply(sched, conv, ow)
        cfg["reorder_0"].apply(sched, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc])

        # Tuned unroll/vectorize annotations on the reduction and inner
        # spatial axes.
        cfg["ann_reduce"].apply(
            sched,
            conv,
            [kh, kw],
            axis_lens=[get_const_int(kh.dom.extent), get_const_int(kw.dom.extent)],
            max_unroll=8,
            cfg=cfg,
        )
        cfg["ann_spatial"].apply(
            sched,
            conv,
            [vh, vw, vc],
            axis_lens=[cfg["tile_oh"].size[-1], cfg["tile_ow"].size[-1], cfg["tile_co"].size[-1]],
            max_unroll=8,
            cfg=cfg,
        )

        kernel_scope = n  # this is the scope to attach global config inside this kernel

        # tune unroll
        sched[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
        sched[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)

    traverse_inline(sched, outs[-1].op, _callback)
    return sched