def _create_axis_record(attrs, inputs):
    """Build a per-dimension record for a reduce op: entry i is i if the dimension
    survives the reduction and -1 if it is reduced; reduced entries are dropped
    entirely when keepdims is False."""
    axes = attrs.axis if attrs.axis is None else list(get_const_tuple(attrs.axis))
    exclude = get_const_int(attrs.exclude) > 0
    keepdims = get_const_int(attrs.keepdims) > 0
    data_shape = inputs[0]
    shape_size = data_shape.shape[0].value
    axis_record = [-1] * shape_size
    if axes is None:
        axes = list(range(shape_size))

    # Normalize negative axes
    for i, axis in enumerate(axes):
        if axis < 0:
            axes[i] = shape_size + axis

    # With exclude, reduce every axis that is *not* listed
    if exclude:
        ex_axes = []
        for i in range(shape_size):
            if i not in axes:
                ex_axes.append(i)
        axes = ex_axes

    for i in range(shape_size):
        if i not in axes:
            axis_record[i] = i

    if not keepdims:
        tmp = []
        for i in axis_record:
            if i >= 0:
                tmp.append(i)
        axis_record = tmp

    return axis_record
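
# A hedged worked example (plain Python, not TVM code): the helper below is an
# illustrative re-implementation of the axis-record rule above so the return value
# is easy to see for a rank-4 input. The names here are hypothetical demo helpers.
def _axis_record_demo(ndim, axes, exclude=False, keepdims=False):
    axes = list(range(ndim)) if axes is None else [a % ndim for a in axes]
    if exclude:
        axes = [i for i in range(ndim) if i not in axes]
    record = [-1 if i in axes else i for i in range(ndim)]
    return record if keepdims else [i for i in record if i >= 0]


assert _axis_record_demo(4, (1,)) == [0, 2, 3]  # dims kept after reducing axis 1
assert _axis_record_demo(4, (1,), keepdims=True) == [0, -1, 2, 3]
assert _axis_record_demo(4, (1,), exclude=True) == [1]  # reduce everything except axis 1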
def split_shape_func(attrs, inputs, _):
    """
    Shape function for split op.
    """
    if isinstance(attrs.indices_or_sections, (int, tvm.tir.IntImm)):
        indices_or_sections = get_const_int(attrs.indices_or_sections)
        assert indices_or_sections > 0, "Slice count must be > 0"
    else:
        indices_or_sections = list(get_const_tuple(attrs.indices_or_sections))
        assert sorted(indices_or_sections)[0] > 0 and indices_or_sections == sorted(
            indices_or_sections
        ), "split_indices must be sorted"

    axis = get_const_int(attrs.axis)
    if axis < 0:
        axis += get_const_int(inputs[0].shape[0])

    num_out = (
        indices_or_sections
        if isinstance(indices_or_sections, int)
        else len(indices_or_sections) + 1
    )
    if isinstance(indices_or_sections, int):
        indices_or_sections = [indices_or_sections]

    return [
        _split_shape_func(inputs[0], convert(i), convert(indices_or_sections), convert(axis))
        for i in range(num_out)
    ]
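
# Hedged sketch (plain Python, no TVM): the output-count rule split_shape_func applies.
# An integer means N equal sections; a list of K split points yields K + 1 outputs.
# _num_split_outputs is an illustrative helper, not part of the TVM codebase.
def _num_split_outputs(indices_or_sections):
    if isinstance(indices_or_sections, int):
        return indices_or_sections  # N equal sections -> N outputs
    return len(indices_or_sections) + 1  # K split points -> K + 1 outputs


assert _num_split_outputs(4) == 4
assert _num_split_outputs([2, 5, 8]) == 4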
def _compute_get_valid_counts(attrs, inputs, out_type):
    score_threshold = inputs[1]
    id_index = get_const_int(attrs.id_index)
    score_index = get_const_int(attrs.score_index)
    if attrs.score_threshold is not None:
        score_threshold = get_const_float(attrs.score_threshold)
    return topi_compute(inputs[0], score_threshold, id_index, score_index)
def _schedule(cfg, s, data_vec, weight_vec, output):
    s[data_vec].parallel(s[data_vec].op.axis[0])
    s[weight_vec].parallel(s[weight_vec].op.axis[0])
    y, x = s[output].op.axis
    wb, db, k = s[output].op.reduce_axis

    yo, yi = cfg["tile_y"].apply(s, output, y)
    xo, xi = cfg["tile_x"].apply(s, output, x)
    ko, ki = cfg["tile_k"].apply(s, output, k)

    cfg["reorder_0"].apply(s, output, [yo, xo, ko, yi, wb, db, ki, xi])
    cfg["ann_reduce"].apply(
        s,
        output,
        [db, wb],
        axis_lens=[get_const_int(db.dom.extent), get_const_int(wb.dom.extent)],
        max_unroll=8,
        cfg=cfg,
    )
    cfg["ann_spatial"].apply(
        s,
        output,
        [yi, xi],
        axis_lens=[cfg["tile_y"].size[-1], cfg["tile_x"].size[-1]],
        max_unroll=8,
        cfg=cfg,
    )

    s[output].vectorize(xi)
    s[output].parallel(yo)
    return s
def _compute_argsort(attrs, inputs, _):
    axis = get_const_int(attrs.axis)
    is_ascend = bool(get_const_int(attrs.is_ascend))
    dtype = attrs.dtype
    return [topi_compute(inputs[0], axis=axis, is_ascend=is_ascend, dtype=dtype)]
def expand_dim_shape_func(attrs, inputs, _):
    """
    Shape function for expand_dim op.
    """
    axis = get_const_int(attrs.axis)
    num_newaxis = get_const_int(attrs.num_newaxis)
    if axis < 0:
        axis = inputs[0].shape[0] + axis + 1
    ndim = inputs[0].shape[0] if inputs[0].shape else 0
    return [_expand_dim_shape_func(inputs[0], convert(ndim), convert(axis), convert(num_newaxis))]
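
# Hedged note (plain Python): expand_dims resolves a negative axis against the
# ndim + 1 possible insertion slots, so for a rank-3 input axis=-1 maps to slot 3,
# i.e. the new axis is appended after the last dimension.
_demo_ndim, _demo_axis = 3, -1
assert _demo_ndim + _demo_axis + 1 == 3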
def gather_nd_shape_func(attrs, inputs, _):
    """
    Shape func for gather_nd operator.
    """
    batch_dims = get_const_int(attrs.batch_dims)
    index_rank = get_const_int(attrs.index_rank)

    assert index_rank > 0, "index_rank needs to be specified for dynamic gather_nd"

    return [_gather_nd_shape(inputs[0], inputs[1], convert(batch_dims), convert(index_rank))]
def _callback(op):
    if "conv2d_nhwc" not in op.tag:
        return

    ### extract tensors ###
    output = op.output(0)
    conv = op
    data_vec = conv.input_tensors[0]
    kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
    last = outs[0]  # pylint: disable=unused-variable

    # tile reduction axes
    n, oh, ow, co = sched[conv].op.axis
    kh, kw, ci = sched[conv].op.reduce_axis

    # NOTE we can't inline data padding in the SIMD path, because it
    # introduces conditionals in the inner loop.
    data_pad = data_vec.op
    sched[data_pad].compute_inline()

    co, vc = cfg["tile_co"].apply(sched, conv, co)
    oh, vh = cfg["tile_oh"].apply(sched, conv, oh)
    ow, vw = cfg["tile_ow"].apply(sched, conv, ow)
    cfg["reorder_0"].apply(sched, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc])
    cfg["ann_reduce"].apply(
        sched,
        conv,
        [kh, kw],
        axis_lens=[get_const_int(kh.dom.extent), get_const_int(kw.dom.extent)],
        max_unroll=8,
        cfg=cfg,
    )
    cfg["ann_spatial"].apply(
        sched,
        conv,
        [vh, vw, vc],
        axis_lens=[cfg["tile_oh"].size[-1], cfg["tile_ow"].size[-1], cfg["tile_co"].size[-1]],
        max_unroll=8,
        cfg=cfg,
    )

    kernel_scope = n  # this is the scope to attach global config inside this kernel

    # tune unroll
    sched[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
    sched[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val)
def _compute_topk(attrs, inputs, out_type):
    if attrs.k is not None:
        k = attrs.k
    else:
        k = inputs[1]
    axis = get_const_int(attrs.axis)
    ret_type = attrs.ret_type
    is_ascend = bool(get_const_int(attrs.is_ascend))
    dtype = attrs.dtype
    out = topi_compute(inputs[0], k, axis, ret_type, is_ascend, dtype)
    out = out if isinstance(out, list) else [out]
    return out
def _compute_multibox_transform_loc(attrs, inputs, _):
    """Compute definition of multibox_transform_loc"""
    clip = bool(get_const_int(attrs.clip))
    threshold = get_const_float(attrs.threshold)
    variances = get_float_tuple(attrs.variances)
    return topi_compute(inputs[0], inputs[1], inputs[2], clip, threshold, variances)
def _compute_multibox_prior(attrs, inputs, _):
    """Compute definition of multibox_prior"""
    sizes = get_float_tuple(attrs.sizes)
    ratios = get_float_tuple(attrs.ratios)
    steps = get_float_tuple(attrs.steps)
    offsets = get_float_tuple(attrs.offsets)
    clip = bool(get_const_int(attrs.clip))
    return [topi_compute(inputs[0], sizes, ratios, steps, offsets, clip)]
def stack_shape_func(attrs, inputs, _):
    """
    Shape func for stack.
    """
    axis = get_const_int(attrs.axis)
    if axis < 0:
        axis += inputs[0].shape[0] + 1
    return [_stack_shape_func(inputs[0], convert(axis), convert(len(inputs)))]
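
# Hedged note (plain Python): stack resolves a negative axis against the *output*
# rank (input rank + 1). Stacking rank-3 tensors with axis=-1 therefore places the
# new axis at position 3, the last axis of the rank-4 result.
_demo_input_ndim, _demo_stack_axis = 3, -1
assert _demo_stack_axis + _demo_input_ndim + 1 == 3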
def repeat_shape_func(attrs, inputs, _):
    """
    Shape func for repeat.
    """
    axis = get_const_int(attrs.axis)
    if axis < 0:
        axis = inputs[0].shape[0] + axis
    return [_repeat_shape_func(inputs[0], attrs.repeats, convert(axis))]
def take_shape_func(attrs, inputs, out_ndims):
    """
    Shape function for take op.
    """
    if attrs.axis is None:
        return [_take_no_axis_shape_func(inputs[1], out_ndims[0])]
    axis = get_const_int(attrs.axis)
    batch_dims = get_const_int(attrs.batch_dims)
    data_ndim = int(inputs[0].shape[0])
    if inputs[1].shape:
        indices_ndim = int(inputs[1].shape[0])
    if axis < 0:
        axis += data_ndim
    assert 0 <= axis < data_ndim
    if batch_dims < 0:
        batch_dims += indices_ndim
    return [_take_with_axis_shape_func(*inputs, convert(axis), convert(batch_dims), out_ndims[0])]
def bitpack(data, bits, pack_type="int8", name="bitpack"):
    """Packs the lowest dimension into the format needed by VTA.

    Parameters
    ----------
    data : Tensor
        The input tensor to pack.

    bits : int
        Number of bits per packed element.

    pack_type : str
        Data type of the packed words ("int8", "int16" or "int32").

    name : str
        Name of the resulting compute op.

    Returns
    -------
    packed : Tensor
        The packed tensor.
    """
    shape_vec = list(data.shape)
    if pack_type == "int8":
        data_width = 8
    elif pack_type == "int16":
        data_width = 16
    elif pack_type == "int32":
        data_width = 32
    else:
        raise RuntimeError("Unknown pack type %s" % pack_type)
    assert data_width % bits == 0
    lanes = data_width // bits

    # Data must be in multiples of the data_width
    assert utils.get_const_int(shape_vec[-1]) % lanes == 0, "Not a multiple of word size"
    shape_vec[-1] = shape_vec[-1] // lanes
    oshape = tuple(shape_vec)

    def _bitpack(*indices):
        ret = None
        mask = tvm.tir.const((1 << bits) - 1, pack_type)
        for k in range(lanes):
            idx = list(indices)
            idx[-1] = idx[-1] * lanes + k
            elem = data(*idx).astype(pack_type)
            if k == 0:
                ret = elem & mask
            else:
                val = (elem & mask) << tvm.tir.const(k * bits, pack_type)
                ret = ret | val
        return ret

    return te.compute(oshape, _bitpack, name=name, tag="bitpack")
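
# Hedged worked example (plain Python, not VTA code): the bit layout _bitpack produces
# for bits=2, pack_type="int8" (lanes = 8 // 2 = 4). Four consecutive 2-bit values along
# the last axis are OR-ed into one word, with value k shifted left by k * bits.
# _pack_word_demo is an illustrative helper, not part of the codebase.
def _pack_word_demo(values, bits):
    mask = (1 << bits) - 1
    word = 0
    for k, value in enumerate(values):
        word |= (value & mask) << (k * bits)
    return word


assert _pack_word_demo([1, 2, 3, 0], bits=2) == 0b00111001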
def _compute_nms(attrs, inputs, out_type):
    max_output_size = inputs[3]
    iou_threshold = inputs[4]
    if attrs.max_output_size is not None:
        max_output_size = attrs.max_output_size
    if attrs.iou_threshold is not None:
        iou_threshold = get_const_float(attrs.iou_threshold)
    return_indices = bool(get_const_int(attrs.return_indices))
    force_suppress = bool(get_const_int(attrs.force_suppress))
    top_k = get_const_int(attrs.top_k)
    coord_start = get_const_int(attrs.coord_start)
    score_index = get_const_int(attrs.score_index)
    id_index = get_const_int(attrs.id_index)
    invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom))
    if return_indices:
        return topi_compute(
            inputs[0],
            inputs[1],
            inputs[2],
            max_output_size,
            iou_threshold,
            force_suppress,
            top_k,
            coord_start,
            score_index,
            id_index,
            return_indices,
            invalid_to_bottom,
        )
    return [
        topi_compute(
            inputs[0],
            inputs[1],
            inputs[2],
            max_output_size,
            iou_threshold,
            force_suppress,
            top_k,
            coord_start,
            score_index,
            id_index,
            return_indices,
            invalid_to_bottom,
        )
    ]
def _compute_proposal(attrs, inputs, out_type):
    scales = get_float_tuple(attrs.scales)
    ratios = get_float_tuple(attrs.ratios)
    feature_stride = attrs.feature_stride
    threshold = attrs.threshold
    rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n
    rpn_post_nms_top_n = attrs.rpn_post_nms_top_n
    rpn_min_size = attrs.rpn_min_size
    iou_loss = bool(get_const_int(attrs.iou_loss))
    return [
        topi_compute(
            inputs[0],
            inputs[1],
            inputs[2],
            scales,
            ratios,
            feature_stride,
            threshold,
            rpn_pre_nms_top_n,
            rpn_post_nms_top_n,
            rpn_min_size,
            iou_loss,
        )
    ]
def test_util():
    x = tvm.tir.const(100, "int32")
    assert utils.get_const_int(x) == 100
    assert utils.get_const_tuple((x, x)) == (100, 100)
def schedule_conv2d_winograd(cfg, s, output, pre_computed):
    """Schedule winograd template"""
    inverse = s[output].op.input_tensors[0]
    bgemm, A = s[inverse].op.input_tensors
    kernel_pack, data_pack_trans = s[bgemm].op.input_tensors
    data_pack = s[data_pack_trans].op.input_tensors[0]
    input_tile, B = s[data_pack].op.input_tensors
    pad_data = s[input_tile].op.input_tensors[0]

    # data transform
    s[B].compute_inline()
    s[A].compute_inline()

    # probably will improve real topology execution
    if autotvm.GLOBAL_SCOPE.in_tuning:
        # Padding to texture
        AA = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [input_tile])
        bind_data_copy(s[AA])

    s[input_tile].compute_inline()

    OL = s.cache_write(data_pack, "local")
    c, p, eps, nu, cb = s[data_pack].op.axis
    fused = s[data_pack].fuse(c, p, eps, nu)
    bx, tx = s[data_pack].split(fused, 128)
    s[data_pack].vectorize(cb)
    s[data_pack].bind(bx, te.thread_axis("blockIdx.x"))
    s[data_pack].bind(tx, te.thread_axis("threadIdx.x"))

    _, _, eps, nu, cb = s[OL].op.axis
    r_a, r_b = s[OL].op.reduce_axis
    s[OL].unroll(eps)
    s[OL].unroll(nu)
    s[OL].unroll(r_a)
    s[OL].unroll(r_b)
    s[OL].vectorize(cb)
    s[OL].compute_at(s[data_pack], tx)
    s[data_pack].set_scope(get_texture_storage(data_pack.shape))
    s[data_pack_trans].compute_inline()

    # transform kernel
    if not pre_computed:
        kernel, G = s[kernel_pack].op.input_tensors
        eps, nu, ci, co, cob = s[kernel_pack].op.axis
        if autotvm.GLOBAL_SCOPE.in_tuning:
            # skip this part during tuning to make records accurate
            # this part will be pre-computed during pre-compute optimization pass
            s[G].pragma(s[G].op.axis[0], "debug_skip_region")
            s[kernel_pack].pragma(eps, "debug_skip_region")
        else:
            s[G].compute_inline()
            r_a, r_b = s[kernel_pack].op.reduce_axis
            for axis in [eps, nu, r_a, r_b]:
                s[kernel_pack].unroll(axis)

            fused = s[kernel_pack].fuse(ci, co)
            bb, tt = s[kernel_pack].split(fused, 128)
            s[kernel_pack].reorder(bb, tt, eps, nu, r_a, r_b, cob)
            s[kernel_pack].vectorize(cob)
            s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x"))
            s[kernel_pack].bind(tt, te.thread_axis("threadIdx.x"))
    else:
        kernel = kernel_pack

    if isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag:
        # manage scheduling of datacopy
        pack_data = pad_data.op.input_tensors[0]
        bind_data_copy(s[pack_data])
        bind_data_copy(s[kernel])
    elif isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag:
        s[kernel].compute_inline()
    s[pad_data].compute_inline()

    ##### space definition begin #####
    cfg.define_knob("auto_unroll_max_step", [0, 4, 16])
    b1, b2, y, x, cb = s[bgemm].op.axis
    rcc = s[bgemm].op.reduce_axis[0]
    alpha = get_const_int(b1.dom.extent)
    cfg.define_split(
        "tile_y", y, num_outputs=3, filter=lambda entry: entry.size[2] <= 64 and entry.size[1] <= 16
    )
    min_x_div = 1
    for bn in range(4, 0, -1):
        if bgemm.shape[3] % bn == 0:
            min_x_div = bn
            break

    cfg.define_split(
        "tile_x",
        x,
        num_outputs=3,
        filter=lambda entry: entry.size[2] <= 64
        and entry.size[1] >= min_x_div
        and entry.size[1] <= 16,
    )
    cfg.define_split("tile_rc", rcc, num_outputs=2)
    # TODO: Uncomment the following lines when multi_filter will be introduced
    # cfg.multi_filter(
    #     filter=lambda entity: entity["tile_y"].size[2] * entity["tile_x"].size[2] in range(32,1024)
    # )
    ##### space definition end #####

    # batch gemm
    OL = s.cache_write(bgemm, "local")
    if (
        autotvm.GLOBAL_SCOPE.in_tuning
        or isinstance(kernel.op, tvm.te.ComputeOp)
        and "filter_pack" in kernel.op.tag
    ):
        BB = s.cache_read(kernel_pack, get_texture_storage(kernel_pack.shape), [OL])
        bind_data_copy(s[BB])

    by = s[bgemm].fuse(b1, b2, y)

    # tile and bind spatial axes
    bgemm_scope, by = s[bgemm].split(by, nparts=1)
    by, vy, ty = cfg["tile_y"].apply(s, bgemm, by)
    bx, vx, tx = cfg["tile_x"].apply(s, bgemm, x)
    s[bgemm].bind(by, te.thread_axis("blockIdx.y"))
    s[bgemm].bind(bx, te.thread_axis("blockIdx.x"))
    s[bgemm].bind(vy, te.thread_axis("vthread"))
    s[bgemm].bind(vx, te.thread_axis("vthread"))
    s[bgemm].bind(ty, te.thread_axis("threadIdx.y"))
    s[bgemm].bind(tx, te.thread_axis("threadIdx.x"))
    s[bgemm].reorder(bgemm_scope, by, bx, vy, vx, ty, tx, cb)
    s[bgemm].vectorize(cb)
    s[bgemm].set_scope(get_texture_storage(bgemm.shape))

    # tile reduction axes
    s[OL].compute_at(s[bgemm], tx)
    b1, b2, y, x, cb = s[OL].op.axis
    (rcc, rcb) = s[OL].op.reduce_axis
    b = s[OL].fuse(b1, b2)
    s[OL].reorder(b, y, x, rcc, rcb, cb)
    # s[OL].unroll(rcb)
    s[OL].pragma(rcb, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
    s[OL].pragma(rcb, "unroll_explicit", True)
    s[OL].vectorize(cb)

    # schedule inverse, output and fusion
    if output.op in s.outputs:
        OL = None
    else:
        OL = output
        s[OL].set_scope("local")
        output = s.outputs[0]

    if len(s[output].op.axis) == 4:
        n, co, h, w = s[output].op.axis
        cb = None
    else:
        n, co, h, w, cb = s[output].op.axis

    inverse_scope, n = s[output].split(n, nparts=1)

    fused = s[output].fuse(n, co, h, w)
    bb, tt = s[output].split(fused, 128)
    if cb is not None:
        s[output].reorder(bb, tt, cb)
        s[output].vectorize(cb)
    s[output].bind(bb, te.thread_axis("blockIdx.x"))
    s[output].bind(tt, te.thread_axis("threadIdx.x"))

    if OL is not None:
        s[OL].compute_at(s[output], tt)

    co, p, vh, vw, cb = s[inverse].op.axis
    r_a, r_b = s[inverse].op.reduce_axis
    for axis in [vh, vw, r_a, r_b]:
        s[inverse].unroll(axis)
    s[inverse].vectorize(cb)
    s[inverse].compute_at(s[output], tt)

    return s
def _compute_sort(attrs, inputs, _):
    axis = get_const_int(attrs.axis)
    is_ascend = bool(get_const_int(attrs.is_ascend))
    return [topi_compute(inputs[0], axis=axis, is_ascend=is_ascend)]
def concatenate_shape_func(attrs, inputs, _):
    """
    Shape function for concatenate op.
    """
    axis = get_const_int(attrs.axis)
    if axis < 0:
        axis += inputs[0].shape[0]
    return [_concatenate_shape_func(inputs, convert(axis))]