def _schedule(cfg, s, data_vec, weight_vec, output): s[data_vec].parallel(s[data_vec].op.axis[0]) s[weight_vec].parallel(s[weight_vec].op.axis[0]) y, x = s[output].op.axis wb, db, k = s[output].op.reduce_axis yo, yi = cfg["tile_y"].apply(s, output, y) xo, xi = cfg["tile_x"].apply(s, output, x) ko, ki = cfg["tile_k"].apply(s, output, k) cfg["reorder_0"].apply(s, output, [yo, xo, ko, yi, wb, db, ki, xi]) cfg["ann_reduce"].apply(s, output, [db, wb], axis_lens=[ get_const_int(db.dom.extent), get_const_int(wb.dom.extent) ], max_unroll=8, cfg=cfg) cfg["ann_spatial"].apply( s, output, [yi, xi], axis_lens=[cfg['tile_y'].size[-1], cfg['tile_x'].size[-1]], max_unroll=8, cfg=cfg) s[output].vectorize(xi) s[output].parallel(yo) return s
def split_shape_func(attrs, inputs, _): """ Shape function for split op. """ if isinstance(attrs.indices_or_sections, (int, tvm.tir.IntImm)): indices_or_sections = get_const_int(attrs.indices_or_sections) assert indices_or_sections > 0, "Slice count must be > 0" else: indices_or_sections = list(get_const_tuple(attrs.indices_or_sections)) assert sorted( indices_or_sections)[0] > 0 and indices_or_sections == sorted( indices_or_sections), "split_indices must be sorted" axis = get_const_int(attrs.axis) if axis < 0: axis += get_const_int(inputs[0].shape[0]) num_out = (indices_or_sections if isinstance(indices_or_sections, int) else len(indices_or_sections) + 1) if isinstance(indices_or_sections, int): indices_or_sections = [indices_or_sections] return [ _split_shape_func(inputs[0], convert(i), convert(indices_or_sections), convert(axis)) for i in range(num_out) ]
def _create_axis_record(attrs, inputs): axes = attrs.axis if attrs.axis is None else list( get_const_tuple(attrs.axis)) exclude = get_const_int(attrs.exclude) > 0 keepdims = get_const_int(attrs.keepdims) > 0 data_shape = inputs[0] shape_size = data_shape.shape[0].value axis_record = [-1] * shape_size if axes is None: axes = list(range(shape_size)) for i, axis in enumerate(axes): if axis < 0: axes[i] = shape_size + axis if exclude: ex_axes = [] for i in range(shape_size): if i not in axes: ex_axes.append(i) axes = ex_axes for i in range(shape_size): if i not in axes: axis_record[i] = i if not keepdims: tmp = [] for i in axis_record: if i >= 0: tmp.append(i) axis_record = tmp return axis_record
def _compute_argsort(attrs, inputs, _): axis = get_const_int(attrs.axis) is_ascend = bool(get_const_int(attrs.is_ascend)) dtype = attrs.dtype return [ topi_compute(inputs[0], axis=axis, is_ascend=is_ascend, dtype=dtype) ]
def expand_dim_shape_func(attrs, inputs, _): """ Shape function for expand_dim op. """ axis = get_const_int(attrs.axis) num_newaxis = get_const_int(attrs.num_newaxis) if axis < 0: axis = inputs[0].shape[0] + axis + 1 ndim = inputs[0].shape[0] if inputs[0].shape else 0 return [_expand_dim_shape_func(inputs[0], convert(ndim), convert(axis), convert(num_newaxis))]
def _compute_topk(attrs, inputs, out_type): if attrs.k is not None: k = attrs.k else: k = inputs[1] axis = get_const_int(attrs.axis) ret_type = attrs.ret_type is_ascend = bool(get_const_int(attrs.is_ascend)) dtype = attrs.dtype out = topi_compute(inputs[0], k, axis, ret_type, is_ascend, dtype) out = out if isinstance(out, list) else [out] return out
def _callback(op): if 'conv2d_nhwc' not in op.tag: return ### extract tensors ### output = op.output(0) conv = op data_vec = conv.input_tensors[0] kernel = conv.input_tensors[1] # pylint: disable=unused-variable last = outs[0] # pylint: disable=unused-variable # tile reduction axes n, oh, ow, co = sched[conv].op.axis kh, kw, ci = sched[conv].op.reduce_axis # NOTE we can't inline data padding in the SIMD path, because it # introduces conditionals in the inner loop. data_pad = data_vec.op sched[data_pad].compute_inline() co, vc = cfg['tile_co'].apply(sched, conv, co) oh, vh = cfg['tile_oh'].apply(sched, conv, oh) ow, vw = cfg['tile_ow'].apply(sched, conv, ow) cfg['reorder_0'].apply(sched, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc]) cfg['ann_reduce'].apply(sched, conv, [kh, kw], axis_lens=[ get_const_int(kh.dom.extent), get_const_int(kw.dom.extent) ], max_unroll=8, cfg=cfg) cfg['ann_spatial'].apply(sched, conv, [vh, vw, vc], axis_lens=[ cfg['tile_oh'].size[-1], cfg['tile_ow'].size[-1], cfg['tile_co'].size[-1] ], max_unroll=8, cfg=cfg) kernel_scope = n # this is the scope to attach global config inside this kernel # tune unroll sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
def _compute_multibox_transform_loc(attrs, inputs, _): """Compute definition of multibox_detection""" clip = bool(get_const_int(attrs.clip)) threshold = get_const_float(attrs.threshold) variances = get_float_tuple(attrs.variances) return topi_compute( inputs[0], inputs[1], inputs[2], clip, threshold, variances)
def repeat_shape_func(attrs, inputs, _): """ Shape func for repeat. """ axis = get_const_int(attrs.axis) if axis < 0: axis = inputs[0].shape[0] + axis return [_repeat_shape_func(inputs[0], attrs.repeats, convert(axis))]
def stack_shape_func(attrs, inputs, _): """ Shape func for stack. """ axis = get_const_int(attrs.axis) if axis < 0: axis += inputs[0].shape[0] + 1 return [_stack_shape_func(inputs[0], convert(axis), convert(len(inputs)))]
def _compute_multibox_prior(attrs, inputs, _): """Compute definition of multibox_prior""" sizes = get_float_tuple(attrs.sizes) ratios = get_float_tuple(attrs.ratios) steps = get_float_tuple(attrs.steps) offsets = get_float_tuple(attrs.offsets) clip = bool(get_const_int(attrs.clip)) return [topi_compute(inputs[0], sizes, ratios, steps, offsets, clip)]
def split_shape_func(attrs, inputs, _): """ Shape function for split op. """ if isinstance(attrs.indices_or_sections, (int, tvm.tir.IntImm)): indices_or_sections = get_const_int(attrs.indices_or_sections) else: indices_or_sections = get_const_tuple(attrs.indices_or_sections) axis = get_const_int(attrs.axis) num_out = indices_or_sections if isinstance(indices_or_sections, int) \ else len(indices_or_sections) + 1 if isinstance(indices_or_sections, int): indices_or_sections = [indices_or_sections] return [ _split_shape_func(inputs[0], convert(i), convert(indices_or_sections), convert(axis)) for i in range(num_out) ]
def take_shape_func(attrs, inputs, out_ndims): """ Shape function for take op. """ if attrs.axis is None: return [_take_no_axis_shape_func(inputs[1], out_ndims[0])] axis = get_const_int(attrs.axis) data_ndim = int(inputs[0].shape[0]) if axis < 0: axis += data_ndim assert 0 <= axis < data_ndim return [_take_with_axis_shape_func(*inputs, convert(axis), out_ndims[0])]
def _compute_proposal(attrs, inputs, out_type): scales = get_float_tuple(attrs.scales) ratios = get_float_tuple(attrs.ratios) feature_stride = attrs.feature_stride threshold = attrs.threshold rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n rpn_post_nms_top_n = attrs.rpn_post_nms_top_n rpn_min_size = attrs.rpn_min_size iou_loss = bool(get_const_int(attrs.iou_loss)) return [topi_compute(inputs[0], inputs[1], inputs[2], scales, ratios, feature_stride, threshold, rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_min_size, iou_loss)]
def bitpack(data, bits, pack_type="int8", name="bitpack"): """Packs lowest dimension into format needed by VTA Parameters ---------- pack_axis : int index of the axis to pack in data bit_axis : int index of axis to place bit axis in resulting packed data Returns ------- packed : Tensor The packed tensor. """ shape_vec = list(data.shape) if pack_type == 'int8': data_width = 8 elif pack_type == 'int16': data_width = 16 elif pack_type == 'int32': data_width = 32 else: raise RuntimeError("Unknown pack type %s" % pack_type) assert data_width % bits == 0 lanes = data_width // bits # Data must be in multiples of the data_width assert util.get_const_int( shape_vec[-1]) % lanes == 0, "Not a multiple of word size" shape_vec[-1] = shape_vec[-1] // lanes oshape = tuple(shape_vec) def _bitpack(*indices): ret = None mask = tvm.tir.const((1 << bits) - 1, pack_type) for k in range(lanes): idx = list(indices) idx[-1] = idx[-1] * lanes + k elem = data(*idx).astype(pack_type) if k == 0: ret = elem & mask else: val = (elem & mask) << tvm.tir.const(k * bits, pack_type) ret = ret | val return ret return te.compute(oshape, _bitpack, name=name, tag='bitpack')
def _compute_nms(attrs, inputs, out_type): max_output_size = inputs[3] if attrs.max_output_size is not None: max_output_size = attrs.max_output_size return_indices = bool(get_const_int(attrs.return_indices)) iou_threshold = get_const_float(attrs.iou_threshold) force_suppress = bool(get_const_int(attrs.force_suppress)) top_k = get_const_int(attrs.top_k) coord_start = get_const_int(attrs.coord_start) score_index = get_const_int(attrs.score_index) id_index = get_const_int(attrs.id_index) invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) if return_indices: return topi_compute(inputs[0], inputs[1], inputs[2], max_output_size, iou_threshold, force_suppress, top_k, coord_start, score_index, id_index, return_indices, invalid_to_bottom) return [topi_compute(inputs[0], inputs[1], inputs[2], max_output_size, iou_threshold, force_suppress, top_k, coord_start, score_index, id_index, return_indices, invalid_to_bottom)]
def concatenate_shape_func(attrs, inputs, _): axis = get_const_int(attrs.axis) if axis < 0: axis += inputs[0].shape[0] return [_concatenate_shape_func(inputs, convert(axis))]
def _compute_get_valid_counts(attrs, inputs, out_type): score_threshold = get_const_float(attrs.score_threshold) id_index = get_const_int(attrs.id_index) score_index = get_const_int(attrs.score_index) return topi_compute(inputs[0], score_threshold, id_index, score_index)
def test_util(): x = tvm.tir.const(100, "int32") assert util.get_const_int(x) == 100 assert util.get_const_tuple((x, x)) == (100, 100)