def _schedule(cfg, s, data_vec, weight_vec, output):
    s[data_vec].parallel(s[data_vec].op.axis[0])
    s[weight_vec].parallel(s[weight_vec].op.axis[0])
    y, x = s[output].op.axis
    wb, db, k = s[output].op.reduce_axis

    yo, yi = cfg["tile_y"].apply(s, output, y)
    xo, xi = cfg["tile_x"].apply(s, output, x)
    ko, ki = cfg["tile_k"].apply(s, output, k)

    cfg["reorder_0"].apply(s, output, [yo, xo, ko, yi, wb, db, ki, xi])
    cfg["ann_reduce"].apply(s, output, [db, wb],
                            axis_lens=[get_const_int(db.dom.extent),
                                       get_const_int(wb.dom.extent)],
                            max_unroll=8,
                            cfg=cfg)
    cfg["ann_spatial"].apply(s, output, [yi, xi],
                             axis_lens=[cfg['tile_y'].size[-1],
                                        cfg['tile_x'].size[-1]],
                             max_unroll=8,
                             cfg=cfg)
    s[output].vectorize(xi)
    s[output].parallel(yo)
    return s
def _schedule_bitserial_conv2d_nhwc(cfg, s, data_q, data_pad, data_vec,
                                    kernel_q, kernel_vec,
                                    conv_out, output, last):
    # no stride and padding info here
    _, IH, IW, CI, IB = data_q.shape
    KH, KW, _, CO, KB = kernel_q.shape
    _, OH, OW, _ = output.shape
    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    ##### Schedule data padding and packing
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, h, _, _, _, _, _ = s[data_vec].op.axis
    cfg.define_split("tile_ah", cfg.axis(h), policy="all", num_outputs=2, max_factor=32)
    oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
    s[data_vec].parallel(oh)

    ##### Schedule kernel packing
    co, _, _, _, _, _ = s[kernel_vec].op.axis
    cfg.define_split("tile_bco", cfg.axis(co), policy="all", num_outputs=2, max_factor=32)
    oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
    s[kernel_vec].parallel(oco)

    ##### Schedule Convolution
    n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis
    dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis

    # s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)
    cfg["reorder_0"].apply(s, conv_out, [n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2])
    cfg["ann_reduce"].apply(s, conv_out, [b1, b2, dh, dw],
                            axis_lens=[get_const_int(b1.dom.extent),
                                       get_const_int(b2.dom.extent),
                                       get_const_int(dh.dom.extent),
                                       get_const_int(dw.dom.extent)],
                            max_unroll=16,
                            cfg=cfg)

    s[conv_out].unroll(b1)
    s[conv_out].unroll(b2)
    s[conv_out].vectorize(vc)

    # Schedule output
    n, h, w, co = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, oh, ow, co, vh, vw, vc)
    s[last].vectorize(vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    oho, iho = cfg["tile_oh"].apply(s, last, oh)  # reuse parameter
    s[last].parallel(oho)
    return s
def compute_argsort(attrs, inputs, _, target):
    """Compute definition of argsort"""
    axis = get_const_int(attrs.axis)
    is_ascend = bool(get_const_int(attrs.is_ascend))
    dtype = str(attrs.dtype)
    return [topi.argsort(inputs[0], None, axis=axis, is_ascend=is_ascend,
                         dtype=dtype, flag=False)]
def compute_multibox_transform_loc(attrs, inputs, _, target):
    """Compute definition of multibox_detection"""
    clip = bool(get_const_int(attrs.clip))
    threshold = get_const_float(attrs.threshold)
    variances = get_float_tuple(attrs.variances)
    return topi.vision.ssd.multibox_transform_loc(
        inputs[0], inputs[1], inputs[2], clip, threshold, variances)
def _compute_multibox_transform_loc(attrs, inputs, _):
    """Compute definition of multibox_detection"""
    clip = bool(get_const_int(attrs.clip))
    threshold = get_const_float(attrs.threshold)
    variances = get_float_tuple(attrs.variances)
    return topi_compute(inputs[0], inputs[1], inputs[2],
                        clip, threshold, variances)
def compute_conv2d(attrs, inputs, _):
    """Compute definition of conv2d"""
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    channels = attrs.get_int("channels")
    layout = attrs["layout"]
    assert layout in ("NCHW", "NHWC")

    (dilation_h, dilation_w) = dilation
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")
    elif dilation == (1, 1):
        kernel = inputs[1]
    elif layout == "NCHW":
        kernel = topi.nn.dilate(inputs[1], [1, 1, dilation_h, dilation_w])
    else:  # layout == "NHWC"
        kernel = topi.nn.dilate(inputs[1], [1, dilation_h, dilation_w, 1])

    if groups == 1:
        out = topi.nn.conv2d(inputs[0], kernel, strides, padding, layout)
    elif groups == get_const_int(inputs[0].shape[1]) and groups == channels:
        out = topi.nn.depthwise_conv2d_nchw(inputs[0], kernel, strides, padding)
    else:
        raise ValueError("not support arbitrary group number for now")

    if attrs.get_bool("use_bias"):
        bias = inputs[2]
        expand_axis = 1 if layout == "NCHW" else 0
        bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
        out = topi.broadcast_add(out, bias)
    return out
def _callback(op):
    if op.tag == "sparse_dense_bsrmm":
        y_bsrmm = op.input_tensors[0]
        w_indptr = y_bsrmm.op.input_tensors[0]
        assert y_bsrmm.op.tag == "sparse_dense_bsrmm_block"
        y_reshape = op
        (m, num_blocks, b_r) = s[y_bsrmm].op.axis
        bs_r = get_const_int(b_r.dom.extent)
        (elem_idx, c) = s[y_bsrmm].op.reduce_axis
        (m_o, n_o) = s[y_reshape].op.axis
        s[y_reshape].bind(m_o, te.thread_axis("blockIdx.x"))
        s[y_reshape].bind(n_o, te.thread_axis("blockIdx.y"))
        s[y_bsrmm].compute_at(s[y_reshape], n_o)

        thread_x = te.thread_axis("threadIdx.x")

        cfg.define_split("tile_c", c, num_outputs=2)
        co, ci = cfg["tile_c"].apply(s, y_bsrmm, c)
        y_bsrmm_factored = s.rfactor(y_bsrmm, ci)
        tx = s[y_bsrmm].op.reduce_axis[0]
        s[y_bsrmm].bind(tx, thread_x)
        s[y_bsrmm_factored].compute_at(s[y_bsrmm], tx)
        s[y_bsrmm].set_store_predicate(thread_x.var.equal(0))
        s[y_reshape].set_store_predicate(thread_x.var.equal(0))
def compute_conv2d(attrs, inputs, _):
    """Compute definition of conv2d"""
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    channels = attrs.get_int("channels")
    layout = attrs["layout"]
    kernel_layout = attrs["kernel_layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype

    assert layout in ["NCHW", "NHWC", "NCHW4c"]
    (dilation_h, dilation_w) = dilation
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
        # pylint: disable=assignment-from-no-return
        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
                             dilation, layout, out_dtype)
        # pylint: enable=assignment-from-no-return
    elif groups == 1:
        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
                             dilation, layout, out_dtype)
    elif layout == "NCHW" and \
            groups == get_const_int(inputs[0].shape[1]) and \
            groups == channels:
        out = topi.nn.depthwise_conv2d_nchw(inputs[0], inputs[1], strides,
                                            padding, dilation, out_dtype)
    elif layout in ["NCHW", "NCHW4c"]:
        out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding,
                                        dilation, groups, out_dtype)
    elif layout == "NHWC" and \
            kernel_layout == "HWOI" and \
            groups == get_const_int(inputs[0].shape[3]) and \
            groups == channels:
        out = topi.nn.depthwise_conv2d_nhwc(inputs[0], inputs[1], strides,
                                            padding, dilation, out_dtype)
    else:
        raise ValueError("not support arbitrary group number for now")

    if attrs.get_bool("use_bias"):
        bias = inputs[2]
        expand_axis = 1 if layout in ["NCHW", "NCHW4c"] else 0
        bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
        out = topi.add(out, bias)
    return out
def compute_conv2d(attrs, inputs, out_type, target):
    """Compute definition of conv2d"""
    padding = get_const_tuple(attrs.padding)
    strides = get_const_tuple(attrs.strides)
    dilation = get_const_tuple(attrs.dilation)
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    out_dtype = attrs.out_dtype
    out_dtype = (inputs[0].dtype if out_dtype in ("same", "")
                 else out_dtype)

    assert layout in ["NCHW", "NHWC", "NCHW4c"]
    (dilation_h, dilation_w) = dilation
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
                             dilation, layout, out_dtype=out_dtype)
    elif layout == "NCHW" and \
            kernel_layout == "OIHW" and \
            get_const_int(inputs[1].shape[0]) == groups and \
            get_const_int(inputs[1].shape[1]) == 1:
        out = topi.nn.depthwise_conv2d_nchw(inputs[0], inputs[1], strides,
                                            padding, dilation, out_dtype=out_dtype)
    elif layout == "NHWC" and \
            kernel_layout == "HWOI" and \
            get_const_int(inputs[1].shape[2]) == groups and \
            get_const_int(inputs[1].shape[3]) == 1:
        out = topi.nn.depthwise_conv2d_nhwc(inputs[0], inputs[1], strides,
                                            padding, dilation, out_dtype=out_dtype)
    else:
        raise ValueError("not support arbitrary group number for now")
    return [out]
def compute_conv2d(attrs, inputs, _):
    """Compute definition of conv2d"""
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    channels = attrs.get_int("channels")
    layout = attrs["layout"]
    kernel_layout = attrs["kernel_layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype

    assert layout in ["NCHW", "NHWC", "NCHW4c"]
    (dilation_h, dilation_w) = dilation
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
        # pylint: disable=assignment-from-no-return
        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
                             dilation, layout, out_dtype=out_dtype)
        # pylint: enable=assignment-from-no-return
    elif groups == 1:
        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
                             dilation, layout, out_dtype=out_dtype)
    elif layout == "NCHW" and \
            groups == get_const_int(inputs[0].shape[1]) and \
            groups == channels:
        out = topi.nn.depthwise_conv2d_nchw(inputs[0], inputs[1], strides,
                                            padding, dilation, out_dtype=out_dtype)
    elif layout in ["NCHW", "NCHW4c"]:
        out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding,
                                        dilation, groups, out_dtype=out_dtype)
    elif layout == "NHWC" and \
            kernel_layout == "HWOI" and \
            groups == get_const_int(inputs[0].shape[3]) and \
            groups == channels:
        out = topi.nn.depthwise_conv2d_nhwc(inputs[0], inputs[1], strides,
                                            padding, dilation, out_dtype=out_dtype)
    else:
        raise ValueError("not support arbitrary group number for now")

    if attrs.get_bool("use_bias"):
        bias = inputs[2]
        expand_axis = 1 if layout in ["NCHW", "NCHW4c"] else 0
        bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
        out = topi.add(out, bias)
    return out
def _compute_multibox_prior(attrs, inputs, _):
    """Compute definition of multibox_prior"""
    sizes = get_float_tuple(attrs.sizes)
    ratios = get_float_tuple(attrs.ratios)
    steps = get_float_tuple(attrs.steps)
    offsets = get_float_tuple(attrs.offsets)
    clip = bool(get_const_int(attrs.clip))
    return [topi_compute(inputs[0], sizes, ratios, steps, offsets, clip)]
def compute_decode_BBox(attrs, inputs, _, target):
    """Compute definition of decode_BBox"""
    bbox_mean = get_float_tuple(attrs.bbox_mean)
    bbox_std = get_float_tuple(attrs.bbox_std)
    class_agnostic = bool(get_const_int(attrs.class_agnostic))
    with target:
        return [topi.vision.rcnn.decode_BBox(inputs[0], inputs[1], inputs[2],
                                             bbox_mean, bbox_std,
                                             class_agnostic)]
def split_shape_func(attrs, inputs, _):
    """Shape function for split op."""
    if isinstance(attrs.indices_or_sections, (int, tvm.tir.IntImm)):
        indices_or_sections = get_const_int(attrs.indices_or_sections)
    else:
        indices_or_sections = get_const_tuple(attrs.indices_or_sections)

    axis = get_const_int(attrs.axis)

    num_out = indices_or_sections if isinstance(indices_or_sections, int) \
        else len(indices_or_sections) + 1
    if isinstance(indices_or_sections, int):
        indices_or_sections = [indices_or_sections]
    return [
        _split_shape_func(inputs[0], convert(i), convert(indices_or_sections),
                          convert(axis))
        for i in range(num_out)
    ]
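# NOTE (illustration, not part of the original source): the `num_out` rule
# above follows NumPy split semantics -- an integer means "split into N equal
# sections", while a sequence of cut points yields len(points) + 1 pieces.
# A minimal plain-Python sketch of that rule; the helper name is hypothetical.
def _num_split_outputs(indices_or_sections):
    if isinstance(indices_or_sections, int):
        return indices_or_sections
    return len(indices_or_sections) + 1

assert _num_split_outputs(4) == 4        # 4 equal sections
assert _num_split_outputs((2, 5)) == 3   # cuts at 2 and 5 -> 3 pieces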
def decl_winograd(data, U, stride, padding, out_dtype):
    """declare winograd fast convolution F(2x2, 3x3) for conv2d"""
    N, C, H, W = [util.get_const_int(x) for x in data.shape]
    _, _, C, K = [util.get_const_int(x) for x in U.shape]
    HPAD, WPAD = 1, 1
    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride

    assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    m = 2
    r = 3
    alpha = m + r - 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    # pack input tile
    input_tile = tvm.compute(
        (C, P, alpha, alpha),
        lambda c, b, eps, nu: tvm.select(
            b < P,
            data_pad[b // (nH * nW)][c][b // nW % nH * m + eps][b % nW * m + nu],
            tvm.const(0, data_pad.dtype)),
        name='d')

    V = decl_V_minimal(input_tile, alpha, C, P)

    # batch gemm
    c = tvm.reduce_axis((0, C), name='c')
    M = tvm.compute(
        (alpha, alpha, K, P),
        lambda eps, nu, k, b: tvm.sum(U[eps][nu][c][k] * V[eps][nu][c][b], axis=c),
        name='M')

    # inverse transform and unpack
    output = decl_output_minimal(M, N, K, H, W, P, m, nH, nW)
    return output
def compute_multibox_prior(attrs, inputs, _, target):
    """Compute definition of multibox_prior"""
    sizes = get_float_tuple(attrs.sizes)
    ratios = get_float_tuple(attrs.ratios)
    steps = get_float_tuple(attrs.steps)
    offsets = get_float_tuple(attrs.offsets)
    clip = bool(get_const_int(attrs.clip))
    return [topi.vision.ssd.multibox_prior(inputs[0], sizes, ratios, steps,
                                           offsets, clip)]
def _compute_nms(attrs, inputs, out_type):
    return_indices = bool(get_const_int(attrs.return_indices))
    max_output_size = get_const_int(attrs.max_output_size)
    iou_threshold = get_const_float(attrs.iou_threshold)
    force_suppress = bool(get_const_int(attrs.force_suppress))
    top_k = get_const_int(attrs.top_k)
    coord_start = get_const_int(attrs.coord_start)
    score_index = get_const_int(attrs.score_index)
    id_index = get_const_int(attrs.id_index)
    invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom))
    return [topi_compute(inputs[0], inputs[1], max_output_size, iou_threshold,
                         force_suppress, top_k, coord_start, score_index,
                         id_index, return_indices, invalid_to_bottom)]
def _callback(op):
    if op.tag == "sparse_dense_bsrmm":
        y_bsrmm = op.input_tensors[0]
        assert y_bsrmm.op.tag == "sparse_dense_bsrmm_block"
        y_reshape = op
        (m, num_blocks, b_r) = s[y_bsrmm].op.axis
        bs_r = get_const_int(b_r.dom.extent)
        (elem_idx, c) = s[y_bsrmm].op.reduce_axis
        s[y_reshape].bind(s[y_reshape].op.axis[0], te.thread_axis("blockIdx.x"))
        s[y_reshape].bind(s[y_reshape].op.axis[1], te.thread_axis("threadIdx.x"))
        s[y_bsrmm].compute_at(s[y_reshape], s[y_reshape].op.axis[1])
def _compute_proposal(attrs, inputs, out_type):
    scales = get_float_tuple(attrs.scales)
    ratios = get_float_tuple(attrs.ratios)
    feature_stride = attrs.feature_stride
    threshold = attrs.threshold
    rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n
    rpn_post_nms_top_n = attrs.rpn_post_nms_top_n
    rpn_min_size = attrs.rpn_min_size
    iou_loss = bool(get_const_int(attrs.iou_loss))
    return [topi_compute(inputs[0], inputs[1], inputs[2], scales, ratios,
                         feature_stride, threshold, rpn_pre_nms_top_n,
                         rpn_post_nms_top_n, rpn_min_size, iou_loss)]
def take_shape_func(attrs, inputs, out_ndims):
    """Shape function for take op."""
    if attrs.axis is None:
        return [_take_no_axis_shape_func(inputs[1], out_ndims[0])]
    axis = get_const_int(attrs.axis)
    data_ndim = int(inputs[0].shape[0])
    if axis < 0:
        axis += data_ndim
    assert 0 <= axis < data_ndim
    return [_take_with_axis_shape_func(*inputs, convert(axis), out_ndims[0])]
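# NOTE (illustration, not part of the original source): the negative-axis
# handling above mirrors NumPy: axis += ndim, then bounds-checked. A tiny
# standalone sketch with a hypothetical helper name:
def _normalize_axis(axis, ndim):
    if axis < 0:
        axis += ndim
    assert 0 <= axis < ndim
    return axis

assert _normalize_axis(-1, 3) == 2   # last axis of a rank-3 tensor
assert _normalize_axis(0, 3) == 0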
def compute_conv2d(attrs, inputs, out_type, target):
    """Compute definition of conv2d"""
    padding = get_const_tuple(attrs.padding)
    strides = get_const_tuple(attrs.strides)
    dilation = get_const_tuple(attrs.dilation)
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    out_dtype = attrs.out_dtype
    out_dtype = (inputs[0].dtype if out_dtype in ("same", "")
                 else out_dtype)

    assert layout in ["NCHW", "NHWC", "NCHW4c"]
    (dilation_h, dilation_w) = dilation
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
                             dilation, layout, out_dtype=out_dtype)
    elif layout == "NCHW" and \
            get_const_int(inputs[1].shape[0]) == groups and \
            get_const_int(inputs[1].shape[1]) == 1:
        out = topi.nn.depthwise_conv2d_nchw(inputs[0], inputs[1], strides,
                                            padding, dilation, out_dtype=out_dtype)
    elif layout == "NHWC" and \
            kernel_layout == "HWOI" and \
            get_const_int(inputs[1].shape[2]) == groups and \
            get_const_int(inputs[1].shape[3]) == 1:
        out = topi.nn.depthwise_conv2d_nhwc(inputs[0], inputs[1], strides,
                                            padding, dilation, out_dtype=out_dtype)
    elif layout in ['NCHW', 'NCHW4c']:
        out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding,
                                        dilation, groups, out_dtype=out_dtype)
    else:
        raise ValueError("not support arbitrary group number for now")
    return [out]
def compute_nms(attrs, inputs, _, target):
    """Compute definition of nms"""
    return_indices = bool(get_const_int(attrs.return_indices))
    max_output_size = get_const_int(attrs.max_output_size)
    iou_threshold = get_const_float(attrs.iou_threshold)
    force_suppress = bool(get_const_int(attrs.force_suppress))
    top_k = get_const_int(attrs.top_k)
    coord_start = get_const_int(attrs.coord_start)
    score_index = get_const_int(attrs.score_index)
    id_index = get_const_int(attrs.id_index)
    invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom))
    return [topi.vision.non_max_suppression(inputs[0], inputs[1],
                                            max_output_size, iou_threshold,
                                            force_suppress, top_k, coord_start,
                                            score_index, id_index,
                                            return_indices, invalid_to_bottom)]
def bitpack(data, bits, pack_type="int8", name="bitpack"):
    """Packs the lowest dimension of a tensor into the format needed by VTA.

    Parameters
    ----------
    data : Tensor
        The input tensor; its last dimension is packed.
    bits : int
        Number of bits each element occupies after packing.
    pack_type : str
        Data type of the packed words ("int8", "int16" or "int32").
    name : str
        Name of the resulting compute op.

    Returns
    -------
    packed : Tensor
        The packed tensor.
    """
    shape_vec = list(data.shape)
    if pack_type == 'int8':
        data_width = 8
    elif pack_type == 'int16':
        data_width = 16
    elif pack_type == 'int32':
        data_width = 32
    else:
        raise RuntimeError("Unknown pack type %s" % pack_type)
    assert data_width % bits == 0
    lanes = data_width // bits

    # Data must be in multiples of the data_width
    assert util.get_const_int(shape_vec[-1]) % lanes == 0, "Not a multiple of word size"
    shape_vec[-1] = shape_vec[-1] // lanes
    oshape = tuple(shape_vec)

    def _bitpack(*indices):
        ret = None
        mask = tvm.const((1 << bits) - 1, pack_type)
        for k in range(lanes):
            idx = list(indices)
            idx[-1] = idx[-1] * lanes + k
            elem = data(*idx).astype(pack_type)
            if k == 0:
                ret = elem & mask
            else:
                val = (elem & mask) << tvm.const(k * bits, pack_type)
                ret = ret | val
        return ret

    return tvm.compute(oshape, _bitpack, name=name, tag='bitpack')
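# NOTE (illustration, not part of the original source): a plain-Python model of
# the packing loop in `_bitpack`, useful as a sanity check on the lane
# arithmetic. With bits=2 and an int8 word, lanes = 8 // 2 = 4, and element k
# of each group is shifted left by k * bits. The helper name is hypothetical.
def _bitpack_reference(values, bits, data_width=8):
    lanes = data_width // bits
    assert len(values) % lanes == 0
    mask = (1 << bits) - 1
    words = []
    for w in range(len(values) // lanes):
        word = 0
        for k in range(lanes):
            word |= (values[w * lanes + k] & mask) << (k * bits)
        words.append(word)
    return words

# Four 2-bit values pack into one int8 word: 1 | 2<<2 | 3<<4 | 0<<6 = 57.
assert _bitpack_reference([1, 2, 3, 0], bits=2) == [0b00111001]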
def compute_proposal(attrs, inputs, _, target):
    """Compute definition of proposal"""
    scales = get_float_tuple(attrs.scales)
    ratios = get_float_tuple(attrs.ratios)
    feature_stride = attrs.feature_stride
    threshold = attrs.threshold
    rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n
    rpn_post_nms_top_n = attrs.rpn_post_nms_top_n
    rpn_min_size = attrs.rpn_min_size
    iou_loss = bool(get_const_int(attrs.iou_loss))
    with target:
        return [topi.vision.rcnn.proposal(inputs[0], inputs[1], inputs[2],
                                          scales, ratios, feature_stride,
                                          threshold, rpn_pre_nms_top_n,
                                          rpn_post_nms_top_n, rpn_min_size,
                                          iou_loss)]
def compute_conv2d(attrs, inputs, _):
    """Compute definition of conv2d"""
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    channels = attrs.get_int("channels")
    layout = attrs["layout"]
    assert layout == "NCHW", "only support nchw for now"
    assert dilation == (1, 1), "not support dilate now"

    if groups == 1:
        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding)
    elif groups == get_const_int(inputs[0].shape[1]) and groups == channels:
        out = topi.nn.depthwise_conv2d_nchw(inputs[0], inputs[1], strides, padding)
    else:
        raise ValueError("not support arbitrary group number for now")

    if attrs.get_bool("use_bias"):
        bias = inputs[2]
        bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
        out = topi.broadcast_add(out, bias)
    return out
def concatenate_shape_func(attrs, inputs, _):
    axis = get_const_int(attrs.axis)
    if axis < 0:
        axis += inputs[0].shape[0]
    return [_concatenate_shape_func(inputs, convert(axis))]
def test_util():
    x = tvm.const(100)
    assert util.get_const_int(x) == 100
    assert util.get_const_tuple((x, x)) == (100, 100)
def decl_winograd(data, U, stride, padding, out_dtype):
    """declare winograd fast convolution F(2x2, 3x3) for conv2d"""
    N, C, H, W = [util.get_const_int(x) for x in data.shape]
    _, _, C, K = [util.get_const_int(x) for x in U.shape]
    HPAD, WPAD = 1, 1
    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride

    assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    B_data = np.array(
        [[1, 0, 0, 0],
         [0, 1, -1, 1],
         [-1, 1, 1, 0],
         [0, 0, 0, -1]], out_dtype)

    A_data = np.array(
        [[1, 0],
         [1, 1],
         [1, -1],
         [0, -1]], out_dtype)

    m = 2
    r = 3
    alpha = m + r - 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    # pack input tile
    input_tile = tvm.compute(
        (C, P, alpha, alpha),
        lambda c, b, eps, nu: tvm.select(
            b < P,
            data_pad[b // (nH * nW)][c][b // nW % nH * m + eps][b % nW * m + nu],
            tvm.const(0, data_pad.dtype)),
        name='d')

    # transform image
    B = const_array(B_data, 'B')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    V = tvm.compute(
        (alpha, alpha, C, P),
        lambda eps, nu, c, b: tvm.sum(
            input_tile[c][b][r_eps][r_nu] * B[r_eps][eps] * B[r_nu][nu],
            axis=[r_eps, r_nu]),
        name='V')

    # batch gemm
    c = tvm.reduce_axis((0, C), name='c')
    M = tvm.compute(
        (alpha, alpha, K, P),
        lambda eps, nu, k, b: tvm.sum(U[eps][nu][c][k] * V[eps][nu][c][b], axis=c),
        name='M')

    # inverse transform and unpack
    A = const_array(A_data, 'A')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    output = tvm.compute(
        (N, K, H, W),
        lambda n, k, h, w: tvm.sum(
            M[r_eps][r_nu][k][n * nH * nW + (h // m) * nW + w // m]
            * A[r_eps][h % m] * A[r_nu][w % m],
            axis=[r_eps, r_nu]),
        name='output')
    return output
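# NOTE (illustration, not part of the original source): the tile bookkeeping
# in decl_winograd for F(2x2, 3x3), checked with made-up sizes. Each output
# tile covers an m x m patch, and the transform works on
# alpha x alpha = (m + r - 1)^2 input windows.
m_, r_ = 2, 3
alpha_ = m_ + r_ - 1                           # 4x4 transformed tiles
N_, H_, W_ = 8, 56, 56                         # illustrative batch and spatial size
nH_, nW_ = (H_ + m_ - 1) // m_, (W_ + m_ - 1) // m_
P_ = N_ * nH_ * nW_                            # total number of tiles
assert (alpha_, nH_, nW_, P_) == (4, 28, 28, 6272)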
def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec,
                                  kernel, kernel_q, kernel_vec,
                                  conv_out, output, last):
    IB, _, CI, IH, IW = data_q.shape
    KB, CO, _, KH, KW = kernel_q.shape
    _, _, OH, OW = output.shape

    # Infer padding and stride
    if data_pad is None:
        padding = (0, 0)
        TH, TW = IH, IW
    else:
        _, _, _, TH, TW = data_pad.shape
        hpad = get_const_int((TH - IH) // 2)
        wpad = get_const_int((TW - IW) // 2)
        padding = (hpad, wpad)

    hstride = get_const_int((TH - KH) // (OH - 1))
    wstride = get_const_int((TW - KW) // (OW - 1))
    stride = (hstride, wstride)

    wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NCHW")
    sch = _get_schedule(wkl, "NCHW")
    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    ba = sch.ba
    bc = sch.bc

    CC = s.cache_write(conv_out, "global")

    n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis
    s[conv_out].vectorize(vc)

    s[CC].compute_at(s[conv_out], ow)
    n, co, oh, ow, vh, vw, vc = s[CC].op.axis
    ci, dh, dw, b1, b2 = s[CC].op.reduce_axis
    s[CC].reorder(ci, dh, vh, dw, vw, b1, b2, vc)
    s[CC].unroll(b1)
    s[CC].unroll(b2)
    s[CC].vectorize(vc)

    ##### Schedule A
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, h, _, _, _, _, vw = s[data_vec].op.axis
    s[data_vec].vectorize(vw)
    if ba == 1:
        oaxis = h
        paxis = h
    else:
        oh, ih = s[data_vec].split(h, ba)
        oaxis = oh
        paxis = ih

    s[data_vec].parallel(paxis)
    s[data_vec].pragma(oaxis, "parallel_launch_point")
    s[data_vec].pragma(paxis, "parallel_stride_pattern")
    s[data_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule B
    co, _, _, _, _, vc = s[kernel_vec].op.axis
    s[kernel_vec].vectorize(vc)
    if bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[kernel_vec].split(co, bc)
        oaxis = oco
        paxis = ico

    s[kernel_vec].parallel(paxis)
    s[kernel_vec].pragma(oaxis, "parallel_launch_point")
    s[kernel_vec].pragma(paxis, "parallel_stride_pattern")
    s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule C
    n, co, h, w = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, co, oh, ow, vh, vw, vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    if bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[last].split(co, bc)
        oaxis = oco
        paxis = ico

    s[last].parallel(paxis)
    s[last].pragma(oaxis, "parallel_launch_point")
    s[last].pragma(paxis, "parallel_stride_pattern")
    s[last].pragma(oaxis, "parallel_barrier_when_finish")
    return s
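# NOTE (illustration, not part of the original source): the shape-based
# inference above recovers padding as half the padded-size increase, and
# stride from how far a KHxKW kernel must step to produce OH output rows.
# A numeric check with illustrative values:
ih_, kh_, oh_ = 56, 3, 28                 # 3x3 conv, stride 2, pad 1
th_ = ih_ + 2                             # padded height, as read from data_pad.shape
hpad_ = (th_ - ih_) // 2                  # -> 1
hstride_ = (th_ - kh_) // (oh_ - 1)       # -> 55 // 27 = 2
assert (hpad_, hstride_) == (1, 2)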
def _schedule_bitserial_conv2d_nhwc(cfg, s, data_q, data_pad, data_vec,
                                    kernel_q, kernel_vec,
                                    conv_out, output, last):
    # no stride and padding info here
    _, IH, IW, CI, IB = data_q.shape
    KH, KW, _, CO, KB = kernel_q.shape
    _, OH, OW, _ = output.shape
    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    ##### Schedule data padding and packing
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, h, _, _, _, _, _ = s[data_vec].op.axis
    cfg.define_split("tile_ah", cfg.axis(h), num_outputs=2, max_factor=32)
    oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
    s[data_vec].parallel(oh)

    ##### Schedule kernel packing
    co, _, _, _, _, _ = s[kernel_vec].op.axis
    cfg.define_split("tile_bco", cfg.axis(co), num_outputs=2, max_factor=32)
    oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
    s[kernel_vec].parallel(oco)

    ##### Schedule Convolution
    n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis
    dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis

    # s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)
    cfg["reorder_0"].apply(s, conv_out, [n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2])
    cfg["ann_reduce"].apply(s, conv_out, [b1, b2, dh, dw],
                            axis_lens=[get_const_int(b1.dom.extent),
                                       get_const_int(b2.dom.extent),
                                       get_const_int(dh.dom.extent),
                                       get_const_int(dw.dom.extent)],
                            max_unroll=16,
                            cfg=cfg)

    s[conv_out].unroll(b1)
    s[conv_out].unroll(b2)
    s[conv_out].vectorize(vc)

    # Schedule output
    n, h, w, co = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, oh, ow, co, vh, vw, vc)
    s[last].vectorize(vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    oho, iho = cfg["tile_oh"].apply(s, last, oh)  # reuse parameter
    s[last].parallel(oho)
    return s
def _compute_get_valid_counts(attrs, inputs, out_type):
    score_threshold = get_const_float(attrs.score_threshold)
    id_index = get_const_int(attrs.id_index)
    score_index = get_const_int(attrs.score_index)
    return topi_compute(inputs[0], score_threshold, id_index, score_index)
def test_util():
    x = tvm.const(100, "int32")
    assert util.get_const_int(x) == 100
    assert util.get_const_tuple((x, x)) == (100, 100)
def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec,
                                  kernel, kernel_q, kernel_vec,
                                  conv_out, output, last):
    _, IH, IW, CI, IB = data_q.shape
    KH, KW, _, CO, KB = kernel_q.shape
    _, OH, OW, _ = output.shape

    # Infer padding and stride
    if data_pad is None:
        padding = (0, 0)
        TH, TW = IH, IW
    else:
        _, TH, TW, _, _ = data_pad.shape
        hpad = get_const_int((TH - IH) // 2)
        wpad = get_const_int((TW - IW) // 2)
        padding = (hpad, wpad)

    hstride = get_const_int((TH - KH) // (OH - 1))
    wstride = get_const_int((TW - KW) // (OW - 1))
    stride = (hstride, wstride)

    wkl = _get_workload(data, kernel, stride, padding, last.dtype, "NHWC")
    sch = _get_schedule(wkl, "NHWC")
    VH = sch.vh
    VW = sch.vw
    VC = sch.vc
    ba = sch.ba
    bc = sch.bc

    ##### Schedule data packing
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, h, _, _, _, _, _ = s[data_vec].op.axis
    if ba == 1:
        oaxis = h
        paxis = h
    else:
        oh, ih = s[data_vec].split(h, ba)
        oaxis = oh
        paxis = ih
    s[data_vec].parallel(paxis)
    s[data_vec].pragma(oaxis, "parallel_launch_point")
    s[data_vec].pragma(paxis, "parallel_stride_pattern")
    s[data_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule kernel packing
    co, _, _, _, _, _ = s[kernel_vec].op.axis
    if bc == 1:
        oaxis = co
        paxis = co
    else:
        oco, ico = s[kernel_vec].split(co, bc)
        oaxis = oco
        paxis = ico
    s[kernel_vec].parallel(paxis)
    s[kernel_vec].pragma(oaxis, "parallel_launch_point")
    s[kernel_vec].pragma(paxis, "parallel_stride_pattern")
    s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule Convolution
    n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis
    dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis
    s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)
    s[conv_out].unroll(b1)
    s[conv_out].unroll(b2)
    s[conv_out].vectorize(vc)

    # Schedule output
    n, h, w, co = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, oh, ow, co, vh, vw, vc)
    s[last].vectorize(vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    if bc == 1:
        oaxis = oh
        paxis = oh
    else:
        oho, iho = s[last].split(oh, bc)
        oaxis = oho
        paxis = iho
    s[last].parallel(paxis)
    s[last].pragma(oaxis, "parallel_launch_point")
    s[last].pragma(paxis, "parallel_stride_pattern")
    s[last].pragma(oaxis, "parallel_barrier_when_finish")
    return s
def decl_output_transform(cfg, X, M, VK, VP):
    N = get_const_int(X.shape[0])
    IH = get_const_int(X.shape[2])
    IW = get_const_int(X.shape[3])
    # M has layout (K // VK, P // VP, alpha, alpha, VK, VP), per the indexing
    # in Y below, so alpha is the extent of dimension 2.
    alpha = get_const_int(M.shape[2])
    K = get_const_int(M.shape[0]) * get_const_int(M.shape[4])
    P = get_const_int(M.shape[1]) * get_const_int(M.shape[5])

    # inverse transform
    A = const_matrix(A_data, 'A')
    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
    Y = tvm.compute(
        (K // VK, P // VP, m, m, VK, VP),
        lambda k, b, vh, vw, kk, bb: tvm.sum(
            M[k][b][r_eps][r_nu][kk][bb] * A[r_eps][vh] * A[r_nu][vw],
            axis=[r_eps, r_nu]),
        name='Y')

    OH = get_const_int((IH + 2 * HPAD - 3) // HSTR + 1)
    OW = get_const_int((IW + 2 * WPAD - 3) // WSTR + 1)
    nH, nW = get_const_int((OH + m - 1) // m), get_const_int((OW + m - 1) // m)

    # unpack output
    def _output(n, k, h, w):
        k_elem = k % VK
        k_tile = k // VK
        b = n * nH * nW + h // m * nW + w // m
        b_elem = b % VP
        b_tile = b // VP
        return Y[k_tile][b_tile][h % m][w % m][k_elem][b_elem]

    output = tvm.compute((N, K, OH, OW), _output, name='output',
                         tag='winograd_conv_output')
    return output
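# NOTE (illustration, not part of the original source): the output-size
# formula above is the usual conv arithmetic OH = (IH + 2*pad - K)//stride + 1
# specialized to a 3x3 kernel. A quick check with illustrative values:
ih_, pad_, stride_ = 56, 1, 1
oh_ = (ih_ + 2 * pad_ - 3) // stride_ + 1
assert oh_ == 56   # stride-1 "same" padding preserves the spatial size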
def _schedule_bitserial_conv2d_nchw(cfg, s, data_q, data_pad, data_vec,
                                    kernel_q, kernel_vec,
                                    conv_out, output, last):
    IB, _, CI, IH, IW = data_q.shape
    KB, CO, _, KH, KW = kernel_q.shape
    _, _, OH, OW = output.shape

    # Infer padding and stride
    if data_pad is None:
        padding = (0, 0)
        TH, TW = IH, IW
    else:
        _, _, _, TH, TW = data_pad.shape
        hpad = get_const_int((TH - IH) // 2)
        wpad = get_const_int((TW - IW) // 2)
        padding = (hpad, wpad)

    hstride = get_const_int((TH - KH) // (OH - 1))
    wstride = get_const_int((TW - KW) // (OW - 1))
    stride = (hstride, wstride)

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    ##### Schedule data padding and bitpacking
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, _, h, _, _, _, _ = s[data_vec].op.axis
    cfg.define_split("tile_ah", cfg.axis(h), policy="all", num_outputs=2, max_factor=32)
    oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
    if cfg["tile_ah"].size[1] == 1:
        oaxis = oh
        paxis = oh
    else:
        oaxis = oh
        paxis = ih

    s[data_vec].parallel(paxis)
    s[data_vec].pragma(oaxis, "parallel_launch_point")
    s[data_vec].pragma(paxis, "parallel_stride_pattern")
    s[data_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule kernel bitpacking
    co, _, _, _, _, _ = s[kernel_vec].op.axis
    cfg.define_split("tile_bco", cfg.axis(co), policy="all", num_outputs=2, max_factor=32)
    oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
    if cfg["tile_bco"].size[1] == 1:
        oaxis = oco
        paxis = oco
    else:
        oaxis = oco
        paxis = ico

    s[kernel_vec].parallel(paxis)
    s[kernel_vec].pragma(oaxis, "parallel_launch_point")
    s[kernel_vec].pragma(paxis, "parallel_stride_pattern")
    s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule Convolution
    n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis
    ci, dh, dw, ib, kb = s[conv_out].op.reduce_axis

    # s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)
    cfg["reorder_0"].apply(s, conv_out, [n, co, oh, ow, vc, vh, vw, dh, dw, kb, ib, ci])
    cfg["ann_reduce"].apply(s, conv_out, [kb, ib, dh, dw],
                            axis_lens=[get_const_int(kb.dom.extent),
                                       get_const_int(ib.dom.extent),
                                       get_const_int(dh.dom.extent),
                                       get_const_int(dw.dom.extent)],
                            max_unroll=16,
                            cfg=cfg)

    s[conv_out].vectorize(vc)

    # Schedule output
    n, co, h, w = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, co, oh, ow, vh, vw, vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    oco, ico = cfg["tile_oh"].apply(s, last, co)  # reuse parameter
    if cfg["tile_oh"].size[1] == 1:
        oaxis = oco
        paxis = oco
    else:
        # `co` is already split by the apply() above; reuse those axes rather
        # than re-splitting (the flattened original referenced an undefined
        # `bc` here).
        oaxis = oco
        paxis = ico
    s[last].parallel(oco)
    return s
def decl_output_transform_minimal(cfg, X, M, VK, VP):
    def compute_A_T_dot_M(k, b, eps, nu, kk, bb):
        temp_expr = {}
        for j in range(alpha):
            m1_add_m2 = M[k][b][1][j][kk][bb] + M[k][b][2][j][kk][bb]
            m1_sub_m2 = M[k][b][1][j][kk][bb] - M[k][b][2][j][kk][bb]
            m3_add_m4 = M[k][b][3][j][kk][bb] + M[k][b][4][j][kk][bb]
            m3_sub_m4 = M[k][b][3][j][kk][bb] - M[k][b][4][j][kk][bb]
            m5_add_m6 = M[k][b][5][j][kk][bb] + M[k][b][6][j][kk][bb]
            m5_sub_m6 = M[k][b][5][j][kk][bb] - M[k][b][6][j][kk][bb]
            s0 = M[k][b][0][j][kk][bb] + m1_add_m2
            s5 = M[k][b][7][j][kk][bb] + m1_sub_m2
            s1 = m1_sub_m2 + m5_sub_m6 * 16
            s4 = m1_add_m2 + m3_add_m4 * 16
            s2 = m1_add_m2 + 8 * m5_add_m6
            s3 = m1_sub_m2 + 8 * m3_sub_m4
            s0 = s0 + m5_add_m6 * 32
            s5 = s5 + m3_sub_m4 * 32
            s1 = s1 + m3_sub_m4 * 2
            s4 = s4 + m5_add_m6 * 2
            s0 = s0 + m3_add_m4
            s5 = s5 + m5_sub_m6
            s2 = s2 + m3_add_m4 * 4
            s3 = s3 + m5_sub_m6 * 4
            temp_expr[(0, j)] = s0
            temp_expr[(1, j)] = s1
            temp_expr[(2, j)] = s2
            temp_expr[(3, j)] = s3
            temp_expr[(4, j)] = s4
            temp_expr[(5, j)] = s5

        now = tvm.const(0.0, "float32")
        for ii in range(m):
            for jj in range(alpha):
                now = tvm.select(tvm.all(eps == ii, nu == jj),
                                 temp_expr[(ii, jj)],
                                 now)
        return now

    N = get_const_int(X.shape[0])
    IH = get_const_int(X.shape[2])
    IW = get_const_int(X.shape[3])
    # M has layout (K // VK, P // VP, alpha, alpha, VK, VP), per the indexing
    # above, so alpha is the extent of dimension 2.
    alpha = get_const_int(M.shape[2])
    K = get_const_int(M.shape[0]) * get_const_int(M.shape[4])
    P = get_const_int(M.shape[1]) * get_const_int(M.shape[5])

    A_T_dot_M = tvm.compute((K // VK, P // VP, m, alpha, VK, VP),
                            compute_A_T_dot_M,
                            name="A_T_dot_M")

    def compute_X_dot_A(k, b, eps, nu, kk, bb):
        temp_expr = {}
        for i in range(m):
            m1_add_m2 = A_T_dot_M[k][b][i][1][kk][bb] + A_T_dot_M[k][b][i][2][kk][bb]
            m1_sub_m2 = A_T_dot_M[k][b][i][1][kk][bb] - A_T_dot_M[k][b][i][2][kk][bb]
            m3_add_m4 = A_T_dot_M[k][b][i][3][kk][bb] + A_T_dot_M[k][b][i][4][kk][bb]
            m3_sub_m4 = A_T_dot_M[k][b][i][3][kk][bb] - A_T_dot_M[k][b][i][4][kk][bb]
            m5_add_m6 = A_T_dot_M[k][b][i][5][kk][bb] + A_T_dot_M[k][b][i][6][kk][bb]
            m5_sub_m6 = A_T_dot_M[k][b][i][5][kk][bb] - A_T_dot_M[k][b][i][6][kk][bb]
            s0 = A_T_dot_M[k][b][i][0][kk][bb] + m1_add_m2
            s5 = A_T_dot_M[k][b][i][7][kk][bb] + m1_sub_m2
            s1 = m1_sub_m2 + m5_sub_m6 * 16
            s4 = m1_add_m2 + m3_add_m4 * 16
            s2 = m1_add_m2 + 8 * m5_add_m6
            s3 = m1_sub_m2 + 8 * m3_sub_m4
            s0 = s0 + m5_add_m6 * 32
            s5 = s5 + m3_sub_m4 * 32
            s1 = s1 + m3_sub_m4 * 2
            s4 = s4 + m5_add_m6 * 2
            s0 = s0 + m3_add_m4
            s5 = s5 + m5_sub_m6
            s2 = s2 + m3_add_m4 * 4
            s3 = s3 + m5_sub_m6 * 4
            temp_expr[(i, 0)] = s0
            temp_expr[(i, 1)] = s1
            temp_expr[(i, 2)] = s2
            temp_expr[(i, 3)] = s3
            temp_expr[(i, 4)] = s4
            temp_expr[(i, 5)] = s5

        now = tvm.const(0.0, "float32")
        for ii in range(m):
            for jj in range(m):
                now = tvm.select(tvm.all(eps == ii, nu == jj),
                                 temp_expr[(ii, jj)],
                                 now)
        return now

    Y = tvm.compute((K // VK, P // VP, m, m, VK, VP),
                    compute_X_dot_A,
                    name="Y")

    OH = get_const_int((IH + 2 * HPAD - 3) // HSTR + 1)
    OW = get_const_int((IW + 2 * WPAD - 3) // WSTR + 1)
    nH, nW = get_const_int((OH + m - 1) // m), get_const_int((OW + m - 1) // m)

    # unpack output
    def _output(n, k, h, w):
        k_elem = k % VK
        k_tile = k // VK
        b = n * nH * nW + h // m * nW + w // m
        b_elem = b % VP
        b_tile = b // VP
        return Y[k_tile][b_tile][h % m][w % m][k_elem][b_elem]

    output = tvm.compute((N, K, OH, OW), _output, name='output',
                         tag='winograd_conv_output')
    return output
def schedule_winograd(cfg, output, VK=6, VP=8):
    s = tvm.create_schedule(output.op)
    if not cfg:
        return s
    if output.name == "Y":
        Y = output
    else:
        Y = output.op.input_tensors[0]
    A_T_dot_M = Y.op.input_tensors[0]
    M = A_T_dot_M.op.input_tensors[0]
    U, V = M.op.input_tensors
    B_T_dot_X = V.op.input_tensors[0]
    # input_tile = B_T_dot_X.op.input_tensors[0]
    # data_pad = input_tile.op.input_tensors[0]  # padding

    UNROLL = cfg['unroll'].val
    VECTORIZE = cfg['vectorize'].val
    TENSORIZE = cfg['tensorize'].val

    # if cfg['data_pad_inline'].val:
    #     s[data_pad].compute_inline()

    ## pack input tiles
    # (b, c, eps, nu, bb) = input_tile.op.axis
    # if cfg['input_tile_REORDER_C'].val:
    #     s[input_tile].reorder(b, eps, nu, c, bb)
    # if UNROLL:
    #     [s[input_tile].unroll(ax) for ax in [eps, nu]]
    # if VECTORIZE:
    #     s[input_tile].vectorize(bb)
    # if autotvm.GLOBAL_SCOPE.in_tuning:
    #     s[input_tile].pragma(b, 'debug_skip_region')
    #     s[data_pad].pragma(data_pad.op.axis[0], 'debug_skip_region')
    #     s[input_tile].compute_inline()

    # transform kernel
    # if isinstance(U.op, tvm.tensor.ComputeOp):
    #     kernel, G = U.op.input_tensors
    #     if isinstance(kernel.op, tvm.tensor.ComputeOp):
    #         s[kernel].compute_inline()
    #     s[G].compute_inline()
    #     k, eps, nu, c, kk, = s[U].op.axis
    #     # r_kh, r_kw = s[U].op.reduce_axis
    #     # s[U].reorder(k, c, eps, nu, r_kh, r_kw, kk)
    #     # s[U].unroll(eps)
    #     # s[U].unroll(nu)
    #     # s[U].unroll(r_kh)
    #     # s[U].unroll(r_kw)
    #     # s[U].vectorize(kk)
    #     if autotvm.GLOBAL_SCOPE.in_tuning:
    #         # kernel transformation will be pre-computed during compilation, so we skip
    #         # this part to make tuning records correct
    #         s[U].pragma(k, 'debug_skip_region')

    # if autotvm.GLOBAL_SCOPE.in_tuning:
    #     # kernel transformation will be pre-computed during compilation, so we skip
    #     # this part to make tuning records correct
    #     s[output].pragma(s[output].axis[0], 'debug_skip_region')

    (k, b, eps, nu, kk, bb) = A_T_dot_M.op.axis
    s[A_T_dot_M].reorder(b, k, eps, nu, kk, bb)
    if UNROLL:
        [s[A_T_dot_M].unroll(ax) for ax in [eps, nu, kk]]
    if VECTORIZE:
        s[A_T_dot_M].vectorize(bb)
    if cfg['M_COMPUTE_AT'].val:
        s[M].compute_at(s[A_T_dot_M], b)

    (k, b, eps, nu, kk, bb) = Y.op.axis
    s[Y].reorder(b, k, eps, nu, kk, bb)
    if UNROLL:
        [s[Y].unroll(ax) for ax in [eps, nu, kk]]
    if VECTORIZE:
        s[Y].vectorize(bb)
    if cfg['A_T_dot_M_COMPUTE_AT'].val:
        s[A_T_dot_M].compute_at(s[Y], b)

    # Schedule V
    (b, c, eps, nu, bb) = B_T_dot_X.op.axis
    if UNROLL:
        [s[B_T_dot_X].unroll(ax) for ax in [eps, nu]]
    if VECTORIZE:
        s[B_T_dot_X].vectorize(bb)
    # if cfg['B_T_dot_X_REORDER_C'].val:
    #     s[B_T_dot_X].reorder(b, eps, nu, c, bb)
    # if cfg['input_tile_COMPUTE_AT'].val:
    #     s[input_tile].compute_at(s[B_T_dot_X], b)

    (b, eps, nu, c, bb) = V.op.axis
    if UNROLL:
        [s[V].unroll(ax) for ax in [eps, nu]]
    if VECTORIZE:
        s[V].vectorize(bb)
    if cfg['V_REORDER_C'].val:
        s[V].reorder(b, eps, nu, c, bb)
    if cfg['B_T_dot_X_COMPUTE_AT'].val:
        s[B_T_dot_X].compute_at(s[V], b)

    (k, b, eps, nu, kk, bb) = M.op.axis
    if cfg['V_COMPUTE_AT'].val:
        s[V].compute_at(s[M], b)
    s[M].reorder(b, k, eps, nu, kk, bb)
    K = get_const_int(M.op.reduce_axis[0].dom.extent)
    s[M].tensorize(kk, intrin_gemm(M=VK, N=VP, K=K))
    return s
def _compute_argsort(attrs, inputs, _):
    axis = get_const_int(attrs.axis)
    is_ascend = bool(get_const_int(attrs.is_ascend))
    dtype = attrs.dtype
    return [topi_compute(inputs[0], axis=axis, is_ascend=is_ascend, dtype=dtype)]
def _schedule_bitserial_conv2d_nchw(cfg, s, data_q, data_pad, data_vec,
                                    kernel_q, kernel_vec,
                                    conv_out, output, last):
    IB, _, CI, IH, IW = data_q.shape
    KB, CO, _, KH, KW = kernel_q.shape
    _, _, OH, OW = output.shape

    # Infer padding and stride
    if data_pad is None:
        padding = (0, 0)
        TH, TW = IH, IW
    else:
        _, _, _, TH, TW = data_pad.shape
        hpad = get_const_int((TH - IH) // 2)
        wpad = get_const_int((TW - IW) // 2)
        padding = (hpad, wpad)

    hstride = get_const_int((TH - KH) // (OH - 1))
    wstride = get_const_int((TW - KW) // (OW - 1))
    stride = (hstride, wstride)

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    ##### Schedule data padding and bitpacking
    if data_pad is not None:
        s[data_pad].compute_inline()

    _, _, h, _, _, _, _ = s[data_vec].op.axis
    cfg.define_split("tile_ah", cfg.axis(h), num_outputs=2, max_factor=32)
    oh, ih = cfg["tile_ah"].apply(s, data_vec, h)
    if cfg["tile_ah"].size[1] == 1:
        oaxis = oh
        paxis = oh
    else:
        oaxis = oh
        paxis = ih

    s[data_vec].parallel(paxis)
    s[data_vec].pragma(oaxis, "parallel_launch_point")
    s[data_vec].pragma(paxis, "parallel_stride_pattern")
    s[data_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule kernel bitpacking
    co, _, _, _, _, _ = s[kernel_vec].op.axis
    cfg.define_split("tile_bco", cfg.axis(co), num_outputs=2, max_factor=32)
    oco, ico = cfg["tile_bco"].apply(s, kernel_vec, co)
    if cfg["tile_bco"].size[1] == 1:
        oaxis = oco
        paxis = oco
    else:
        oaxis = oco
        paxis = ico

    s[kernel_vec].parallel(paxis)
    s[kernel_vec].pragma(oaxis, "parallel_launch_point")
    s[kernel_vec].pragma(paxis, "parallel_stride_pattern")
    s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish")

    ##### Schedule Convolution
    n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis
    ci, dh, dw, ib, kb = s[conv_out].op.reduce_axis

    # s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2)
    cfg["reorder_0"].apply(s, conv_out, [n, co, oh, ow, vc, vh, vw, dh, dw, kb, ib, ci])
    cfg["ann_reduce"].apply(s, conv_out, [kb, ib, dh, dw],
                            axis_lens=[get_const_int(kb.dom.extent),
                                       get_const_int(ib.dom.extent),
                                       get_const_int(dh.dom.extent),
                                       get_const_int(dw.dom.extent)],
                            max_unroll=16,
                            cfg=cfg)

    s[conv_out].vectorize(vc)

    # Schedule output
    n, co, h, w = s[last].op.axis
    co, vc = s[last].split(co, VC)
    oh, ow, vh, vw = s[last].tile(h, w, VH, VW)
    s[last].reorder(n, co, oh, ow, vh, vw, vc)
    if last != output:
        s[output].compute_inline()
    s[conv_out].compute_at(s[last], ow)

    oco, ico = cfg["tile_oh"].apply(s, last, co)  # reuse parameter
    if cfg["tile_oh"].size[1] == 1:
        oaxis = oco
        paxis = oco
    else:
        # `co` is already split by the apply() above; reuse those axes rather
        # than re-splitting (the flattened original referenced an undefined
        # `bc` here).
        oaxis = oco
        paxis = ico
    s[last].parallel(oco)
    return s
def concatenate_shape_func(attrs, inputs, _):
    axis = get_const_int(attrs.axis)
    return [_concatenate_shape_func(inputs, convert(axis))]