def conv_bn1_run(fmap_shape, filter_shape, pad, stride, dilation, use_bias=False, attrs=None):
    """Build and launch conv_bn1, then check the conv output and its batch-norm moments against NumPy."""
    vc_util.convolution_format_check(fmap_shape, filter_shape, pad, stride, dilation)
    if use_bias:
        raise ValueError("do not support bias yet !!!")

    conv_dtype = 'float16'
    conv_param = {'stride': stride, 'pad': pad, 'dilation': dilation}
    stride, pad, dilation = conv_param_prepare(conv_param)
    fm_shape, w_shape, out_shape = conv_shape_4d(fmap_shape, filter_shape, pad, stride, dilation)
    IN, IC, IH, IW = fm_shape
    WN, WC, WH, WW = w_shape
    C0 = 16

    input_shape = [(IN, IC // C0, IH, IW, C0),
                   (WC // C0 * WH * WW, WN // 16, 16, C0)]
    mod = utils.op_build_test(conv_bn1.conv_bn1, [input_shape], [conv_dtype],
                              op_attrs=[fmap_shape, filter_shape, pad, stride, dilation, use_bias, attrs],
                              kernel_name='conv_bn1', attrs=attrs)

    fmap_data, filter_data, bias_data, conv_expect = \
        gen_data(fmap_shape, filter_shape, pad, stride, dilation, use_bias)

    axes = (0, 2, 3)
    conv_mean = np.mean(conv_expect, axis=axes, keepdims=True)
    conv_square = np.power(conv_expect, 2)
    conv_var_part = np.mean(conv_square, axis=axes, keepdims=True)

    expects = (conv_expect, conv_var_part, conv_mean)
    out_datas = [np.full(e.shape, 0, 'float16') for e in expects]
    out_datas[1] = out_datas[1].astype(np.float32)
    out_datas[2] = out_datas[2].astype(np.float32)

    in_data = [fmap_data, filter_data]
    args = in_data
    for out in out_datas:
        args.append(out)
    args = tuple(args)

    outputs = utils.mod_launch(mod, args, outputs=(-3, -2, -1), expect=expects)
    rtol, atol = get_rtol_atol("conv_bn1", conv_dtype)
    cmp_res = list(map(lambda x, y: compare_tensor(x, y, rtol=rtol, atol=atol), outputs, expects))
    return (fmap_data, filter_data, bias_data), outputs, expects, all(cmp_res)
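
# --- Hedged sketch (not part of the original test): how the two extra outputs of
# conv_bn1 relate to batch-norm statistics. conv_var_part is E[x^2] and conv_mean is
# E[x], both reduced over the (N, H, W) axes of the NC1HWC0 output, so the per-channel
# variance can be recovered on the host as E[x^2] - E[x]^2. Pure NumPy, no Ascend
# toolchain required; the sample shape below is an assumption for illustration.
def _sketch_bn1_moments():
    import numpy as np
    conv_out = np.random.rand(1, 2, 8, 8, 16).astype(np.float16)    # sample NC1HWC0 tensor
    axes = (0, 2, 3)
    mean = np.mean(conv_out.astype(np.float32), axis=axes, keepdims=True)
    var_part = np.mean(np.square(conv_out.astype(np.float32)), axis=axes, keepdims=True)
    variance = var_part - np.square(mean)                            # E[x^2] - E[x]^2
    ref_var = np.var(conv_out.astype(np.float32), axis=axes, keepdims=True)
    assert np.allclose(variance, ref_var, rtol=1e-5, atol=1e-5)
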
def conv_backprop_filter(data, fmap_shape, filter_shape, pad_, stride_, dilation_, attrs=None):
    """
    Computes the filter gradient (dw) of the corresponding forward convolution.

    Args:
        data (list[tvm.tensor.Tensor]): list of length 2.
            data[0] (dy): Tensor of type float16, 5D shape (out_n, out_c//C0, out_h, out_w, C0).
            data[1] (x): Tensor of type float16, 5D shape (fN, fC//C0, fH, fW, C0).
        fmap_shape (list[int]): [fN, fC, fH, fW].
        filter_shape (list[int]): [wN, wC, wH, wW].
        pad_ (list[int]): [pad_top, pad_bottom, pad_left, pad_right].
        stride_ (list[int]): [stride_h, stride_w].
        dilation_ (list[int]): [dilation_h, dilation_w].
        attrs (dict): optional attributes such as conv_tile and bypass.

    Returns:
        tuple: the dw tensor (tvm.tensor.Tensor) and the tiling configs.
    """
    if len(data) != 2:
        raise IndexError("data contains output tensor and feature map tensor")

    vc_util.convolution_format_check(fmap_shape, filter_shape, pad_, stride_, dilation_)

    block_size = 16

    in_n, in_c, in_h, in_w = fmap_shape
    cout, _, w_h, w_w = filter_shape
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size

    pad_top, pad_bottom, pad_left, pad_right = pad_
    stride_h, stride_w = stride_
    dilation_h, dilation_w = dilation_
    if dilation_h != 1 or dilation_w != 1:
        raise ValueError("The value of elements in dilation must be 1")

    out_n = in_n
    out_c = cout
    out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1

    dy_shape = (out_n, out_c, out_h, out_w)
    dx_shape = (in_n, in_c, in_h, in_w)
    dw_shape = (cout, in_c, w_h, w_w)

    key = gen_key(fmap_shape, filter_shape, pad_, stride_, dilation_)
    res_c, configs = conv_backprop_filter_compute(data, dx_shape, dw_shape, dy_shape, pad_, stride_, dilation_,
                                                  block_size=block_size, attrs=attrs, key=key)

    return res_c, configs
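
# --- Hedged sketch (illustrative only): the shape bookkeeping conv_backprop_filter
# performs before calling conv_backprop_filter_compute. Channels are rounded up to a
# multiple of the 16-element block, and dy's spatial size follows the usual
# out = (in + pad_before + pad_after - kernel) // stride + 1 rule. The sample numbers
# are assumptions, not shapes taken from the original test suite.
def _sketch_dw_shapes(fmap_shape=(1, 32, 28, 28), filter_shape=(64, 32, 3, 3),
                      pad_=(1, 1, 1, 1), stride_=(1, 1)):
    block_size = 16
    in_n, in_c, in_h, in_w = fmap_shape
    cout, _, w_h, w_w = filter_shape
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size
    pad_top, pad_bottom, pad_left, pad_right = pad_
    stride_h, stride_w = stride_
    out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1
    dy_shape = (in_n, cout, out_h, out_w)     # gradient w.r.t. the conv output
    dw_shape = (cout, in_c, w_h, w_w)         # gradient w.r.t. the filter
    return dy_shape, dw_shape                 # ((1, 64, 28, 28), (64, 32, 3, 3)) here
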
def conv_core(data, fmap_shape, filter_shape, pad, stride, dilation, use_bias=False, attrs=None):
    """core computation for op conv."""
    if use_bias:
        if len(data) != 3:
            raise IndexError("data should contain 3 tensors, i.e. feature map, filter and bias")
        if data[2].dtype != "float16":
            raise TypeError("data type of bias should be float16")
    else:
        if len(data) != 2:
            raise IndexError("data should contain 2 tensors, i.e. feature map and filter")
    if data[0].dtype != "float16":
        raise TypeError("data type of feature map should be float16")
    if data[1].dtype != "float16":
        raise TypeError("data type of filter should be float16")
    if not isinstance(use_bias, bool):
        raise TypeError("use_bias should be set as False or True")

    all_dynamic = 0      # kh kw pad stride
    partial_dynamic = 0  # fn fc1 fh fw wN wC
    if attrs is None:
        attrs = {}
    if attrs.get("dynamic"):
        all_dynamic = 1
    if attrs.get("partial_dynamic"):
        partial_dynamic = 1
    dynamic = partial_dynamic or all_dynamic
    dynamic_tiling = 1 if attrs.get("dynamic") else 0
    use_autotiling = 1 if dynamic and not dynamic_tiling else 0

    block_size = 16

    if not dynamic:
        vc_util.convolution_format_check(fmap_shape, filter_shape, pad, stride, dilation)
        for tmp_data in data:
            shape = [x.value for x in tmp_data.shape]
            vc_util.check_shape(shape)
        vc_util.check_shape(fmap_shape)
        vc_util.check_shape(filter_shape)

    stride_len = 2
    pad_len = 4
    dilation_len = 2
    zero = 0
    max_s = 63
    max_d = 255

    if isinstance(stride, int):
        stride = [stride] * stride_len
    elif isinstance(stride, (list, tuple)) and len(stride) == 1:  # only has one element
        stride = list(stride) * stride_len
    elif isinstance(stride, (list, tuple)) and len(stride) == stride_len:
        pass
    else:
        raise IndexError("stride para illegal !!!")
    if not dynamic:
        for val in stride:
            if val <= zero:
                raise ValueError("elements in stride should be greater than Zero !!!")
            if val > max_s:
                raise ValueError("elements in stride should be less than 64 !!!")

    if isinstance(pad, int):
        pad = [pad] * pad_len
    elif isinstance(pad, (list, tuple)) and len(pad) == 1:  # only has one element
        pad = list(pad) * pad_len
    elif isinstance(pad, (list, tuple)) and len(pad) == pad_len:
        pass
    else:
        raise IndexError("pad para illegal !!!")
    if not dynamic:
        for val in pad:
            if val < zero:
                raise ValueError("elements in pad should not be less than Zero !!!")
            if val > max_d:
                raise ValueError("elements in pad should be less than 256 !!!")

    if isinstance(dilation, int):
        dilation = [dilation] * dilation_len
    elif isinstance(dilation, (list, tuple)) and len(dilation) == 1:  # only has one element
        dilation = list(dilation) * dilation_len
    elif isinstance(dilation, (list, tuple)) and len(dilation) == dilation_len:
        pass
    else:
        raise IndexError("dilation para illegal !!!")
    for val in dilation:
        if val <= zero:
            raise ValueError("elements in dilation should be greater than Zero !!!")
        if val > max_d:
            raise ValueError("elements in dilation should be less than 256 !!!")

    if len(stride) != stride_len or len(pad) != pad_len or len(dilation) != dilation_len:
        raise IndexError(" shape of parameters must be as expected")

    block_size_sub_one = block_size - 1

    # input shape (NCHW -> NC1HWC0)
    in_n, in_c, in_h, in_w = fmap_shape
    in_c = (in_c + block_size_sub_one) // block_size * block_size

    # kernel shape (NCHW -> NC1HWC0 -> Fractal)
    k_n, k_c, k_h, k_w = filter_shape
    k_c = (k_c + block_size_sub_one) // block_size * block_size
    k_n = (k_n + block_size_sub_one) // block_size * block_size

    # padding (padding_top, padding_bottom, padding_left, padding_right)
    p_top, p_bottom, p_left, p_right = pad

    # stride (stride_h, stride_w)
    s_h, s_w = stride
    k_h_real = k_h
    k_w_real = k_w
    p_top_real = p_top
    p_bottom_real = p_bottom
    p_left_real = p_left
    p_right_real = p_right
    s_h_real = s_h
    s_w_real = s_w
    if dynamic_tiling:
        # replace the static values with the symbolic placeholders used for dynamic tiling
        k_h = k_h_fake
        k_w = k_w_fake
        p_top = p_top_fake
        p_bottom = p_bottom_fake
        p_left = p_left_fake
        p_right = p_right_fake
        s_h = s_h_fake
        s_w = s_w_fake

    # dilation (dilation_h, dilation_w)
    d_h, d_w = dilation

    # tiling
    key = []
    key.append(tuple(fmap_shape))
    key.append(tuple(filter_shape))
    key.append(tuple(pad))
    key.append(tuple(stride))
    key.append(tuple(dilation))
    key.append(use_bias)
    hash_key = str(tuple(key))

    k_w_d = (k_w - 1) * d_w + 1
    out_w = (in_w + p_left + p_right - k_w_d) // s_w + 1

    bypass_list = [0, 1]
    bypass = 0 if dynamic else 1

    # (NC1HWC0)
    a_value = data[0]
    # (fractal)
    b_value = data[1]

    setdim_map = conv_set_dim_map
    conv_tile_num = 5
    if attrs is not None and "conv_tile" in attrs and len(attrs["conv_tile"]) >= conv_tile_num:
        use_autotiling = 0
        tile_hh = attrs["conv_tile"][0]
        tile_coco = attrs["conv_tile"][1]
        tile_mm = attrs["conv_tile"][2]
        tile_kk = attrs["conv_tile"][3]
        tile_nn = attrs["conv_tile"][4]
        if len(attrs["conv_tile"]) > conv_tile_num:
            tile_ww = attrs["conv_tile"][conv_tile_num]
        else:
            tile_ww = (out_w - 1) * s_w + k_w_d
        if "bypass" in attrs:
            bypass = attrs["bypass"]
    elif hash_key in setdim_map:
        configs = setdim_map[hash_key]
        if isinstance(configs, tuple):
            tiles = configs[0]
            if "bypass" in configs[1]:
                bypass = configs[1]["bypass"]
        else:
            tiles = configs
        if len(tiles) > conv_tile_num:
            tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww = tiles
        else:
            tile_hh, tile_coco, tile_mm, tile_kk, tile_nn = tiles
            tile_ww = (out_w - 1) * s_w + k_w_d
    else:
        win_cut_h = 1
        k_h_d = (k_h - 1) * d_h + 1
        win_h = (in_h + p_top + p_bottom - k_h_d) // s_h + 1
        if not dynamic:
            while win_cut_h <= win_h:
                if (((win_h + win_cut_h - 1) // win_cut_h - 1) * win_cut_h - 1) * s_h + k_h_d <= in_h + p_top:
                    break
                win_cut_h += 1
        tile_hh = (win_cut_h - 1) * s_h + k_h_d
        tile_ww = (out_w - 1) * s_w + k_w_d
        tile_coco = block_size
        tile_mm = block_size
        tile_kk = block_size
        tile_nn = block_size

    if bypass not in bypass_list:
        raise ValueError("bypass of conv only supports %s" % (", ".join(str(x) for x in bypass_list)))

    if tile_hh == in_h:
        tile_hh += p_top + p_bottom
    if tile_ww == in_w:
        tile_ww += p_left + p_right
    tile_coco = (tile_coco + block_size_sub_one) // block_size * block_size
    tile_mm = (tile_mm + block_size_sub_one) // block_size * block_size
    tile_kk = (tile_kk + block_size_sub_one) // block_size * block_size
    tile_nn = (tile_nn + block_size_sub_one) // block_size * block_size

    input_shape_nc1hwc0 = get_shape(data[0])
    if not dynamic and input_shape_nc1hwc0 != [in_n, in_c // block_size, in_h, in_w, block_size]:
        raise ValueError("feature map tensor data[0] shape illegal !!!")
    in_n, c1_in, in_h, in_w, _ = input_shape_nc1hwc0

    if not dynamic:
        kernel_shape_nc1hwc0 = (k_n, k_c // block_size, k_h, k_w, block_size)
    else:
        kernel_shape_nc1hwc0 = (k_n, c1_in, k_h, k_w, block_size)  # simplify for dynamic case
    k_n, k_c1, k_h, k_w, k_c0 = kernel_shape_nc1hwc0

    kernel_shape_fractal = get_shape(data[1])
    if not dynamic and kernel_shape_fractal != [k_c1 * k_h * k_w, k_n // block_size, block_size, k_c0]:
        raise ValueError("filter tensor data[1] shape illegal !!!")

    if use_bias:
        bias_value = data[2]
        bias_name = bias_value.op.name
        bias_shape = [x.value for x in data[2].shape]
        if bias_shape != [1, k_n // block_size, 1, 1, block_size]:
            raise ValueError("bias tensor data[2] shape illegal !!!")
    else:
        bias_name = "None"
        bias_value = None
    # Create reduction variables
    kc1 = akg.tvm.reduce_axis((0, k_c1), name="kc1")
    kh = akg.tvm.reduce_axis((0, k_h), name="kh")
    kw = akg.tvm.reduce_axis((0, k_w), name="kw")
    kc0 = akg.tvm.reduce_axis((0, k_c0), name="kc0")

    k_h_d = (k_h - 1) * d_h + 1
    k_h_d_real = (k_h_real - 1) * d_h + 1
    k_w_d = (k_w - 1) * d_w + 1
    k_w_d_real = (k_w_real - 1) * d_w + 1
    out_h = (in_h + p_top + p_bottom - k_h_d) // s_h + 1
    tile_out_h = (tile_hh - k_h_d) // s_h + 1
    tile_out_h_real = (tile_hh - k_h_d_real) // s_h_real + 1
    out_w = (in_w + p_left + p_right - k_w_d) // s_w + 1
    tile_out_w = (tile_ww - k_w_d) // s_w + 1
    tile_out_w_real = (tile_ww - k_w_d_real) // s_w_real + 1

    if not dynamic:
        out_shape_nc1hwc0 = (in_n, k_n // block_size, out_h, out_w, block_size)
    else:
        _, c1_out, _, _ = data[1].shape
        out_shape_nc1hwc0 = (in_n, c1_out, out_h, out_w, block_size)
    _, out_c1, out_h, out_w, _ = out_shape_nc1hwc0

    if tile_coco > 0:
        c1_cut = tile_coco // block_size
    else:
        c1_cut = out_c1

    # Compute the convolution
    output_name = "output0"
    conv_attr = {
        "pragma_conv_kernel_n": k_n,
        "pragma_conv_kernel_h": k_h,
        "pragma_conv_kernel_w": k_w,
        "pragma_conv_padding_top": p_top,
        "pragma_conv_padding_bottom": p_bottom,
        "pragma_conv_padding_left": p_left,
        "pragma_conv_padding_right": p_right,
        "pragma_conv_bypass_l1": bypass,
        "pragma_conv_stride_h": s_h,
        "pragma_conv_stride_w": s_w,
        "pragma_conv_dilation_h": d_h,
        "pragma_conv_dilation_w": d_w,
        "pragma_conv_fm_n": in_n,
        "pragma_conv_fm_c": in_c,
        "pragma_conv_fm_h": in_h,
        "pragma_conv_fm_w": in_w,
        "feature": a_value.op.name,
        "filter": b_value.op.name,
        "bias": bias_name,
        "res": output_name
    }

    if dynamic_tiling:
        conv_attr["pragma_conv_h_cut"] = (tile_out_h_fake - 1) * s_h + k_h_d
        conv_attr["pragma_conv_w_cut"] = (tile_out_w_fake - 1) * s_w + k_w_d
        conv_attr["pragma_conv_co_cut"] = c1_cut_fake * 16
        conv_attr["pragma_conv_m_cut"] = m_cut_fake
        conv_attr["pragma_conv_k_cut"] = k_cut_fake
        conv_attr["pragma_conv_n_cut"] = n_cut_fake
        conv_attr["pragma_conv_tile_co"] = c1_cut
        conv_attr["pragma_conv_tile_ho"] = tile_out_h_real
        conv_attr["pragma_conv_tile_wo"] = tile_out_w_real
        conv_attr["pragma_conv_tile_mo"] = tile_mm // 16
        conv_attr["pragma_conv_tile_ko"] = tile_kk // 16
        conv_attr["pragma_conv_tile_no"] = tile_nn // 16
        conv_attr["pragma_conv_real_kh"] = k_h_real
        conv_attr["pragma_conv_real_kw"] = k_w_real
        conv_attr["pragma_conv_real_sh"] = s_h_real
        conv_attr["pragma_conv_real_sw"] = s_w_real
        conv_attr["pragma_conv_real_pt"] = p_top_real
        conv_attr["pragma_conv_real_pb"] = p_bottom_real
        conv_attr["pragma_conv_real_pl"] = p_left_real
        conv_attr["pragma_conv_real_pr"] = p_right_real
    elif not use_autotiling:
        conv_attr["pragma_conv_h_cut"] = (tile_out_h - 1) * s_h + k_h_d
        conv_attr["pragma_conv_w_cut"] = (tile_out_w - 1) * s_w + k_w_d
        conv_attr["pragma_conv_co_cut"] = c1_cut * k_c0
        conv_attr["pragma_conv_m_cut"] = tile_mm
        conv_attr["pragma_conv_k_cut"] = tile_kk
        conv_attr["pragma_conv_n_cut"] = tile_nn

    c_value = akg.tvm.compute(
        out_shape_nc1hwc0,
        lambda n, c1, h, w, c0: akg.lang.cce.mmad(
            (akg.tvm.if_then_else(
                akg.tvm.any((h * s_h + kh) < p_top,
                            (h * s_h + kh) > (in_h + p_top - 1),
                            (w * s_w + kw) < p_left,
                            (w * s_w + kw) > (in_w + p_left - 1)),
                akg.tvm.const(0.0, "float16"),
                a_value[n, kc1, (h * s_h + (kh * d_h) - p_top), (w * s_w + (kw * d_w) - p_left), kc0])
             * b_value[(kc1 * k_h + kh) * k_w + kw, c1, c0, kc0]).astype("float32"),
            axis=[kc1, kh, kw, kc0]),
        name=output_name,
        attrs=conv_attr)

    return c_value
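
# --- Hedged sketch (not from the original source): the NCHW -> NC1HWC0 relayout that
# conv_core assumes for its feature-map input data[0]. The channel axis is padded up to
# a multiple of block_size = 16 and split into (C1, C0). Pure NumPy; the helper name
# and the sample shape in the trailing comment are assumptions made for illustration.
def _sketch_nchw_to_nc1hwc0(x, block_size=16):
    import numpy as np
    n, c, h, w = x.shape
    c1 = (c + block_size - 1) // block_size
    padded = np.zeros((n, c1 * block_size, h, w), dtype=x.dtype)
    padded[:, :c, :, :] = x
    # (N, C1*C0, H, W) -> (N, C1, C0, H, W) -> (N, C1, H, W, C0)
    return padded.reshape(n, c1, block_size, h, w).transpose(0, 1, 3, 4, 2)

# Example: a (1, 20, 4, 4) NCHW tensor becomes (1, 2, 4, 4, 16) in NC1HWC0.
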
def conv_run(fmap_shape, filter_shape, pad, stride, dilation, use_bias=False, attrs=None, dump_data=False):
    """Build the conv kernel (static, partially dynamic or fully dynamic shape), launch it and compare with the expected result."""
    conv_dtype = 'float16'

    vc_util.convolution_format_check(fmap_shape, filter_shape, pad, stride, dilation)

    conv_param = {'stride': stride, 'pad': pad, 'dilation': dilation}
    stride, pad, dilation = conv_param_prepare(conv_param)
    fm_shape, w_shape, out_shape = conv_shape_4d(fmap_shape, filter_shape, pad, stride, dilation)
    IN, IC, IH, IW = fm_shape
    WN, WC, WH, WW = w_shape
    C0 = 16

    if use_bias:
        input_shape = [(IN, IC // C0, IH, IW, C0),
                       (WC // C0 * WH * WW, WN // 16, 16, C0),
                       (1, WN // 16, 1, 1, 16)]
    else:
        input_shape = [(IN, IC // C0, IH, IW, C0),
                       (WC // C0 * WH * WW, WN // 16, 16, C0)]

    input_file = os.environ.get("RANDOM_DATA_DISK_PATH", "")
    expect_file = input_file + "/" + gen_kernel_name(
        [input_shape], [conv_dtype],
        op_attrs=[fmap_shape, filter_shape, pad, stride, dilation, use_bias, attrs],
        kernel_name='conv', attrs=attrs) + ".bin"

    all_dynamic = 0      # kh kw pad stride
    partial_dynamic = 0  # fn fc1 fh fw wN wC
    if attrs.get("dynamic"):
        all_dynamic = 1
        print("=================all dynamic==================")
    if attrs.get("partial_dynamic"):
        partial_dynamic = 1
        print("=================partial dynamic==================")
    dynamic = partial_dynamic or all_dynamic
    if not dynamic:
        print("=================static shape==================")

    if dynamic:
        fmap_shape_real = fmap_shape
        filter_shape_real = filter_shape
        pad_real = pad
        stride_real = stride
        dilation_real = dilation

    if partial_dynamic or all_dynamic:
        N = tvm.var("N")
        C = tvm.var("CI")
        CI1 = tvm.var("CI1")
        H = tvm.var("H")
        W = tvm.var("W")
        COUT = tvm.var("CO")
        CO1 = tvm.var("CO1")
        _, _, KH, KW = filter_shape
        SH, SW = stride
        PT, PB, PL, PR = pad

    params = ()
    if all_dynamic:
        PARAM_KH = tvm.var("KH")
        PARAM_KW = tvm.var("KW")
        PARAM_PT = tvm.var("PT")
        PARAM_PB = tvm.var("PB")
        PARAM_PL = tvm.var("PL")
        PARAM_PR = tvm.var("PR")
        PARAM_SH = tvm.var("SH")
        PARAM_SW = tvm.var("SW")
        PARAM_T1_0_H = tvm.var("T1_0_H")
        PARAM_T1_0_W = tvm.var("T1_0_W")
        PARAM_T1_0_C1 = tvm.var("T1_0_C1")
        PARAM_T0_0_MO = tvm.var("T0_0_MO")
        PARAM_T0_0_NO = tvm.var("T0_0_NO")
        PARAM_T0_0_KO = tvm.var("T0_0_KO")
        params = (PARAM_KH, PARAM_KW, PARAM_PT, PARAM_PB, PARAM_PL, PARAM_PR, PARAM_SH, PARAM_SW,
                  PARAM_T1_0_H, PARAM_T1_0_W, PARAM_T1_0_C1, PARAM_T0_0_MO, PARAM_T0_0_NO, PARAM_T0_0_KO)

    DEBUG = 1
    if dynamic:
        KH_FAKE = 11
        KW_FAKE = 31
        fmap_shape = (N, C, H, W)
        filter_shape = (COUT, C, KH, KW)

        if not DEBUG:
            CO1 = (COUT + 15) // 16
            CI1 = (C + 15) // 16

        if use_bias:
            # input_shape = [(IN, IC // C0, IH, IW, C0), (WC // C0 * WH * WW, WN // 16, 16, C0), (1, WN // 16, 1, 1, 16)]
            if all_dynamic:
                input_shape = [(N, CI1, H, W, 16), (CI1 * KH_FAKE * KW_FAKE, CO1, 16, 16), (1, CO1, 1, 1, 16)]
            else:
                input_shape = [(N, CI1, H, W, 16), (CI1 * KH * KW, CO1, 16, 16), (1, CO1, 1, 1, 16)]
        else:
            # input_shape = [(IN, IC // C0, IH, IW, C0), (WC // C0 * WH * WW, WN // 16, 16, C0)]
            if all_dynamic:
                input_shape = [(N, CI1, H, W, 16), (CI1 * KH_FAKE * KW_FAKE, CO1, 16, 16)]
            else:
                input_shape = [(N, CI1, H, W, 16), (CI1 * KH * KW, CO1, 16, 16)]

        mod = utils.op_build_test(Conv, [input_shape], [conv_dtype],
                                  op_attrs=[fmap_shape, filter_shape, pad, stride, dilation, use_bias, attrs, params],
                                  kernel_name='conv', attrs=attrs)
        fmap_data, filter_data, bias_data, expect = gen_data(
            fmap_shape_real, filter_shape_real, pad_real, stride_real, dilation_real, use_bias, expect_file)
    else:
        mod = utils.op_build_test(Conv, [input_shape], [conv_dtype],
                                  op_attrs=[fmap_shape, filter_shape, pad, stride, dilation, use_bias, attrs],
                                  kernel_name='conv', attrs=attrs)
        fmap_data, filter_data, bias_data, expect = gen_data(
            fmap_shape, filter_shape, pad, stride, dilation, use_bias, expect_file)

    if dump_data:
        with open('input.bin', 'wb') as fo:
            fo.write(fmap_data.astype(np.float16, copy=False))
        with open('filter.bin', 'wb') as fo:
            fo.write(filter_data.astype(np.float16, copy=False))
        with open('bias.bin', 'wb') as fo:
            fo.write(bias_data.astype(np.float16, copy=False))
        with open('output.bin', 'wb') as fo:
            fo.write(expect.astype(np.float16, copy=False))

    out_data = np.full(expect.shape, np.nan, 'float16')
    if use_bias:
        input = [fmap_data, filter_data, bias_data]
    else:
        input = [fmap_data, filter_data]

    flag_w = os.environ.get("WRITE_TO_DISK", "No")
    if flag_w == "Yes":
        return input, out_data, expect, True

    if not dynamic:
        args = input
        args.append(out_data)
        args = tuple(args)
        out_data = utils.mod_launch(mod, args, expect=expect)
    else:
        args = []
        args.append(fmap_data)
        args.append(filter_data)
        args.append(out_data)
        if partial_dynamic or all_dynamic:
            args.append(IN)
            args.append(IC)
            args.append(IH)
            args.append(IW)
            args.append(WN)
        if all_dynamic:
            args.append(KH)
            args.append(KW)
            args.append(PT)
            args.append(PB)
            args.append(PL)
            args.append(PR)
            args.append(SH)
            args.append(SW)
            if attrs.get("conv_tile") and len(attrs["conv_tile"]) == 7:
                T1_0_H = attrs["conv_tile"][0]
                T1_0_C1 = attrs["conv_tile"][1]
                T0_0_MO = attrs["conv_tile"][2]
                T0_0_KO = attrs["conv_tile"][3]
                T0_0_NO = attrs["conv_tile"][4]
                T1_0_W = attrs["conv_tile"][5]
                if T1_0_H == IH:
                    T1_0_H += PT + PB
                T1_0_H_cut = (T1_0_H - KH) // SH + 1
                if T1_0_W == IW:
                    T1_0_W += PL + PR
                T1_0_W_cut = (T1_0_W - KW) // SW + 1
                args.append(T1_0_H_cut)
                args.append(T1_0_W_cut)
                args.append((T1_0_C1 + 15) // 16)
                args.append((T0_0_MO + 15) // 16)
                args.append((T0_0_NO + 15) // 16)
                args.append((T0_0_KO + 15) // 16)
        if DEBUG:
            args.append(IC // 16)
            args.append(WN // 16)
        block_dim = min(32, IN)
        args.append(block_dim)
        out_data = utils.mod_launch(mod, args, outputs=(2,), expect=expect)

    rtol, atol = get_rtol_atol("conv", conv_dtype)
    return input, out_data, expect, compare_tensor(out_data, expect, rtol=rtol, atol=atol, equal_nan=True)
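
# --- Hedged sketch (illustrative, not the library's own converter): the fractal filter
# layout conv_run feeds to the kernel, (Cin//16 * KH * KW, Cout//16, 16, 16). The ordering
# of the last two axes (out-channel-in-block, in-channel-in-block) is inferred from the
# indexing b_value[(kc1 * k_h + kh) * k_w + kw, c1, c0, kc0] in conv_core, so treat it as
# an assumption. Pure NumPy; channel counts are assumed to be multiples of 16 here.
def _sketch_filter_to_fractal(w_nchw, block_size=16):
    import numpy as np
    cout, cin, kh, kw = w_nchw.shape
    c1_in = (cin + block_size - 1) // block_size
    co1 = (cout + block_size - 1) // block_size
    frac = np.zeros((c1_in * kh * kw, co1, block_size, block_size), dtype=w_nchw.dtype)
    for n in range(cout):
        for c in range(cin):
            for hh in range(kh):
                for ww in range(kw):
                    row = (c // block_size * kh + hh) * kw + ww
                    frac[row, n // block_size, n % block_size, c % block_size] = w_nchw[n, c, hh, ww]
    return frac
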
def _get_space_conv(op_desc: ConvDesc):
    """get config space of convolution"""
    if not isinstance(op_desc, ConvDesc):
        raise TypeError('op_desc must be ConvDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)

    config_space = ListConfigSpace(ConvConfig)

    # if double buffering is not enabled, keep size_scale at 1
    size_scale = 1
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2

    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    padding = (pad_[0], pad_[1], pad_[2], pad_[3])
    p_top, p_bottom, p_left, p_right = padding
    s_h, s_w = stride_

    in_c = ((in_c - 1) // 16 + 1) * 16
    tile_c = in_c
    tile_co_start = 16

    data_len = 2

    h_max = in_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    bypass_options = [0, 1]
    for bypass in bypass_options:
        for tile_h in range(h_max, k_h - 1, -s_h):
            size_h = tile_h
            if tile_h == h_max:
                w_range = range(w_max, k_w - 1, -s_w)
                size_h = in_h
            else:
                w_range = [w_max]
            win_tile_h = (tile_h - k_h) // s_h + 1
            h_tiles = (win_h + win_tile_h - 1) // win_tile_h
            if h_tiles == 2:
                size_h = max(tile_h - p_top, in_h + p_top - tile_h + k_h - s_h)
            for tile_w in w_range:
                size_w = tile_w
                if size_w == w_max:
                    size_w = in_w
                else:
                    win_tile_w = (tile_w - k_w) // s_w + 1
                    w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                    if w_tiles == 2:
                        size_w = max(tile_w - p_left, in_w + p_left - tile_w + k_w - s_w)
                k_n_ = ((k_n - 1) // 16 + 1) * 16
                co_range = range(k_n_, tile_co_start - 1, -16)
                for tile_co in co_range:
                    if bypass == 1:
                        if tile_co != k_n:
                            continue
                        l1_size = data_len * (size_h * size_w * in_c)
                    else:
                        l1_size = data_len * (size_h * size_w * in_c + tile_co * tile_c * k_h * k_w)
                    if l1_size > l1_max_size:
                        continue
                    tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                    for tile_n in range(tile_co_, 15, -16):
                        k_max = in_c * k_h * k_w
                        k_max_ = ((k_max - 1) // 16 + 1) * 16
                        k_size = l0b_max_size // data_len // tile_n
                        k_size_ = k_size // 16 * 16
                        for tile_k in range(min(k_max_, k_size_), 15, -16):
                            m_max = (int(((tile_h - k_h) // s_h) + 1)) * (int(((tile_w - k_w) // s_w) + 1))
                            m_max_ = ((m_max - 1) // 16 + 1) * 16
                            m_size1 = l0a_max_size // data_len // tile_k
                            m_size1_ = m_size1 // 16 * 16
                            m_size2 = l0c_max_size // data_len // tile_n
                            m_size2_ = m_size2 // 16 * 16
                            for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                config_space.add(ConvConfig(tile_h, tile_co, tile_m, tile_k, tile_n, tile_w, bypass))

    return None, config_space, op_desc.__str__(), None, None
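
# --- Hedged sketch (illustrative): the L1 footprint test that prunes candidate tiles in
# _get_space_conv. With float16 data (data_len = 2 bytes) a tile must keep the feature-map
# slice plus, unless the filter is bypassed past L1, the filter slice under the 1 MiB L1
# budget. The sample numbers are assumptions chosen only to illustrate the arithmetic.
def _sketch_l1_budget(size_h=14, size_w=14, in_c=64, tile_co=64, k_h=3, k_w=3, bypass=0):
    data_len = 2                       # bytes per float16 element
    l1_max_size = 1024 * 1024          # 1 MiB of L1 buffer
    if bypass == 1:
        l1_size = data_len * (size_h * size_w * in_c)
    else:
        l1_size = data_len * (size_h * size_w * in_c + tile_co * in_c * k_h * k_w)
    return l1_size <= l1_max_size      # True here: (14*14*64 + 64*64*9) * 2 = 98816 bytes
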
def _get_space_conv_backprop_filter(op_desc: ConvBackpropDesc):
    """get config space of convolution backprop filter"""
    if not isinstance(op_desc, ConvBackpropDesc):
        raise TypeError('op_desc must be ConvBackpropDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)

    config_space = ListConfigSpace(ConvBackpropFilterConfig)

    # if double buffering is not enabled, keep size_scale at 1
    size_scale = 1
    block_size = 16
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2

    in_n, in_c, in_h, in_w = op_desc.fmap_shape
    cout, _, k_h, k_w = op_desc.filter_shape
    k_n = cout
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size
    pad_top, pad_bottom, pad_left, pad_right = pad_
    s_h, s_w = stride_

    tile_co_start = 16
    tile_ci_start = 16

    data_len = 2

    h_max = in_h + pad_top + pad_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + pad_left + pad_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    for tile_h in range(h_max, k_h - 1, -s_h):
        size_h = tile_h
        win_tile_h = (tile_h - k_h) // s_h + 1
        # Only one head for cut H axis
        if win_tile_h * s_h < pad_top:
            continue
        # Only one tail for cut H axis
        if (((win_h + win_tile_h - 1) // win_tile_h - 1) * win_tile_h - 1) * s_h + k_h > in_h + pad_top:
            continue
        if tile_h == h_max:
            w_range = range(w_max, k_w - 1, -s_w)
            size_h = in_h
        else:
            w_range = [w_max]
        h_tiles = (win_h + win_tile_h - 1) // win_tile_h
        if h_tiles == 2:
            size_h = max(tile_h - pad_top, in_h + pad_top - tile_h + k_h - s_h)
        for tile_w in w_range:
            size_w = tile_w
            win_tile_w = (tile_w - k_w) // s_w + 1
            # Only one head for cut W axis
            if win_tile_w * s_w < pad_left:
                continue
            # Only one tail for cut W axis
            if (((win_w + win_tile_w - 1) // win_tile_w - 1) * win_tile_w - 1) * s_w + k_w > in_w + pad_left:
                continue
            if size_w == w_max:
                size_w = in_w
            else:
                w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                if w_tiles == 2:
                    size_w = max(tile_w - pad_left, in_w + pad_left - tile_w + k_w - s_w)
            for tile_kh in range(k_h, 0, -1):
                for tile_kw in range(k_w, 0, -1):
                    k_n_ = ((k_n - 1) // 16 + 1) * 16
                    co_range = range(k_n_, tile_co_start - 1, -16)
                    for tile_co in co_range:
                        in_c_ = ((in_c - 1) // 16 + 1) * 16
                        ci_range = range(in_c_, tile_ci_start - 1, -16)
                        for tile_ci in ci_range:
                            tile_batch = 1
                            l1_size = data_len * tile_batch * (tile_co * win_tile_h * win_tile_w
                                                               + tile_ci * size_h * size_w)
                            if l1_size > l1_max_size:
                                continue
                            if tile_batch != in_n or tile_co != k_n_ or tile_ci != in_c_:
                                tile_m = tile_co
                                tile_n = tile_ci * tile_kh * tile_kw
                                l0c_size = data_len * tile_n * tile_m
                                if l0c_size > l0c_max_size:
                                    continue
                                k_max = tile_batch * tile_h * tile_w
                                k_max_ = ((k_max - 1) // 16 + 1) * 16
                                k_size1 = l0a_max_size // data_len // tile_m
                                k_size1_ = k_size1 // 16 * 16
                                k_size2 = l0b_max_size // data_len // tile_n
                                k_size2_ = k_size2 // 16 * 16
                                for tile_k in range(min(k_max_, k_size1_, k_size2_), 15, -16):
                                    config_space.add(
                                        ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw, tile_co, tile_batch,
                                                                 tile_h, tile_w, tile_m, tile_k, tile_n))
                            else:
                                for tile_n in range(tile_ci * tile_kh * tile_kw, 15, -16):
                                    k_max = tile_batch * tile_h * tile_w
                                    k_max_ = ((k_max - 1) // 16 + 1) * 16
                                    k_size = l0b_max_size // data_len // tile_n
                                    k_size_ = k_size // 16 * 16
                                    for tile_k in range(min(k_max_, k_size_), 15, -16):
                                        m_max = tile_co
                                        m_max_ = ((m_max - 1) // 16 + 1) * 16
                                        m_size1 = l0a_max_size // data_len // tile_k
                                        m_size1_ = m_size1 // 16 * 16
                                        m_size2 = l0c_max_size // data_len // tile_n
                                        m_size2_ = m_size2 // 16 * 16
                                        for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                            config_space.add(
                                                ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw, tile_co,
                                                                         tile_batch, tile_h, tile_w, tile_m,
                                                                         tile_k, tile_n))

    return None, config_space, op_desc.__str__(), None, None
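
# --- Hedged sketch (illustrative): the "one head / one tail" legality test applied to an
# H-axis cut in _get_space_conv_backprop_filter. A tile of win_tile_h output rows is
# rejected if the first tile cannot cover the whole top padding, or if the last tile
# would start beyond the padded input. The default sample numbers are assumptions.
def _sketch_h_cut_is_legal(win_tile_h, in_h=28, pad_top=1, pad_bottom=1, k_h=3, s_h=1):
    win_h = (in_h + pad_top + pad_bottom - k_h) // s_h + 1
    head_ok = win_tile_h * s_h >= pad_top
    tail_ok = ((((win_h + win_tile_h - 1) // win_tile_h - 1) * win_tile_h - 1) * s_h + k_h
               <= in_h + pad_top)
    return head_ok and tail_ok
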
def _get_space_conv_backprop_input(op_desc: ConvBackpropDesc):
    """get config space of convolution backprop input"""
    if not isinstance(op_desc, ConvBackpropDesc):
        raise TypeError('op_desc must be ConvBackpropDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)

    config_space = ListConfigSpace(ConvBackpropInputConfig)

    # if double buffering is not enabled, keep size_scale at 1
    size_scale = 1
    block_size = 16
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2
    ub_max_size = l0c_max_size

    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    in_c = (in_c + block_size - 1) // block_size * block_size
    k_n = (k_n + block_size - 1) // block_size * block_size
    pad_top, pad_bottom, pad_left, pad_right = pad_
    stride_h, stride_w = stride_

    out_c = k_n
    out_h = (in_h + pad_top + pad_bottom - k_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - k_w) // stride_w + 1
    out_h = out_h * stride_h
    out_w = out_w * stride_w

    p_top = k_h - pad_[0] - 1
    p_bottom = in_h + pad_[0] - stride_[0] * ((in_h + pad_[0] + pad_[1] - k_h) // stride_[0] + 1)
    p_left = k_w - pad_[2] - 1
    p_right = in_w + pad_[2] - stride_[1] * ((in_w + pad_[2] + pad_[3] - k_w) // stride_[1] + 1)
    s_h = 1
    s_w = 1

    tile_c = out_c
    tile_co_start = 16

    data_len = 2

    h_max = out_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = out_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    for tile_h in range(h_max, k_h - 1, -s_h):
        size_h = tile_h
        if tile_h == h_max:
            w_range = range(w_max, k_w - 1, -s_w)
            size_h = in_h
        else:
            w_range = [w_max]
        win_tile_h = (tile_h - k_h) // s_h + 1
        h_tiles = (win_h + win_tile_h - 1) // win_tile_h
        if h_tiles == 2:
            size_h = max(tile_h - p_top, in_h + p_top - tile_h + k_h - s_h)
        for tile_w in w_range:
            size_w = tile_w
            if size_w == w_max:
                size_w = in_w
            else:
                win_tile_w = (tile_w - k_w) // s_w + 1
                w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                if w_tiles == 2:
                    size_w = max(tile_w - p_left, in_w + p_left - tile_w + k_w - s_w)
            k_n_ = ((k_n - 1) // 16 + 1) * 16
            co_range = range(k_n_, tile_co_start - 1, -16)
            for tile_co in co_range:
                l1_size = data_len * (size_h * size_w * out_c + tile_co * tile_c * k_h * k_w)
                if l1_size > l1_max_size:
                    continue
                ub_size = data_len * (size_h * size_w * out_c)
                if ub_size > ub_max_size:
                    continue
                tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                for tile_n in range(tile_co_, 15, -16):
                    k_max = out_c * k_h * k_w
                    k_base = 16 * k_h * k_w
                    k_max_ = ((k_max - 1) // k_base + 1) * k_base
                    k_size = l0b_max_size // data_len // tile_n
                    k_size_ = k_size // k_base * k_base
                    for tile_k in range(min(k_max_, k_size_), k_base - 1, -k_base):
                        m_max = (int(((tile_h - k_h) // s_h) + 1)) * (int(((tile_w - k_w) // s_w) + 1))
                        m_max_ = ((m_max - 1) // 16 + 1) * 16
                        m_size1 = l0a_max_size // data_len // tile_k
                        m_size1_ = m_size1 // 16 * 16
                        m_size2 = l0c_max_size // data_len // tile_n
                        m_size2_ = m_size2 // 16 * 16
                        for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                            config_space.add(ConvBackpropInputConfig(tile_h, tile_co, tile_m, tile_k, tile_n, tile_w))

    return None, config_space, op_desc.__str__(), None, None
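
# --- Hedged sketch (illustrative): the pad transformation _get_space_conv_backprop_input
# uses to express dx as a stride-1 convolution over the (dilated) dy. The new top/left
# pads are kernel - pad - 1, and the bottom/right pads absorb whatever the forward stride
# left over. The default sample numbers are assumptions chosen to make the arithmetic easy.
def _sketch_backprop_input_pads(in_h=28, in_w=28, k_h=3, k_w=3,
                                pad_=(1, 1, 1, 1), stride_=(2, 2)):
    p_top = k_h - pad_[0] - 1
    p_bottom = in_h + pad_[0] - stride_[0] * ((in_h + pad_[0] + pad_[1] - k_h) // stride_[0] + 1)
    p_left = k_w - pad_[2] - 1
    p_right = in_w + pad_[2] - stride_[1] * ((in_w + pad_[2] + pad_[3] - k_w) // stride_[1] + 1)
    return p_top, p_bottom, p_left, p_right   # (1, 1, 1, 1) for these sample values
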
def conv_backprop_input_run(fmap_shape, filter_shape, pad_, stride_, dilation_, attrs=None):
    conv_dtype = 'float16'
    block_size = 16

    vc_util.convolution_format_check(fmap_shape, filter_shape, pad_, stride_, dilation_)

    in_n, in_c, in_h, in_w = fmap_shape
    cout, cin, w_h, w_w = filter_shape
    in_c = (in_c + block_size - 1) // block_size * block_size
    cout = (cout + block_size - 1) // block_size * block_size

    pad_top, pad_bottom, pad_left, pad_right = pad_
    stride_h, stride_w = stride_

    out_n = in_n
    out_c = cout
    out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1
    out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1

    x_shape = (out_n, out_c, out_h, out_w)
    w_shape = (cout, in_c, w_h, w_w)

    inN, inC, inH, inW = x_shape
    input_shape_nc1hwc0 = (inN, inC // block_size, inH, inW, block_size)
    k_n, k_c, k_h, k_w = w_shape
    kernel_shape_nc1hwc0 = (k_n, k_c // block_size, k_h, k_w, block_size)
    k_n, k_c1, k_h, k_w, k_c0 = kernel_shape_nc1hwc0
    kernel_shape_fractal = (k_c // block_size * k_h * k_w, k_n // block_size, block_size, block_size)

    input_shape = [input_shape_nc1hwc0, kernel_shape_fractal]

    input_file = os.environ.get("RANDOM_DATA_DISK_PATH", "")
    expect_file = input_file + "/" + gen_kernel_name(
        [input_shape], [conv_dtype],
        op_attrs=[fmap_shape, filter_shape, pad_, stride_, dilation_, attrs],
        kernel_name='conv_backprop_input', attrs=attrs) + ".bin"

    fmap_data, filter_data, expect = gen_data(fmap_shape, filter_shape, pad_, stride_, dilation_,
                                              expect_file, attrs=attrs)

    out_data = np.full(expect.shape, np.nan, 'float16')
    input = (fmap_data, filter_data)

    flag_w = os.environ.get("WRITE_TO_DISK", "No")
    if flag_w == "Yes":
        return input, out_data, expect, True

    mod = utils.op_build_test(
        conv_backprop_input, [input_shape], [conv_dtype],
        op_attrs=[fmap_shape, filter_shape, pad_, stride_, dilation_, attrs],
        kernel_name='conv_backprop_input', attrs=attrs)
    args = (fmap_data, filter_data, out_data)
    out_data = utils.mod_launch(mod, args, expect=expect)

    rtol, atol = get_rtol_atol("conv_backprop_input", conv_dtype)
    return input, out_data, expect, compare_tensor(out_data, expect, rtol=rtol, atol=atol, equal_nan=True)
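
# --- Hedged usage sketch (assumption: an Ascend build environment with this module's
# imports available; it will not run on a plain host). A minimal invocation of
# conv_backprop_input_run with 16-aligned channels; the shapes and the empty attrs dict
# are illustrative assumptions, not values taken from the original test suite.
def _sketch_run_conv_backprop_input():
    fmap_shape = (1, 16, 16, 16)        # N, C, H, W
    filter_shape = (16, 16, 3, 3)       # Cout, Cin, KH, KW
    pad_ = (1, 1, 1, 1)                 # top, bottom, left, right
    stride_ = (1, 1)
    dilation_ = (1, 1)
    return conv_backprop_input_run(fmap_shape, filter_shape, pad_, stride_, dilation_, attrs={})
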