def benchmark(input_0, kernel, stride, pad):
    """NumPy reference for the maxpool forward output and the one-hot argmax
    mask over the kh x kw window positions, used for result checking."""
    sh, sw = stride
    n, c1, h, w, c0 = input_0.shape
    kh, kw = kernel
    [ph_h, ph_t, pw_h, pw_t], [out_size_h, out_size_w] = \
        cal_pad_shapes_by_strategy(input_0.shape, kernel, stride, pad)
    out_size_w = get_value(out_size_w, akg.tvm.expr.IntImm)
    out_size_h = get_value(out_size_h, akg.tvm.expr.IntImm)
    out_shape = (n, c1, out_size_h, out_size_w, c0)
    mask_shape = (n, c1, kh, kw, out_size_h, out_size_w, c0)

    min_value = -65504.0 if input_0.dtype == 'float16' \
        else -340282346638528859811704183484516925440.0
    out = np.full(out_shape, min_value, dtype=input_0.dtype)
    mask = np.zeros(mask_shape)

    # pad the input with the dtype minimum so padding never wins the max
    inputpad = np.full((n, c1, h + ph_h + ph_t, w + pw_h + pw_t, c0),
                       np.finfo(input_0.dtype).min, dtype=input_0.dtype)
    inputpad[:, :, ph_h:ph_h + h, pw_h:pw_h + w, :] = input_0

    for i in range(out_size_h):
        for j in range(out_size_w):
            out[:, :, i, j, :] = \
                np.max(inputpad[:, :, i * sh:i * sh + kh,
                                j * sw:j * sw + kw, :], axis=(2, 3))

    # argmax over the flattened kh*kw window axis
    kerneled_shape_tmp = (inputpad.shape[0], inputpad.shape[1], kh * kw,
                          inputpad.shape[4])
    maxid = np.zeros(out_shape)
    for i in range(out_size_h):
        for j in range(out_size_w):
            maxid[:, :, i, j, :] = \
                np.argmax(np.reshape(
                    inputpad[:, :, i * sh:i * sh + kh,
                             j * sw:j * sw + kw, :],
                    kerneled_shape_tmp), axis=2)

    mask_shape_f = [n, c1, kh * kw, out_size_h, out_size_w, c0]
    mask = np.reshape(mask, tuple(mask_shape_f))
    index_shape = [n, c1, 1, out_size_h, out_size_w, c0]

    def cal_num(shape):
        return reduce(lambda i, j: i * j,
                      [shape[i] for i in range(len(shape))])

    # build the fancy-index vectors that scatter a 1 at each window's argmax
    n_indexs = [i for i in range(n)
                for _ in range(cal_num(index_shape[1:]))]
    c1_indexs = [i for i in range(c1)
                 for _ in range(cal_num(index_shape[2:]))] * n
    ho_indexs = [i for i in range(out_size_h)
                 for _ in range(cal_num(index_shape[4:]))] * \
        cal_num(index_shape[:3])
    wo_indexs = [i for i in range(out_size_w)
                 for _ in range(cal_num(index_shape[5:]))] * \
        cal_num(index_shape[:4])
    c0_indexs = list(range(c0)) * cal_num(index_shape[:-1])
    mask[n_indexs, c1_indexs, maxid.flatten().astype(np.int32),
         ho_indexs, wo_indexs, c0_indexs] = 1
    mask = np.reshape(mask, tuple(mask_shape))

    out = out.astype(input_0.dtype)
    mask = mask.astype(input_0.dtype)
    return out, mask
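
# Illustration: the index gymnastics above just place a 1 at the argmax
# position of every pooling window. A self-contained NumPy sketch of the same
# idea on a single 2-D plane (hypothetical helper, not used by the kernels
# below; the fractal NC1HWC0 layout and padding are simplified away):
def _example_maxpool_mask_2d(x, kh, kw, sh, sw):
    """Return the pooled maxima and a one-hot (kh, kw, oh, ow) window mask."""
    import numpy as np
    oh = (x.shape[0] - kh) // sh + 1
    ow = (x.shape[1] - kw) // sw + 1
    out = np.empty((oh, ow), x.dtype)
    mask = np.zeros((kh, kw, oh, ow), x.dtype)
    for i in range(oh):
        for j in range(ow):
            win = x[i * sh:i * sh + kh, j * sw:j * sw + kw]
            out[i, j] = win.max()
            k = int(win.argmax())            # flattened position of the first max
            mask[k // kw, k % kw, i, j] = 1  # one-hot over the window
    return out, mask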
def maxpool_ad(head, data, forward, mask, kernel, stride, pad, target=utils.CCE):
    """
    Automatic differentiation of maxpool with a manual schedule.

    Supported Platforms:
        'Ascend'
    """
    shape = get_shape(data)
    dtype = data.dtype
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    [ph_h, _, pw_h, _], [out_size_h, out_size_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, pad)
    batch_size, input_c1, input_h, input_w, input_c0 = shape

    # a tile scale of one has proven to be the most efficient
    tile_scale_h = 1
    tile_scale_w = 1

    tile_h = stride_h * tile_scale_h
    if kernel_h == stride_h:  # non-overlapping case
        tile_h_pad_u = ph_h % stride_h
    elif kernel_h % stride_h == 0:
        tile_h_pad_u = kernel_h - stride_h - ph_h
    else:
        tile_h_pad_u = kernel_h - kernel_h % stride_h - ph_h
    tile_h_pad_l = kernel_h - stride_h + ph_h
    tile_input_h = tile_h + tile_h_pad_u + tile_h_pad_l
    tile_h_out = (input_h - 1) // tile_h + 1
    if ph_h % stride_h == 0:
        pad_output_h = ph_h // stride_h
    else:
        pad_output_h = ph_h // stride_h + 1
    if tile_h_pad_u % stride_h == 0:
        pad_output_h -= tile_h_pad_u // stride_h
    else:
        pad_output_h -= tile_h_pad_u // stride_h + 1
    tile_output_h = (tile_input_h - kernel_h) // stride_h + 1

    tile_w = stride_w * tile_scale_w
    if kernel_w == stride_w:  # non-overlapping case
        tile_w_pad_u = pw_h % stride_w
    elif kernel_w % stride_w == 0:
        tile_w_pad_u = kernel_w - stride_w - pw_h
    else:
        tile_w_pad_u = kernel_w - kernel_w % stride_w - pw_h
    tile_w_pad_l = kernel_w - stride_w + pw_h
    tile_input_w = tile_w + tile_w_pad_u + tile_w_pad_l
    tile_w_out = (input_w - 1) // tile_w + 1
    if pw_h % stride_w == 0:
        pad_output_w = pw_h // stride_w
    else:
        pad_output_w = pw_h // stride_w + 1
    if tile_w_pad_u % stride_w == 0:
        pad_output_w -= tile_w_pad_u // stride_w
    else:
        pad_output_w -= tile_w_pad_u // stride_w + 1
    tile_output_w = (tile_input_w - kernel_w) // stride_w + 1

    def custom_maxpool_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        head_reshaped = akg.tvm.compute(
            (batch_size, input_c1, tile_h_out, tile_w_out, tile_output_h,
             tile_output_w, input_c0),
            lambda b, c1, h_out, w_out, oh, ow, c0: akg.tvm.expr.Select(
                akg.tvm.any(
                    h_out * tile_scale_h + pad_output_h + oh < 0,
                    h_out * tile_scale_h + pad_output_h + oh > out_size_h - 1,
                    w_out * tile_scale_w + pad_output_w + ow < 0,
                    w_out * tile_scale_w + pad_output_w + ow > out_size_w - 1),
                akg.tvm.const(0.0, dtype=dtype),
                head_(b, c1, h_out * tile_scale_h + pad_output_h + oh,
                      w_out * tile_scale_w + pad_output_w + ow, c0)),
            name="head_reshaped")

        mask_reshaped = akg.tvm.compute(
            (batch_size, input_c1, tile_h_out, tile_w_out, tile_output_h,
             tile_output_w, kernel_h, kernel_w, input_c0),
            lambda b, c1, h_out, w_out, oh, ow, kh, kw, c0: akg.tvm.expr.Select(
                akg.tvm.any(
                    h_out * tile_scale_h + pad_output_h + oh < 0,
                    h_out * tile_scale_h + pad_output_h + oh > out_size_h - 1,
                    w_out * tile_scale_w + pad_output_w + ow < 0,
                    w_out * tile_scale_w + pad_output_w + ow > out_size_w - 1),
                akg.tvm.const(0.0, dtype=dtype),
                mask(b, c1, kh, kw, h_out * tile_scale_h + pad_output_h + oh,
                     w_out * tile_scale_w + pad_output_w + ow, c0)),
            name="mask_reshaped")

        d_data = akg.tvm.compute(
            (batch_size, input_c1, tile_h_out, tile_w_out, tile_output_h,
             tile_output_w, kernel_h, kernel_w, input_c0),
            lambda b, c1, h_out, w_out, oh, ow, kh, kw, c0:
                mask_reshaped(b, c1, h_out, w_out, oh, ow, kh, kw, c0) *
                head_reshaped(b, c1, h_out, w_out, oh, ow, c0),
            name="d_data")

        data_reorg = akg.tvm.compute(
            (batch_size, input_c1, tile_h_out, tile_w_out, tile_output_h,
             tile_output_w, tile_h, tile_w, input_c0),
            lambda b, c1, h_out, w_out, oh, ow, h, w, c0: akg.tvm.expr.Select(
                akg.tvm.any(
                    h + tile_h_pad_u < oh * stride_h,
                    h + tile_h_pad_u > oh * stride_h + kernel_h - 1,
                    w + tile_w_pad_u < ow * stride_w,
                    w + tile_w_pad_u > ow * stride_w + kernel_w - 1),
                akg.tvm.const(0, dtype=dtype),
                d_data(b, c1, h_out, w_out, oh, ow,
                       h + tile_h_pad_u - oh * stride_h,
                       w + tile_w_pad_u - ow * stride_w, c0)),
            name="data_reorg")

        result_tile = akg.topi.sum(data_reorg, [4, 5])
        result = akg.tvm.compute(
            shape,
            lambda b, c1, h, w, c0: result_tile(
                b, c1, h // tile_h, w // tile_w, h % tile_h, w % tile_w, c0),
            name="result")
        return [result]

    # override the differentiation computation with the custom function
    [dl_ddata] = akg.differentiate(
        forward, [data], head, None, None,
        override={forward: ([data], custom_maxpool_fdiff)})

    # schedule for the differentiation operation
    s = akg.tvm.create_schedule([dl_ddata.op])

    # get computations
    result = dl_ddata
    result_tile = result.op.input_tensors[0]
    data_reorg = result_tile.op.input_tensors[0]
    d_data = data_reorg.op.input_tensors[0]
    mask_reshaped = d_data.op.input_tensors[0]
    head_reshaped = d_data.op.input_tensors[1]

    def comp_func(s):
        data_ub = s.cache_read(mask, "local.UB", [mask_reshaped])
        head_ub = s.cache_read(head, "local.UB", [head_reshaped])
        result_ub = s.cache_write(result, "local.UB")
        s[d_data].set_scope("local.UB")
        s[data_reorg].set_scope("local.UB")
        s[mask_reshaped].set_scope("local.UB")
        s[head_reshaped].set_scope("local.UB")
        s[result_tile].set_scope("local.UB")
        s[result_ub].compute_inline()

        # inline inputs
        s[head_ub].compute_inline()
        s[data_ub].compute_inline()

        # result_tile dependencies
        s[data_reorg].compute_inline()
        b, c1, h_out, w_out, h, w, c0 = result_tile.op.axis
        oh, ow = result_tile.op.reduce_axis
        s[result_tile].reorder(b, c1, h_out, w_out, h, w, oh, ow, c0)
        s[d_data].compute_at(s[result_tile], w_out)
        s[mask_reshaped].compute_at(s[result_tile], w_out)
        s[head_reshaped].compute_at(s[result_tile], w_out)

        # tile result
        b, c1, h, w, c0 = result.op.axis
        h_out, h_in = s[result].split(h, tile_h)
        w_out, w_in = s[result].split(w, tile_w)
        s[result].reorder(b, c1, h_out, w_out, h_in, w_in, c0)
        s[result_tile].compute_at(s[result], w_out)

    return dl_ddata, comp_func
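
# For intuition, the override above computes, in tiled form, the plain maxpool
# backward rule dX = scatter(mask * head) over each window. A minimal NumPy
# sketch under simplified assumptions (single 2-D plane, no padding; the
# helper name is illustrative only, not part of this module):
def _example_maxpool_grad_from_mask(mask, head, in_shape, kh, kw, sh, sw):
    """mask: one-hot (kh, kw, oh, ow); head: (oh, ow) upstream gradient."""
    import numpy as np
    dx = np.zeros(in_shape, head.dtype)
    oh, ow = head.shape
    for i in range(oh):
        for j in range(ow):
            # only the argmax cell of each window receives the gradient
            dx[i * sh:i * sh + kh, j * sw:j * sw + kw] += \
                mask[:, :, i, j] * head[i, j]
    return dx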
def avgpool_grad(x, dy, kernel, stride, pad):
    """
    Gradient for avgpool.

    Args:
        x (tvm.tensor.Tensor): Forward input tensor of type float16.
        dy (tvm.tensor.Tensor): Gradient for forward output of type float16.
        kernel (Union[list, tuple]): Two int numbers for window size of H and W for pooling.
        stride (Union[list, tuple]): Two int numbers for stride size of H and W for pooling.
        pad (Union[str, list, tuple]): Padding strategy for pooling.

    Returns:
        Gradient of forward input tensor.
    """
    dtype = x.dtype
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    shape = get_shape(x)
    vc_util.check_shape(shape)
    if len(shape) != 5:
        raise RuntimeError("Only support 5-dim pooling!")
    if shape[-1] % 16 != 0:
        raise RuntimeError("Last shape must be divisible by 16!")
    if len(kernel) != 2:
        raise RuntimeError("Only support 2-dim kernel!")
    if len(stride) != 2:
        raise RuntimeError("Only support 2-dim stride!")
    if isinstance(pad, (list, tuple)) and len(pad) != 4:
        raise RuntimeError("Only support string or list/tuple of 4 int numbers!")

    dim_info, _ = set_dim_func_(x, dy, kernel, stride, pad)
    attrs = {DIM: dim_info}

    @script
    def grad(zero, one_div_ksize, x, dy, kh, kw, sh, sw, ph_h, ph_t, pw_h, pw_t):
        tmpdx = allocate((x.shape[0], x.shape[1], x.shape[2] + ph_h + ph_t,
                          x.shape[3] + pw_h + pw_t, x.shape[4]), x.dtype)
        dy_tmp = allocate(dy.shape, dy.dtype)
        dx = output_tensor(x.shape, x.dtype)

        # clear the padded gradient buffer
        for n in range(tmpdx.shape[0]):
            for c1 in range(tmpdx.shape[1]):
                for h in range(tmpdx.shape[2]):
                    for w in range(tmpdx.shape[3]):
                        for c0 in range(tmpdx.shape[4]):
                            tmpdx[n, c1, h, w, c0] = zero

        # scatter dy / (kh * kw) into every input cell each window touches
        for n in range(dy.shape[0]):
            for c1 in range(dy.shape[1]):
                for i in range(dy.shape[2]):
                    for j in range(dy.shape[3]):
                        for c0 in range(dy.shape[4]):
                            dy_tmp[n, c1, i, j, c0] = dy[n, c1, i, j, c0] * one_div_ksize
                            for ah in range(kh):
                                for aw in range(kw):
                                    if dy.shape[2] == 1 and dy.shape[3] == 1:
                                        tmpdx[n, c1, i * sh + ah, j * sw + aw, c0] = \
                                            dy_tmp[n, c1, i, j, c0]
                                    else:
                                        tmpdx[n, c1, i * sh + ah, j * sw + aw, c0] = \
                                            tmpdx[n, c1, i * sh + ah, j * sw + aw, c0] + \
                                            dy_tmp[n, c1, i, j, c0]

        # crop the padding back off if any was applied
        if ph_h > 0 or ph_t > 0 or pw_h > 0 or pw_t > 0:
            for n in range(dx.shape[0]):
                for c1 in range(dx.shape[1]):
                    for h in range(dx.shape[2]):
                        for w in range(dx.shape[3]):
                            for c0 in range(dx.shape[4]):
                                dx[n, c1, h, w, c0] = tmpdx[n, c1, h + ph_h, w + pw_h, c0]
            return dx
        else:
            return tmpdx

    kh, kw = kernel
    sh, sw = stride
    [ph_h, ph_t, pw_h, pw_t], _ = cal_pad_shapes_by_strategy(shape, kernel, stride, pad)
    zero = akg.tvm.const(0.0, dtype=dtype)
    one_div_ksize = akg.tvm.const(1.0 / (kh * kw), dtype=dtype)
    params = [kh, kw, sh, sw, ph_h, ph_t, pw_h, pw_t]
    output = grad(zero, one_div_ksize, x, dy,
                  *tuple(akg.tvm.convert(i) for i in params))
    attrs["loop_partition_unroll"] = 1
    return output, attrs
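
# The hybrid above distributes dy / (kh * kw) uniformly over every input cell
# each window touches. Equivalent NumPy sketch for one 2-D plane without
# padding (illustrative helper, not part of the operator):
def _example_avgpool_grad_2d(dy, kh, kw, sh, sw):
    import numpy as np
    oh, ow = dy.shape
    dx = np.zeros(((oh - 1) * sh + kh, (ow - 1) * sw + kw), dy.dtype)
    for i in range(oh):
        for j in range(ow):
            dx[i * sh:i * sh + kh, j * sw:j * sw + kw] += dy[i, j] / (kh * kw)
    return dx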
def maxpool_grad(x, y, dy, kernel, stride, pad):
    """
    Performs the gradient of maxpool pooling on the input data.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32.
        y (tvm.tensor.Tensor): Tensor, the maxpool result.
        dy (tvm.tensor.Tensor): Tensor, the gradient to be propagated.
        kernel (Union[List, Tuple]): two int numbers for pooling window's size.
        stride (Union[List, Tuple]): two int numbers for window's stride.
        pad (Union[String, List, Tuple]): padding, should be 'VALID', 'SAME' or
            instance of list(four int numbers, as 'CONSTANTS' strategy).
            Support **pad** is the same as avgpool's **Strategies**.

    Returns:
        Tensor as result for gradient of maxpooling.
    """
    attrs = get_attrs()
    dim_info, _, attrs_info = maxpool_grad_set_dim_func(x, y, dy, kernel, stride, pad)
    attrs.update(attrs_info)
    attrs[DIM] = dim_info

    shape = get_shape(x)
    ori_dtype = x.dtype
    vc_util.ops_dtype_check(ori_dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    if utils.product_is_mini() and ori_dtype == 'float32':
        raise RuntimeError("Maxpool only supports 'float16' "
                           "while platform is mini_v100!")
    dtype = ori_dtype

    if len(shape) != 5:
        raise ValueError("Only support 5-dim pooling!")
    if shape[-1] % 16 != 0:
        raise ValueError("Last shape must be divisible by 16!")
    if len(kernel) != 2:
        raise ValueError("Only support 2-dim kernel!")
    if len(stride) != 2:
        raise ValueError("Only support 2-dim stride!")
    if not isinstance(pad, str) \
            and not (isinstance(pad, (list, tuple)) and len(pad) == 4):
        raise ValueError("Only support string or list/tuple of 4 int numbers!")
    vc_util.check_shape(shape)

    in_n, in_c1, in_h, in_w, in_c0 = shape
    k_h, k_w = kernel
    s_h, s_w = stride
    [ph_h, ph_t, pw_h, pw_t], [y_h, y_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, pad)

    k_h_hybrid = k_h
    k_w_hybrid = k_w
    yn = in_n
    yc1 = in_c1
    yc0 = in_c0

    @script(capture=locals())
    def max_pool_grad_hybrid(zero_, one_, min_value_, x_, y_, dy_):
        x_dummy_ = allocate(
            (in_n, in_c1, ph_h + in_h + ph_t, pw_h + in_w + pw_t, in_c0),
            x_.dtype, "local")
        x_img_ = allocate((yn, yc1, y_h, y_w, k_h_hybrid, k_w_hybrid, yc0),
                          x_.dtype, "local")
        y_img_ = allocate((yn, yc1, y_h, y_w, k_h_hybrid, k_w_hybrid, yc0), x_.dtype)
        mask_ = allocate((yn, yc1, y_h, y_w, k_h_hybrid, k_w_hybrid, yc0), x_.dtype)
        mask_new = allocate((yn, yc1, y_h, y_w, k_h_hybrid, k_w_hybrid, yc0), dy_.dtype)
        mask_res = allocate((yn, yc1, y_h, y_w, k_h_hybrid, k_w_hybrid, yc0), dy_.dtype)
        output_pre = allocate((yn, yc1, y_h, y_w, k_h_hybrid, k_w_hybrid, yc0), dy_.dtype)
        output_dummy_body = allocate(
            (in_n, in_c1, ph_h + in_h + ph_t, pw_h + in_w + pw_t, in_c0),
            dy_.dtype)
        output = output_tensor((in_n, in_c1, in_h, in_w, in_c0), dy_.dtype)

        for n in range(yn):
            for c1 in range(yc1):
                for h in range(y_h):
                    # initialize the padded rows touched by this window row
                    for kh in range(k_h_hybrid):
                        for iw in range(pw_h + in_w + pw_t):
                            for c0 in range(yc0):
                                x_dummy_[n, c1, h * s_h + kh, iw, c0] = min_value_
                                output_dummy_body[n, c1, h * s_h + kh, iw, c0] = zero_
                    for kh in range(k_h_hybrid):
                        for iw in range(in_w):
                            for c0 in range(yc0):
                                if (h * s_h + kh >= ph_h and
                                        h * s_h + kh < in_h + ph_h):
                                    x_dummy_[n, c1, h * s_h + kh, iw + pw_h, c0] = \
                                        x_[n, c1, h * s_h + kh - ph_h, iw, c0]
                    for kh in range(k_h_hybrid):
                        for iw in range(in_w):
                            for c0 in range(yc0):
                                if (h * s_h + kh >= ph_h and
                                        h * s_h + kh < in_h + ph_h):
                                    output_dummy_body[n, c1, h * s_h + kh, iw + pw_h, c0] = \
                                        output[n, c1, h * s_h + kh - ph_h, iw, c0]

                    for w in range(y_w):
                        # mark every window position equal to the pooled max
                        for kh in range(k_h_hybrid):
                            for kw in range(k_w_hybrid):
                                for c0 in range(yc0):
                                    x_img_[n, c1, h, w, kh, kw, c0] = \
                                        x_dummy_[n, c1, h * s_h + kh, w * s_w + kw, c0]
                                    y_img_[n, c1, h, w, kh, kw, c0] = \
                                        y_[n, c1, h, w, c0]
                                    mask_[n, c1, h, w, kh, kw, c0] = zero_ \
                                        if x_img_[n, c1, h, w, kh, kw, c0] \
                                        < y_img_[n, c1, h, w, kh, kw, c0] \
                                        else one_

                        # inclusive prefix sum of mask_ in row-major (kh, kw) order
                        for kh in range(k_h_hybrid):
                            for kw in range(k_w_hybrid):
                                for c0 in range(yc0):
                                    mask_new[n, c1, h, w, kh, kw, c0] = zero_
                                for kh_0 in range(kh):
                                    for kw_0 in range(k_w_hybrid):
                                        for c0 in range(yc0):
                                            mask_new[n, c1, h, w, kh, kw, c0] = \
                                                mask_new[n, c1, h, w, kh, kw, c0] \
                                                + mask_[n, c1, h, w, kh_0, kw_0, c0]
                                for kw_0 in range(kw + 1):
                                    for c0 in range(yc0):
                                        mask_new[n, c1, h, w, kh, kw, c0] = \
                                            mask_new[n, c1, h, w, kh, kw, c0] \
                                            + mask_[n, c1, h, w, kh, kw_0, c0]

                        # keep only the first max, then scatter the gradient
                        for kh in range(k_h_hybrid):
                            for kw in range(k_w_hybrid):
                                for c0 in range(yc0):
                                    mask_res[n, c1, h, w, kh, kw, c0] = \
                                        zero_ \
                                        if mask_new[n, c1, h, w, kh, kw, c0] \
                                        > mask_[n, c1, h, w, kh, kw, c0] \
                                        else mask_[n, c1, h, w, kh, kw, c0]
                                    output_pre[n, c1, h, w, kh, kw, c0] = \
                                        mask_res[n, c1, h, w, kh, kw, c0] \
                                        * dy_[n, c1, h, w, c0]
                                    output_dummy_body[n, c1, h * s_h + kh, w * s_w + kw, c0] += \
                                        output_pre[n, c1, h, w, kh, kw, c0]

                    for kh in range(k_h_hybrid):
                        for iw in range(in_w):
                            for c0 in range(yc0):
                                if (h * s_h + kh >= ph_h and
                                        h * s_h + kh < in_h + ph_h):
                                    output[n, c1, h * s_h + kh - ph_h, iw, c0] = \
                                        output_dummy_body[n, c1, h * s_h + kh, iw + pw_h, c0]
        return output

    zero = akg.tvm.const(0.0, dtype=dtype)
    one = akg.tvm.const(1.0, dtype=dtype)
    min_value = akg.tvm.const(
        -65504.0 if dtype == 'float16'
        else -340282346638528859811704183484516925440.0,
        dtype=dtype)
    output = max_pool_grad_hybrid(zero, one, min_value, x, y, dy)
    return output, attrs
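
# The mask_new/mask_res passage keeps only the *first* maximum of each window:
# mask_new is an inclusive prefix sum of mask_ in row-major (kh, kw) order, and
# a position survives only where that prefix sum does not exceed the mask value
# (i.e. equals 1 at the first hit). NumPy sketch of the same selection over a
# flattened window axis (illustrative only):
def _example_first_max_mask(mask):
    import numpy as np
    prefix = np.cumsum(mask, axis=0)         # inclusive prefix sum over window axis
    return np.where(prefix > mask, 0, mask)  # zero out every max after the first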
def maxpool_with_argmax(data, kernel, stride, strategy):
    """
    Performs max pooling on the input data and records the first-max positions.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID', 'SAME' or
            instance of list(four int numbers, as 'CONSTANTS' strategy).
            Supported **Strategies** are the same as for avgpool.

    Returns:
        tvm.tensor.Tensor, the max pooling result, plus a mask marking the
        first maximum position in each window and the build attributes.
    """
    attrs = get_attrs()
    dim_info = maxpool_with_argmax_set_dim_func(data, kernel, stride, strategy)[0]
    for k, v in attr_map_v2.items():
        attrs[k] = v
    if dim_info != "":
        attrs['dim'] = dim_info
    attrs["custom_tiling"] = maxpool_with_argmax_tiling_strategy(
        data, kernel, stride, strategy)

    shape = get_shape(data)
    dtype = data.dtype
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape(kernel, 2, 'Kernel')
    vc_util.check_shape(stride, 2, 'Stride')
    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape
    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)
    pad = [ph_h, ph_t, pw_h, pw_t]
    zero = akg.tvm.const(0.0, dtype=dtype)
    one = akg.tvm.const(1.0, dtype=dtype)
    min_value = akg.tvm.const(
        -65504.0 if dtype == 'float16'
        else -340282346638528859811704183484516925440.0,
        dtype=dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shape_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0)
    fmap_img2col_ub = img2col(data, fmap_img2col_shape_ub, kernel_h, kernel_w,
                              pad, stride, min_value, tag='')

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    output = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.max(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_max")

    pooling_mask = akg.tvm.compute(
        fmap_img2col_shape_ub,
        lambda n, c1, kh, kw, oh, ow, c0: akg.tvm.if_then_else(
            fmap_img2col_ub[n, c1, kh, kw, oh, ow, c0] < output[n, c1, oh, ow, c0],
            zero, one),
        name="pooling_mask")

    mask_flag = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: pooling_mask[n, c1, 0, 0, oh, ow, c0],
        name="mask_flag")
    mask_init = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: pooling_mask[n, c1, 0, 0, oh, ow, c0],
        name="mask_init")

    # spec 2
    @script(capture=locals())
    def hybrid_first_max(mask_, flag_, flag2_, zero_, one_):
        output_ = allocate((in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0),
                           mask_.dtype, 'local')
        for n_i in range(in_n):
            for c1_i in range(in_c1):
                for oh_i in range(out_h):
                    for ow_i in range(out_w):
                        for c0_i in range(in_c0):
                            output_[n_i, c1_i, 0, 0, oh_i, ow_i, c0_i] = \
                                flag2_[n_i, c1_i, oh_i, ow_i, c0_i]
                for kh_i in range(kernel_h):
                    for kw_i in range(kernel_w):
                        for oh_i in range(out_h):
                            for ow_i in range(out_w):
                                for c0_i in range(in_c0):
                                    output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] = \
                                        mask_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] - \
                                        flag_[n_i, c1_i, oh_i, ow_i, c0_i]
                                    output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] = \
                                        max(output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i],
                                            zero_)
                                    flag_[n_i, c1_i, oh_i, ow_i, c0_i] = \
                                        flag_[n_i, c1_i, oh_i, ow_i, c0_i] + \
                                        output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i]
        return output_

    mask_first_max = hybrid_first_max(pooling_mask, mask_flag, mask_init, zero, one)
    return output, mask_first_max, attrs
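
# hybrid_first_max achieves the first-max selection with a running flag instead
# of an explicit prefix sum: out_p = max(mask_p - flag, 0), then flag += out_p
# (in the kernel above the flag is seeded from the window's first position).
# Simplified NumPy sketch of the recurrence with the flag starting at zero
# (illustrative only, over a flattened 1-D window axis):
def _example_running_first_max(mask_1d):
    import numpy as np
    out = np.zeros_like(mask_1d)
    flag = 0.0
    for p in range(mask_1d.shape[0]):
        out[p] = max(mask_1d[p] - flag, 0.0)  # 1 only at the first max, 0 after
        flag += out[p]
    return out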
def quantized_maxpool_tiling_strategy(data, kernel, stride, pad, quant_algo):
    """Custom tiling for quantized maxpool."""
    batch, c_1, fm_h, fm_w, c_0 = get_shape(data)
    _, [out_h, out_w] = \
        cal_pad_shapes_by_strategy(get_shape(data), kernel, stride, pad)

    strategy = list()
    if c_0 == 16:
        h_cut = out_h
        if fm_h >= 50 and fm_w >= 50:
            h_cut = 3
        dim_ind = 0
        tiling_params = list()
        if batch > 1:
            tiling_params.append([1, ct_util.TileConstraint.FACTOR, dim_ind])
            dim_ind = dim_ind + 1
        if c_1 > 1:
            tiling_params.append([1, ct_util.TileConstraint.FACTOR, dim_ind])
            dim_ind = dim_ind + 1
        tiling_params.append([h_cut, ct_util.TileConstraint.FACTOR, dim_ind])
        tiling_params.append(["H", ct_util.TileConstraint.SET_AXIS_INFO, dim_ind])
        tiling_params.append([out_w, ct_util.TileConstraint.FACTOR, dim_ind + 1])

        if quant_algo is not None:
            tiling_params.append([kernel[0], ct_util.TileConstraint.FACTOR, dim_ind + 2])
            tiling_params.append([kernel[1], ct_util.TileConstraint.FACTOR, dim_ind + 3])
            tiling_params.append([16, ct_util.TileConstraint.FACTOR, dim_ind + 4])
        else:
            tiling_params.append([kernel[0], ct_util.TileConstraint.FACTOR, dim_ind + 3])
            tiling_params.append([kernel[1], ct_util.TileConstraint.FACTOR, dim_ind + 4])
            tiling_params.append([16, ct_util.TileConstraint.FACTOR, dim_ind + 2])

        for para in tiling_params:
            strategy += ct_util.create_constraint_on_axis(values=para[0],
                                                          constraints=para[1],
                                                          axis=para[2])
    return strategy
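
# Worked example of the axis bookkeeping above (illustrative): for an input
# with batch > 1 and c_1 > 1, dim_ind advances to 2, so the constraints land as
#   axis 0: batch tiled by 1       axis 1: c_1 tiled by 1
#   axis 2: h_cut (plus the "H" axis info)   axis 3: out_w
# with the kernel[0] / kernel[1] / 16 factors on axes dim_ind+2..dim_ind+4,
# in an order that depends on whether quant_algo is given.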
def maxpool_with_argmax_dynamic(data, kernel, stride, strategy):
    """
    Performs max pooling on the input data (dynamic-shape variant).

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID', 'SAME' or
            instance of list(four int numbers, as 'CONSTANTS' strategy).
            Supported **Strategies** are the same as for avgpool.

    Returns:
        tvm.tensor.Tensor, the max pooling result, plus a zero-initialized
        first-max mask placeholder and the build attributes.
    """
    attrs = get_dynamic_attrs()
    dim_info = maxpool_with_argmax_set_dim_func(data, kernel, stride, strategy)[0]
    for k, v in attr_map_v2.items():
        attrs[k] = v
    if dim_info != "":
        attrs['dim'] = dim_info
    # attrs["custom_tiling"] = maxpool_with_argmax_custom_tiling_strategy(data)
    attrs["enable_feature_library"] = True

    shape = get_shape(data)
    dtype = data.dtype
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape(kernel, 2, 'Kernel')
    vc_util.check_shape(stride, 2, 'Stride')
    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape
    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)
    pad = [ph_h, ph_t, pw_h, pw_t]
    zero = akg.tvm.const(0.0, dtype=dtype)
    min_value = akg.tvm.const(
        -65504.0 if dtype == 'float16'
        else -340282346638528859811704183484516925440.0,
        dtype=dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shape_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0)
    fmap_img2col_ub = img2col(data, fmap_img2col_shape_ub, kernel_h, kernel_w,
                              pad, stride, min_value, tag='')

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    output = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.max(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_max")

    mask_first_max_shape = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0)
    mask_first_max = akg.tvm.compute(mask_first_max_shape, lambda *indice: zero,
                                     name="mask_first_max")
    attrs["custom_tiling"] = maxpool_with_argmax_dynamic_tensor_strategy(
        data, fmap_img2col_ub, mask_first_max)
    attrs["dynamic_shape"] = ds.set_dynamic_shape_limit_for_tensor(
        output, [64, 64], [2, 3])
    return output, mask_first_max, attrs
def maxpool(data, kernel, stride, strategy):
    """
    Performs max pooling on the input data.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID', 'SAME' or
            instance of list(four int numbers for 'CONSTANTS' strategy).
            Supported **Strategies** are the same as for avgpool.

    Returns:
        tvm.tensor.Tensor, as result for max pooling.
    """
    attrs = attr_map
    attrs['dim'] = maxpool_set_dim_func(data, kernel, stride, strategy)[0]

    shape = get_shape(data)
    dtype = data.dtype
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    vc_util.check_shape(kernel, 2, "Kernel")
    vc_util.check_shape(stride, 2, "Stride")
    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    in_n, in_c1, in_h, in_w, in_c0 = shape
    [ph_h, _, pw_h, _], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    if attrs.get("dynamic") is True:
        # dynamic shape: out_h and out_w could be expressed through the input
        # shape, but the expressions are too complicated, so use fresh vars
        out_h = akg.tvm.var("OUT_H")
        out_w = akg.tvm.var("OUT_W")

    @script(capture=locals())
    def dynamic_max_pool_hybrid_0(zero_, one_, min_value_, x_, in_n, in_c1,
                                  in_h, in_w, in_c0, out_h, out_w):
        output = output_tensor((in_n, in_c1, out_h, out_w, in_c0), x_.dtype)
        for n in range(in_n):
            for c1 in range(in_c1):
                # Head: the first output row is the only one reading top padding
                for ow in range(out_w):
                    for c0 in range(in_c0):
                        output[n, c1, 0, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for ow in range(out_w):
                            for c0 in range(in_c0):
                                if ph_h <= kh <= in_h + ph_h - 1 \
                                        and 0 <= ow * stride_w + kw - pw_h <= in_w - 1:
                                    output[n, c1, 0, ow, c0] = \
                                        max(output[n, c1, 0, ow, c0],
                                            x_[n, c1, kh - ph_h,
                                               ow * stride_w + kw - pw_h, c0])
                # Tail
                for oh in range(out_h - 1):
                    for ow in range(out_w):
                        for c0 in range(in_c0):
                            output[n, c1, oh + 1, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for oh in range(out_h - 1):
                            for ow in range(out_w):
                                for c0 in range(in_c0):
                                    if ph_h <= (oh + 1) * stride_h + kh <= in_h + ph_h - 1 \
                                            and pw_h <= ow * stride_w + kw <= in_w + pw_h - 1:
                                        output[n, c1, oh + 1, ow, c0] = max(
                                            output[n, c1, oh + 1, ow, c0],
                                            x_[n, c1, (oh + 1) * stride_h + kh - ph_h,
                                               ow * stride_w + kw - pw_h, c0])
        return output

    # static shape's hybrid
    @script(capture=locals())
    def static_max_pool_hybrid_0(zero_, one_, min_value_, x_):
        output = output_tensor((in_n, in_c1, out_h, out_w, in_c0), x_.dtype)
        for n in range(in_n):
            for c1 in range(in_c1):
                # Head
                for ow in range(out_w):
                    for c0 in range(in_c0):
                        output[n, c1, 0, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for ow in range(out_w):
                            for c0 in range(in_c0):
                                if ph_h <= kh <= in_h + ph_h - 1 \
                                        and 0 <= ow * stride_w + kw - pw_h <= in_w - 1:
                                    output[n, c1, 0, ow, c0] = \
                                        max(output[n, c1, 0, ow, c0],
                                            x_[n, c1, kh - ph_h,
                                               ow * stride_w + kw - pw_h, c0])
                # Tail
                for oh in range(out_h - 1):
                    for ow in range(out_w):
                        for c0 in range(in_c0):
                            output[n, c1, oh + 1, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for oh in range(out_h - 1):
                            for ow in range(out_w):
                                for c0 in range(in_c0):
                                    if ph_h <= (oh + 1) * stride_h + kh <= in_h + ph_h - 1 \
                                            and pw_h <= ow * stride_w + kw <= in_w + pw_h - 1:
                                        output[n, c1, oh + 1, ow, c0] = max(
                                            output[n, c1, oh + 1, ow, c0],
                                            x_[n, c1, (oh + 1) * stride_h + kh - ph_h,
                                               ow * stride_w + kw - pw_h, c0])
        return output

    zero = akg.tvm.const(0.0, dtype=dtype)
    one = akg.tvm.const(1.0, dtype=dtype)
    min_value = akg.tvm.const(
        -65504.0 if dtype == 'float16'
        else -340282346638528859811704183484516925440.0,
        dtype=dtype)

    if attrs.get("dynamic") is True:
        output = dynamic_max_pool_hybrid_0(zero, one, min_value, data, in_n,
                                           in_c1, in_h, in_w, in_c0, out_h, out_w)
    else:
        output = static_max_pool_hybrid_0(zero, one, min_value, data)

    return output, attrs
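
# Worked example of the padding arithmetic the Head/Tail split relies on
# (assumed semantics of cal_pad_shapes_by_strategy; illustrative only):
#   in_h = 10, kernel_h = 5, stride_h = 4
#   VALID: out_h = ceil((10 - 4) / 4) = 2, with ph_h = ph_t = 0
#   SAME : out_h = ceil(10 / 4) = 3; total pad = (3 - 1) * 4 + 5 - 10 = 3,
#          split as ph_h = 1 (head) and ph_t = 2 (tail).
# Only output row 0 can read into the top padding, which is why the hybrids
# handle it separately from rows 1 .. out_h - 1.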
def avg_pool_5d_hybrid(a_value, kernel, stride, strategy):
    """avgpool with 5d case via hybrid"""
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    shape = get_shape(a_value)
    batch_size, c1_, in_size_h, in_size_w, c0_ = shape
    dtype = a_value.dtype
    if len(shape) != 5:
        raise ValueError("Only support 5-dim pooling!")
    if len(kernel) != 2:
        raise ValueError("Only support 2-dim kernel!")

    [pad_height_head, _, pad_width_head, _], [out_size_h, out_size_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    avg_pre = akg.tvm.const(1.0 / (kernel_w * kernel_h), dtype=dtype)
    zero = akg.tvm.const(0.0, dtype=dtype)

    @script(capture=locals())
    def avg_pool_hybrid(inputs, zero, avg_pre):
        output = output_tensor((batch_size, c1_, out_size_h, out_size_w, c0_),
                               inputs.dtype)
        for n in range(batch_size):
            for c1 in range(c1_):
                # Head
                for ow in range(out_size_w):
                    for c0 in range(c0_):
                        output[n, c1, 0, ow, c0] = zero
                for ow in range(out_size_w):
                    for kh in range(kernel_h):
                        for kw in range(kernel_w):
                            for c0 in range(c0_):
                                if (kh >= pad_height_head) \
                                        and (ow * stride_w + kw - pad_width_head >= 0) \
                                        and (ow * stride_w + kw <= in_size_w + pad_width_head - 1):
                                    output[n, c1, 0, ow, c0] = output[n, c1, 0, ow, c0] + \
                                        inputs[n, c1, kh - pad_height_head,
                                               ow * stride_w + kw - pad_width_head, c0]
                                else:
                                    output[n, c1, 0, ow, c0] += zero
                for ow in range(out_size_w):
                    for c0 in range(c0_):
                        output[n, c1, 0, ow, c0] *= avg_pre
                # Tail
                for oh in range(out_size_h - 1):
                    for ow in range(out_size_w):
                        for c0 in range(c0_):
                            output[n, c1, oh + 1, ow, c0] = zero
                for oh in range(out_size_h - 1):
                    for ow in range(out_size_w):
                        for kh in range(kernel_h):
                            for kw in range(kernel_w):
                                for c0 in range(c0_):
                                    if ((oh + 1) * stride_h + kh <= in_size_h + pad_height_head - 1) \
                                            and (ow * stride_w + kw >= pad_width_head) \
                                            and (ow * stride_w + kw <= in_size_w + pad_width_head - 1):
                                        output[n, c1, oh + 1, ow, c0] = output[n, c1, oh + 1, ow, c0] + \
                                            inputs[n, c1, (oh + 1) * stride_h + kh - pad_height_head,
                                                   ow * stride_w + kw - pad_width_head, c0]
                                    else:
                                        output[n, c1, oh + 1, ow, c0] += zero
                for oh in range(out_size_h - 1):
                    for ow in range(out_size_w):
                        for c0 in range(c0_):
                            output[n, c1, oh + 1, ow, c0] *= avg_pre
        return output

    res_value = avg_pool_hybrid(a_value, zero, avg_pre)

    # set dim
    info = dim.Dim()
    # first part
    info.setdim(index=0, axis=0, tilel1=out_size_w, tilel0=0)  # ow
    info.setdim(index=0, axis=1, tilel1=c0_, tilel0=0)  # c0
    info.setdim(index=0, axis=2, tilel1=kernel_h, tilel0=0)  # kh
    # second part
    info.setdim(index=1, axis=0, tilel1=out_size_h - 1, tilel0=0)  # oh-1
    info.setdim(index=1, axis=1, tilel1=out_size_w, tilel0=0)  # ow
    info.setdim(index=1, axis=2, tilel1=c0_, tilel0=0)  # c0
    info.setdim(index=1, axis=3, tilel1=kernel_h, tilel0=0)  # kh
    info = str(info)

    attrs = {DIM: info}
    return res_value, attrs
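
# Plain NumPy counterpart of the hybrid above for a single 2-D plane with no
# padding (illustrative; note the divisor is always kernel_h * kernel_w, so
# padded cells would count as zeros in the average):
def _example_avgpool_2d(x, kh, kw, sh, sw):
    import numpy as np
    oh = (x.shape[0] - kh) // sh + 1
    ow = (x.shape[1] - kw) // sw + 1
    out = np.empty((oh, ow), x.dtype)
    for i in range(oh):
        for j in range(ow):
            out[i, j] = x[i * sh:i * sh + kh, j * sw:j * sw + kw].sum() / (kh * kw)
    return out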
def avgpool_with_img2col(data, kernel, stride, strategy):
    """
    Performs avgpool via img2col.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID', 'SAME' or
            instance of list(four int numbers, as 'CONSTANTS' strategy).
            Supported **Strategies** are the same as for avgpool.

    Returns:
        tvm.tensor.Tensor, the average pooling result.
    """
    shape = get_shape(data)
    dtype = data.dtype
    utils.davinci_format_check(shape, "NC1HWC0", dim=5)
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT16)
    utils.check_shape(kernel, 2, "Kernel")
    utils.check_shape(stride, 2, "Stride")

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape
    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)
    pad = [ph_h, ph_t, pw_h, pw_t]
    pad_value = zero_const(dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shp_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0)
    fmap_img2col_ub = img2col(data, fmap_img2col_shp_ub, kernel_h, kernel_w,
                              pad, stride, pad_value, tag="")

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    res_sum = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.sum(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_avg")

    dividor = akg.tvm.const(kernel_h * kernel_w, dtype)
    output = akg.tvm.compute(out_shape, lambda *i: res_sum(*i) / dividor,
                             name="res_value")
    return output
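
# img2col gathers every pooling window into an explicit patch tensor so that
# pooling becomes a plain reduction over the window axes. 2-D NumPy sketch of
# the layout (illustrative; the real img2col works on the fractal NC1HWC0
# layout and handles padding):
def _example_img2col_2d(x, kh, kw, sh, sw):
    import numpy as np
    oh = (x.shape[0] - kh) // sh + 1
    ow = (x.shape[1] - kw) // sw + 1
    col = np.empty((kh, kw, oh, ow), x.dtype)
    for i in range(kh):
        for j in range(kw):
            col[i, j] = x[i:i + oh * sh:sh, j:j + ow * sw:sw]
    return col  # col.mean(axis=(0, 1)) is then the average pooling result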
def avgpool(data, kernel, stride, strategy, target=utils.CCE):
    """
    Performs average pooling on the input data.

    Note:
        Only support 5D format(NC1HWC0), and pooling will work on H and W.

    Supported **Strategies**:

    .. hlist::
        * VALID: will not pad, and drops trailing elements when pooling.
            Output shape will be `ceil((pool_shapes[i] - (kernel[i] - 1)) / stride[i])`

            > **example**:
            > params: inputs => 11, kernel width => 5, stride => 4
            > inputs: 1 2 3 4 5 6 7 8 9 10 11
            > 1st window contains: 1 2 3 4 5
            > 2nd window contains: 5 6 7 8 9
            > dropped: 10 11

        * SAME: will pad evenly with zeros on each side, adding the extra
            element to the tail if the total padding amount is odd.
            Output shape will be `ceil(pool_shapes[i] / stride[i])`

            > **example**:
            > params: inputs => 10, kernel width => 5, stride => 4
            > inputs: 1 2 3 4 5 6 7 8 9 10
            > padded: 0(pad1) | 1 2 3 4 5 6 7 8 9 10 | 0(pad2) 0(pad3)
            > 1st window contains: 0(pad1) 1 2 3 4
            > 2nd window contains: 4 5 6 7 8
            > 3rd window contains: 8 9 10 0(pad2) 0(pad3)
            > dropped: None

        * CONSTANTS: will pad with zeros according to the given constants
            (and also drops trailing elements when pooling).

            > **example**:
            > params: inputs => 10, kernel width => 5, stride => 4, pad => (2, 2)
            > inputs: 1 2 3 4 5 6 7 8 9 10
            > padded: 0(pad1) 0(pad2) | 1 2 3 4 5 6 7 8 9 10 | 0(pad3) 0(pad4)
            > 1st window contains: 0(pad1) 0(pad2) 1 2 3
            > 2nd window contains: 3 4 5 6 7
            > 3rd window contains: 7 8 9 10 0(pad3)
            > dropped: 0(pad4)

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): List or tuple of two int numbers for pooling window's size.
        stride (Union[list, tuple]): List or tuple of two int numbers for window's stride.
        strategy (Union[str, list, tuple]): A string or list or tuple for padding strategy,
            should be 'VALID', 'SAME' or instance of list(including four int numbers,
            as 'CONSTANTS' strategy).

    Returns:
        Tensor as result for average pooling.

    Supported Platforms:
        'Ascend'
    """
    dim_info, _ = avgpool_set_dim_func(data, kernel, stride, strategy)
    attrs = {DIM: dim_info}
    attrs['disable_half_to_float_sum_opt'] = True
    attrs['pragma_disable_whole_component'] = False

    shape = [x.value for x in data.shape]
    dtype = data.dtype
    utils.davinci_format_check(shape, "NC1HWC0", dim=5)
    utils.check_shape(kernel, 2, 'Kernel')
    utils.check_shape(stride, 2, 'Stride')

    if shape[2] > 60 and shape[3] > 60:
        return avg_pool_5d_hybrid(data, kernel, stride, strategy)

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    batch_size, c1, in_size_h, in_size_w, c0 = shape
    [pad_height_head, pad_height_tail, pad_width_head, pad_width_tail], \
        [out_size_h, out_size_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    pad_shape = (batch_size, c1,
                 in_size_h + pad_height_head + pad_height_tail,
                 in_size_w + pad_width_head + pad_width_tail, c0)
    pad2d = akg.tvm.compute(
        pad_shape,
        lambda n, c1, h, w, c0: akg.tvm.if_then_else(
            akg.tvm.any(h < pad_height_head,
                        h > in_size_h + pad_height_head - 1,
                        w < pad_width_head,
                        w > in_size_w + pad_width_head - 1),
            akg.tvm.const(0.0, dtype=dtype),
            data[n, c1, h - pad_height_head, w - pad_width_head, c0]),
        name="pad2d")

    axis_kernel_h = akg.tvm.reduce_axis((0, kernel_h), name="axis_kernel_h")
    axis_kernel_w = akg.tvm.reduce_axis((0, kernel_w), name="axis_kernel_w")
    out_shape = (batch_size, c1, out_size_h, out_size_w, c0)
    dividor = akg.tvm.const(kernel_h * kernel_w, dtype)
    res = akg.tvm.compute(
        out_shape,
        lambda n, c1, h, w, c0: akg.tvm.sum(
            pad2d[n, c1, h * stride_h + axis_kernel_h,
                  w * stride_w + axis_kernel_w, c0],
            axis=[axis_kernel_h, axis_kernel_w]),
        name="res")
    res_value = akg.tvm.compute(
        out_shape,
        lambda n, c1, h, w, c0: res[n, c1, h, w, c0] / dividor,
        name="res_value")
    return res_value, attrs
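
# Quick NumPy sanity check of the pad2d + sum / dividor pipeline above
# (illustrative; zeros contributed by the padding are counted in the average,
# exactly as in the compute definition):
def _example_avgpool_pad_check():
    import numpy as np
    x = np.arange(12, dtype=np.float32).reshape(3, 4)
    kh = kw = sh = sw = 2
    xp = np.pad(x, ((0, 1), (0, 0)))  # one zero row at the tail, as CONSTANTS would add
    out = np.empty((2, 2), np.float32)
    for i in range(2):
        for j in range(2):
            out[i, j] = xp[i * sh:i * sh + kh, j * sw:j * sw + kw].sum() / (kh * kw)
    return out  # [[2.5, 4.5], [4.25, 5.25]]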