def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(
    placeholder_10: T.handle,
    placeholder_11: T.handle,
    placeholder_12: T.handle,
    T_cast_4: T.handle,
) -> None:
    # Copies a flat int16 input into a zero-bordered 77x77x64 scratch buffer,
    # accumulates a 3x3x64 window in int32 for each of the 5625 x 64 outputs,
    # then adds placeholder_15, applies T.q_multiply_shift, clamps to
    # [0, 255] and casts uint8 -> int16 into T_cast_5.
    #
    # placeholder_10: handle bound to the int16 input (360000 elements).
    # placeholder_11: handle bound to the int16 weights (36864 elements).
    # placeholder_12: handle bound to the int32 per-channel addend (64).
    # T_cast_4: handle bound to the int16 output (360000 elements).
    #
    # NOTE(review): `T` is not defined in the visible text (presumably
    # tvm.script.tir, with a decorator applied elsewhere) -- confirm.

    # function attr dict
    T.func_attr(
        {
            "global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1",
            "tir.noalias": True,
        }
    )
    placeholder_13 = T.match_buffer(placeholder_10, [360000], dtype="int16")
    placeholder_14 = T.match_buffer(placeholder_11, [36864], dtype="int16")
    placeholder_15 = T.match_buffer(placeholder_12, [64], dtype="int32")
    T_cast_5 = T.match_buffer(T_cast_4, [360000], dtype="int16")

    # body
    # Padded copy: indices 1..75 on both leading axes read the input, the
    # one-element border is filled with zero. The -4864 offset equals
    # 4800 + 64, i.e. one input row plus one column of 64 channels.
    PaddedInput_1 = T.allocate([379456], "int16", "global")
    for i0_i1_fused_1, i2_1, i3_1 in T.grid(77, 77, 64):
        PaddedInput_1[i0_i1_fused_1 * 4928 + i2_1 * 64 + i3_1] = T.if_then_else(
            1 <= i0_i1_fused_1 and i0_i1_fused_1 < 76 and 1 <= i2_1 and i2_1 < 76,
            placeholder_13[i0_i1_fused_1 * 4800 + i2_1 * 64 + i3_1 - 4864],
            T.int16(0),
            dtype="int16",
        )

    for ax0_ax1_fused_ax2_fused_1 in T.serial(0, 5625):
        # Per-position accumulator, one int32 slot per output channel.
        Conv2dOutput_1 = T.allocate([64], "int32", "global")
        for ff_1 in T.serial(0, 64):
            Conv2dOutput_1[ff_1] = 0
            for ry, rx, rc_1 in T.grid(3, 3, 64):
                Conv2dOutput_1[ff_1] = Conv2dOutput_1[ff_1] + T.cast(
                    PaddedInput_1[
                        T.floordiv(ax0_ax1_fused_ax2_fused_1, 75) * 4928
                        + ry * 4928
                        + rx * 64
                        + T.floormod(ax0_ax1_fused_ax2_fused_1, 75) * 64
                        + rc_1
                    ],
                    "int32",
                ) * T.cast(
                    placeholder_14[ry * 12288 + rx * 4096 + rc_1 * 64 + ff_1],
                    "int32",
                )

        # Epilogue: add, fixed-point multiply, clamp to [0, 255], then the
        # uint8 -> int16 cast pair.
        for ax3_inner_2 in T.serial(0, 64):
            T_cast_5[ax0_ax1_fused_ax2_fused_1 * 64 + ax3_inner_2] = T.cast(
                T.cast(
                    T.max(
                        T.min(
                            T.q_multiply_shift(
                                Conv2dOutput_1[ax3_inner_2] + placeholder_15[ax3_inner_2],
                                1608879842,
                                31,
                                -7,
                                dtype="int32",
                            ),
                            255,
                        ),
                        0,
                    ),
                    "uint8",
                ),
                "int16",
            )
def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(
    placeholder_62: T.handle,
    placeholder_63: T.handle,
    placeholder_64: T.handle,
    T_cast_20: T.handle,
) -> None:
    # Copies a [1, 224, 224, 3] int16 input into a zero-bordered 229x229x3
    # scratch buffer, accumulates a 7x7x3 window in int32 for each of the
    # 12544 x 64 outputs, then adds placeholder_67, applies
    # T.q_multiply_shift, clamps to [0, 255] and stores uint8 into T_cast_21.
    #
    # placeholder_62: handle bound to the int16 input, [1, 224, 224, 3].
    # placeholder_63: handle bound to the int16 weights, [7, 7, 3, 64].
    # placeholder_64: handle bound to the int32 addend, [1, 1, 1, 64].
    # T_cast_20: handle bound to the uint8 output, [1, 112, 112, 64].
    #
    # NOTE(review): `T` is not defined in the visible text (presumably
    # tvm.script.tir, with a decorator applied elsewhere) -- confirm.

    # function attr dict
    T.func_attr(
        {
            "global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast",
            "tir.noalias": True,
        }
    )
    placeholder_65 = T.match_buffer(
        placeholder_62,
        [1, 224, 224, 3],
        dtype="int16",
        elem_offset=0,
        align=128,
        offset_factor=1,
    )
    placeholder_66 = T.match_buffer(
        placeholder_63,
        [7, 7, 3, 64],
        dtype="int16",
        elem_offset=0,
        align=128,
        offset_factor=1,
    )
    placeholder_67 = T.match_buffer(
        placeholder_64,
        [1, 1, 1, 64],
        dtype="int32",
        elem_offset=0,
        align=128,
        offset_factor=1,
    )
    T_cast_21 = T.match_buffer(
        T_cast_20,
        [1, 112, 112, 64],
        dtype="uint8",
        elem_offset=0,
        align=128,
        offset_factor=1,
    )

    # body
    # Padded copy: indices 2..225 on both leading axes read the input, the
    # rest (two leading, three trailing) are zero. The -1350 offset equals
    # 2 * 672 + 2 * 3, i.e. two input rows plus two 3-channel columns.
    PaddedInput_7 = T.allocate([157323], "int16", "global")
    for i0_i1_fused_7, i2_7, i3_7 in T.grid(229, 229, 3):
        T.store(
            PaddedInput_7,
            i0_i1_fused_7 * 687 + i2_7 * 3 + i3_7,
            T.if_then_else(
                ((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)),
                T.load(
                    "int16",
                    placeholder_65.data,
                    i0_i1_fused_7 * 672 + i2_7 * 3 + i3_7 - 1350,
                ),
                T.int16(0),
                dtype="int16",
            ),
            True,
        )

    for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544):
        # Per-position accumulator, one int32 slot per output channel.
        Conv2dOutput_7 = T.allocate([64], "int32", "global")
        for ff_3 in T.serial(0, 64):
            T.store(Conv2dOutput_7, ff_3, 0, True)
            # 1374 == 2 * 687 and 6 == 2 * 3: the window origin advances two
            # padded rows / columns per output position.
            for ry_2, rx_2, rc_7 in T.grid(7, 7, 3):
                T.store(
                    Conv2dOutput_7,
                    ff_3,
                    T.load("int32", Conv2dOutput_7, ff_3)
                    + T.cast(
                        T.load(
                            "int16",
                            PaddedInput_7,
                            T.floordiv(ax0_ax1_fused_ax2_fused_7, 112) * 1374
                            + ry_2 * 687
                            + T.floormod(ax0_ax1_fused_ax2_fused_7, 112) * 6
                            + rx_2 * 3
                            + rc_7,
                        ),
                        "int32",
                    )
                    * T.cast(
                        T.load(
                            "int16",
                            placeholder_66.data,
                            ry_2 * 1344 + rx_2 * 192 + rc_7 * 64 + ff_3,
                        ),
                        "int32",
                    ),
                    True,
                )

        # Epilogue: add, fixed-point multiply, clamp to [0, 255], cast to uint8.
        for ax3_inner_7 in T.serial(0, 64):
            T.store(
                T_cast_21.data,
                ax0_ax1_fused_ax2_fused_7 * 64 + ax3_inner_7,
                T.cast(
                    T.max(
                        T.min(
                            T.q_multiply_shift(
                                T.load("int32", Conv2dOutput_7, ax3_inner_7)
                                + T.load("int32", placeholder_67.data, ax3_inner_7),
                                1939887962,
                                31,
                                -9,
                                dtype="int32",
                            ),
                            255,
                        ),
                        0,
                    ),
                    "uint8",
                ),
                True,
            )
def primfunc_global_allocates(
    placeholder_144: T.handle,
    placeholder_145: T.handle,
    placeholder_146: T.handle,
    T_cast_48: T.handle,
) -> None:
    # Per-channel 3x3 convolution pipeline that keeps both scratch buffers
    # (PaddedInput_22, DepthwiseConv2d_9) allocated for the whole function
    # and updates DepthwiseConv2d_9 in place across the add, fixed-point
    # multiply and clip stages.
    #
    # placeholder_144: handle bound to the int16 input (100352 elements).
    # placeholder_145: handle bound to the int16 weights (4608 elements).
    # placeholder_146: handle bound to the int32 per-channel addend (512).
    # T_cast_48: handle bound to the int16 output (100352 elements).
    #
    # NOTE(review): `T` is not defined in the visible text (presumably
    # tvm.script.tir, with a decorator applied elsewhere) -- confirm.

    # function attr dict
    T.func_attr(
        {
            "global_symbol": "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_13",
            "tir.noalias": True,
        }
    )
    placeholder_147 = T.match_buffer(
        placeholder_144, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1
    )
    placeholder_148 = T.match_buffer(
        placeholder_145, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1
    )
    placeholder_149 = T.match_buffer(
        placeholder_146, [512], dtype="int32", elem_offset=0, align=128, offset_factor=1
    )
    T_cast_49 = T.match_buffer(
        T_cast_48, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1
    )

    # body
    PaddedInput_22 = T.allocate([131072], "int16", "global")
    DepthwiseConv2d_9 = T.allocate([100352], "int32", "global")

    # Padded copy: indices 1..14 on both leading axes read the input, the
    # one-element border is zero. The -7680 offset equals 7168 + 512.
    for i1_29, i2_39, i3_40 in T.grid(16, 16, 512):
        PaddedInput_22[i1_29 * 8192 + i2_39 * 512 + i3_40] = T.if_then_else(
            ((((1 <= i1_29) and (i1_29 < 15)) and (1 <= i2_39)) and (i2_39 < 15)),
            placeholder_147[i1_29 * 7168 + i2_39 * 512 + i3_40 - 7680],
            T.int16(0),
            dtype="int16",
        )

    # 3x3 multiply-accumulate; input and weight share the same channel index.
    for i_9, j_9, c_9 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[i_9 * 7168 + j_9 * 512 + c_9] = 0
        for di_9, dj_9 in T.grid(3, 3):
            DepthwiseConv2d_9[i_9 * 7168 + j_9 * 512 + c_9] = DepthwiseConv2d_9[
                i_9 * 7168 + j_9 * 512 + c_9
            ] + PaddedInput_22[
                i_9 * 8192 + di_9 * 8192 + j_9 * 512 + dj_9 * 512 + c_9
            ].astype("int32") * placeholder_148[
                di_9 * 1536 + dj_9 * 512 + c_9
            ].astype("int32")

    # In-place per-channel add.
    for ax1_27, ax2_28, ax3_30 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[ax1_27 * 7168 + ax2_28 * 512 + ax3_30] = (
            DepthwiseConv2d_9[ax1_27 * 7168 + ax2_28 * 512 + ax3_30] + placeholder_149[ax3_30]
        )

    # In-place fixed-point multiply.
    for i1_30, i2_40, i3_41 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[i1_30 * 7168 + i2_40 * 512 + i3_41] = T.q_multiply_shift(
            DepthwiseConv2d_9[i1_30 * 7168 + i2_40 * 512 + i3_41],
            1269068532,
            31,
            -4,
            dtype="int32",
        )

    # In-place clip to [0, 255]: T.min caps the value at 255 before T.max
    # floors it at 0. (Nesting two T.max calls would instead force every
    # element to at least 255, which is not a clip.)
    for i1_31, i2_41, i3_42 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[i1_31 * 7168 + i2_41 * 512 + i3_42] = T.max(
            T.min(DepthwiseConv2d_9[i1_31 * 7168 + i2_41 * 512 + i3_42], 255),
            0,
        )

    # NOTE(review): PaddedInput_22 was allocated as int16 but receives a
    # uint8-cast value here; the scratch buffer is reused rather than a new
    # one being allocated -- confirm the dtype mismatch is intended.
    for ax1_28, ax2_29, ax3_31 in T.grid(14, 14, 512):
        PaddedInput_22[ax1_28 * 7168 + ax2_29 * 512 + ax3_31] = DepthwiseConv2d_9[
            ax1_28 * 7168 + ax2_29 * 512 + ax3_31
        ].astype("uint8")

    # Final cast of the reused scratch buffer into the int16 output.
    for ax1_29, ax2_30, ax3_32 in T.grid(14, 14, 512):
        T_cast_49[ax1_29 * 7168 + ax2_30 * 512 + ax3_32] = PaddedInput_22[
            ax1_29 * 7168 + ax2_30 * 512 + ax3_32
        ].astype("int16")
def primfunc_local_allocates(
    placeholder_162: T.handle,
    placeholder_163: T.handle,
    placeholder_164: T.handle,
    T_cast_76: T.handle,
) -> None:
    # Per-channel 3x3 convolution pipeline in which every stage writes to
    # its own scratch buffer; intermediate buffers are bound with
    # `with T.allocate(...)` so each one is scoped to the loops that use it.
    #
    # placeholder_162: handle bound to the int16 input (100352 elements).
    # placeholder_163: handle bound to the int16 weights (4608 elements).
    # placeholder_164: handle bound to the int32 per-channel addend (512).
    # T_cast_76: handle bound to the int16 output (100352 elements).
    #
    # NOTE(review): `T` is not defined in the visible text (presumably
    # tvm.script.tir, with a decorator applied elsewhere) -- confirm.
    # NOTE(review): the text this was laid out from carried no indentation;
    # each `with` scope below covers exactly the loops that read or write
    # the buffer it binds -- confirm this matches the intended nesting.

    # function attr dict
    T.func_attr(
        {
            "global_symbol": "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_9",
            "tir.noalias": True,
        }
    )
    placeholder_165 = T.match_buffer(
        placeholder_162, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1
    )
    placeholder_166 = T.match_buffer(
        placeholder_163, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1
    )
    placeholder_167 = T.match_buffer(
        placeholder_164, [512], dtype="int32", elem_offset=0, align=128, offset_factor=1
    )
    T_cast_77 = T.match_buffer(
        T_cast_76, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1
    )

    # body
    # Padded copy: indices 1..14 on both leading axes read the input, the
    # one-element border is zero. The -7680 offset equals 7168 + 512.
    PaddedInput_25 = T.allocate([131072], "int16", "global")
    for i1_35, i2_46, i3_47 in T.grid(16, 16, 512):
        PaddedInput_25[i1_35 * 8192 + i2_46 * 512 + i3_47] = T.if_then_else(
            ((((1 <= i1_35) and (i1_35 < 15)) and (1 <= i2_46)) and (i2_46 < 15)),
            placeholder_165[i1_35 * 7168 + i2_46 * 512 + i3_47 - 7680],
            T.int16(0),
            dtype="int16",
        )

    T_add_11 = T.allocate([100352], "int32", "global")
    with T.allocate([100352], "int32", "global") as DepthwiseConv2d_11:
        # 3x3 multiply-accumulate; input and weight share the channel index.
        for i_11, j_11, c_11 in T.grid(14, 14, 512):
            DepthwiseConv2d_11[i_11 * 7168 + j_11 * 512 + c_11] = 0
            for di_11, dj_11 in T.grid(3, 3):
                DepthwiseConv2d_11[i_11 * 7168 + j_11 * 512 + c_11] = DepthwiseConv2d_11[
                    i_11 * 7168 + j_11 * 512 + c_11
                ] + PaddedInput_25[
                    i_11 * 8192 + di_11 * 8192 + j_11 * 512 + dj_11 * 512 + c_11
                ].astype("int32") * placeholder_166[
                    di_11 * 1536 + dj_11 * 512 + c_11
                ].astype("int32")
        # Per-channel add into T_add_11.
        for ax1_44, ax2_45, ax3_47 in T.grid(14, 14, 512):
            T_add_11[ax1_44 * 7168 + ax2_45 * 512 + ax3_47] = (
                DepthwiseConv2d_11[ax1_44 * 7168 + ax2_45 * 512 + ax3_47]
                + placeholder_167[ax3_47]
            )

    compute_22 = T.allocate([100352], "int32", "global")
    with T.allocate([100352], "int32", "global") as T_cast_78:
        # Element-wise copy of T_add_11 (both buffers are int32).
        for ax1_45, ax2_46, ax3_48 in T.grid(14, 14, 512):
            T_cast_78[ax1_45 * 7168 + ax2_46 * 512 + ax3_48] = T_add_11[
                ax1_45 * 7168 + ax2_46 * 512 + ax3_48
            ]
        # Fixed-point multiply into compute_22.
        for i1_36, i2_47, i3_48 in T.grid(14, 14, 512):
            compute_22[i1_36 * 7168 + i2_47 * 512 + i3_48] = T.q_multiply_shift(
                T_cast_78[i1_36 * 7168 + i2_47 * 512 + i3_48],
                1948805937,
                31,
                -5,
                dtype="int32",
            )

    T_cast_79 = T.allocate([100352], "uint8", "global")
    with T.allocate([100352], "int32", "global") as compute_23:
        # Clip to [0, 255]: T.min caps the value at 255 before T.max floors
        # it at 0. (Nesting two T.max calls would instead force every
        # element to at least 255, which is not a clip.)
        for i1_37, i2_48, i3_49 in T.grid(14, 14, 512):
            compute_23[i1_37 * 7168 + i2_48 * 512 + i3_49] = T.max(
                T.min(compute_22[i1_37 * 7168 + i2_48 * 512 + i3_49], 255),
                0,
            )
        for ax1_46, ax2_47, ax3_49 in T.grid(14, 14, 512):
            T_cast_79[ax1_46 * 7168 + ax2_47 * 512 + ax3_49] = compute_23[
                ax1_46 * 7168 + ax2_47 * 512 + ax3_49
            ].astype("uint8")

    # Final cast of the uint8 stage into the int16 output.
    for ax1_47, ax2_48, ax3_50 in T.grid(14, 14, 512):
        T_cast_77[ax1_47 * 7168 + ax2_48 * 512 + ax3_50] = T_cast_79[
            ax1_47 * 7168 + ax2_48 * 512 + ax3_50
        ].astype("int16")