示例#1
0
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(
         placeholder_10: T.handle,
         placeholder_11: T.handle,
         placeholder_12: T.handle,
         T_cast_4: T.handle) -> None:
     # Fused TIR kernel over flat buffers: zero-pad the int16 input,
     # accumulate a 3x3 window over 64 input channels in int32, add a
     # per-channel int32 term, rescale with a fixed-point multiply, clamp to
     # [0, 255] and store the result as int16 (passing through uint8).
     #
     # placeholder_10: handle bound to the 360000-element int16 input.
     # placeholder_11: handle bound to the 36864-element int16 weights.
     # placeholder_12: handle bound to the 64-element int32 addend.
     # T_cast_4:       handle bound to the 360000-element int16 output.
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1",
         "tir.noalias": True,
     })
     placeholder_13 = T.match_buffer(placeholder_10, [360000], dtype="int16")
     placeholder_14 = T.match_buffer(placeholder_11, [36864], dtype="int16")
     placeholder_15 = T.match_buffer(placeholder_12, [64], dtype="int32")
     T_cast_5 = T.match_buffer(T_cast_4, [360000], dtype="int16")

     # Stage 1: build the padded copy (77 * 77 * 64 elements). Positions on
     # the one-element border are zero; interior positions read the input,
     # whose rows are 4800 elements long. The -4864 offset (4800 + 64) shifts
     # padded (row, col) to input (row - 1, col - 1).
     PaddedInput_1 = T.allocate([379456], "int16", "global")
     for pad_row in T.serial(0, 77):
         for pad_col in T.serial(0, 77):
             for pad_chan in T.serial(0, 64):
                 PaddedInput_1[pad_row * 4928 + pad_col * 64 + pad_chan] = T.if_then_else(
                     1 <= pad_row and pad_row < 76 and 1 <= pad_col and pad_col < 76,
                     placeholder_13[pad_row * 4800 + pad_col * 64 + pad_chan - 4864],
                     T.int16(0),
                     dtype="int16")

     # Stage 2: one iteration per output position (5625 = 75 * 75); floordiv
     # and floormod by 75 recover the row and column of that position.
     for out_pixel in T.serial(0, 5625):
         # Per-position int32 accumulator, one slot per output channel.
         Conv2dOutput_1 = T.allocate([64], "int32", "global")
         for out_chan in T.serial(0, 64):
             Conv2dOutput_1[out_chan] = 0
             for k_row, k_col, in_chan in T.grid(3, 3, 64):
                 Conv2dOutput_1[out_chan] = Conv2dOutput_1[out_chan] + T.cast(
                     PaddedInput_1[T.floordiv(out_pixel, 75) * 4928 +
                                   k_row * 4928 + k_col * 64 +
                                   T.floormod(out_pixel, 75) * 64 + in_chan],
                     "int32") * T.cast(
                         placeholder_14[k_row * 12288 + k_col * 4096 +
                                        in_chan * 64 + out_chan],
                         "int32")
         # Stage 3: add the per-channel term, rescale, clamp to [0, 255],
         # then narrow to uint8 and widen to the int16 output dtype.
         for chan in T.serial(0, 64):
             T_cast_5[out_pixel * 64 + chan] = T.cast(
                 T.cast(
                     T.max(
                         T.min(
                             T.q_multiply_shift(
                                 Conv2dOutput_1[chan] + placeholder_15[chan],
                                 1608879842,
                                 31,
                                 -7,
                                 dtype="int32"),
                             255),
                         0),
                     "uint8"),
                 "int16")
示例#2
0
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(
         placeholder_62: T.handle,
         placeholder_63: T.handle,
         placeholder_64: T.handle,
         T_cast_20: T.handle) -> None:
     # Fused TIR kernel: zero-pad a [1, 224, 224, 3] int16 input, apply a
     # 7x7 window over 3 input channels producing 64 output channels on a
     # 112 x 112 grid, add a per-channel int32 term, rescale with a
     # fixed-point multiply, clamp to [0, 255] and store as uint8.
     #
     # placeholder_62: handle bound to the [1, 224, 224, 3] int16 input.
     # placeholder_63: handle bound to the [7, 7, 3, 64] int16 weights.
     # placeholder_64: handle bound to the [1, 1, 1, 64] int32 addend.
     # T_cast_20:      handle bound to the [1, 112, 112, 64] uint8 output.
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast",
         "tir.noalias": True,
     })
     placeholder_65 = T.match_buffer(
         placeholder_62, [1, 224, 224, 3],
         dtype="int16", elem_offset=0, align=128, offset_factor=1)
     placeholder_66 = T.match_buffer(
         placeholder_63, [7, 7, 3, 64],
         dtype="int16", elem_offset=0, align=128, offset_factor=1)
     placeholder_67 = T.match_buffer(
         placeholder_64, [1, 1, 1, 64],
         dtype="int32", elem_offset=0, align=128, offset_factor=1)
     T_cast_21 = T.match_buffer(
         T_cast_20, [1, 112, 112, 64],
         dtype="uint8", elem_offset=0, align=128, offset_factor=1)

     # Stage 1: padded copy of the input (229 * 229 * 3 elements, rows of
     # 687). Rows/columns 2..225 read the input (rows of 672 elements); the
     # -1350 offset (2 * 672 + 2 * 3) maps padded (row, col) to input
     # (row - 2, col - 2). Everything else is zero.
     PaddedInput_7 = T.allocate([157323], "int16", "global")
     for pad_row, pad_col, pad_chan in T.grid(229, 229, 3):
         T.store(
             PaddedInput_7,
             pad_row * 687 + pad_col * 3 + pad_chan,
             T.if_then_else(
                 ((((2 <= pad_row) and (pad_row < 226)) and
                   (2 <= pad_col)) and (pad_col < 226)),
                 T.load("int16", placeholder_65.data,
                        pad_row * 672 + pad_col * 3 + pad_chan - 1350),
                 T.int16(0),
                 dtype="int16"),
             True)

     # Stage 2: one iteration per output position (12544 = 112 * 112).
     # floordiv/floormod by 112 give the output row/column; the 1374
     # (2 * 687) and 6 (2 * 3) factors step two padded rows/columns per
     # output row/column.
     for out_pixel in T.serial(0, 12544):
         # Per-position int32 accumulator, one slot per output channel.
         Conv2dOutput_7 = T.allocate([64], "int32", "global")
         for out_chan in T.serial(0, 64):
             T.store(Conv2dOutput_7, out_chan, 0, True)
             for k_row, k_col, in_chan in T.grid(7, 7, 3):
                 T.store(
                     Conv2dOutput_7,
                     out_chan,
                     T.load("int32", Conv2dOutput_7, out_chan) + T.cast(
                         T.load("int16", PaddedInput_7,
                                T.floordiv(out_pixel, 112) * 1374 +
                                k_row * 687 +
                                T.floormod(out_pixel, 112) * 6 +
                                k_col * 3 + in_chan),
                         "int32") * T.cast(
                             T.load("int16", placeholder_66.data,
                                    k_row * 1344 + k_col * 192 +
                                    in_chan * 64 + out_chan),
                             "int32"),
                     True)
         # Stage 3: add the per-channel term, rescale, clamp to [0, 255]
         # and narrow to the uint8 output dtype.
         for chan in T.serial(0, 64):
             T.store(
                 T_cast_21.data,
                 out_pixel * 64 + chan,
                 T.cast(
                     T.max(
                         T.min(
                             T.q_multiply_shift(
                                 T.load("int32", Conv2dOutput_7, chan) +
                                 T.load("int32", placeholder_67.data, chan),
                                 1939887962,
                                 31,
                                 -9,
                                 dtype="int32"),
                             255),
                         0),
                     "uint8"),
                 True)
示例#3
0
def primfunc_global_allocates(placeholder_144: T.handle,
                              placeholder_145: T.handle,
                              placeholder_146: T.handle,
                              T_cast_48: T.handle) -> None:
    # TIR kernel whose two scratch buffers are both allocated up front and
    # reused across every stage: zero-pad the int16 input, run a 3x3
    # per-channel (depthwise) accumulation in int32, add a per-channel term,
    # rescale with a fixed-point multiply, clamp to [0, 255], then cast
    # through uint8 to the int16 output.
    #
    # placeholder_144: handle bound to the 100352-element int16 input.
    # placeholder_145: handle bound to the 4608-element int16 weights.
    # placeholder_146: handle bound to the 512-element int32 addend.
    # T_cast_48:       handle bound to the 100352-element int16 output.
    # function attr dict
    T.func_attr({
        "global_symbol":
        "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_13",
        "tir.noalias": True
    })
    placeholder_147 = T.match_buffer(placeholder_144, [100352],
                                     dtype="int16",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
    placeholder_148 = T.match_buffer(placeholder_145, [4608],
                                     dtype="int16",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
    placeholder_149 = T.match_buffer(placeholder_146, [512],
                                     dtype="int32",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
    T_cast_49 = T.match_buffer(T_cast_48, [100352],
                               dtype="int16",
                               elem_offset=0,
                               align=128,
                               offset_factor=1)
    # body
    # Both scratch buffers are allocated here, before any stage runs.
    PaddedInput_22 = T.allocate([131072], "int16", "global")
    DepthwiseConv2d_9 = T.allocate([100352], "int32", "global")
    # Padding: 16 x 16 x 512 padded copy (rows of 8192 elements). The
    # one-element border is zero; the interior reads the input (rows of
    # 7168), and -7680 (7168 + 512) maps padded (row, col) to input
    # (row - 1, col - 1).
    for i1_29, i2_39, i3_40 in T.grid(16, 16, 512):
        PaddedInput_22[(((i1_29 * 8192) + (i2_39 * 512)) +
                        i3_40)] = T.if_then_else(
                            ((((1 <= i1_29) and (i1_29 < 15)) and
                              (1 <= i2_39)) and (i2_39 < 15)),
                            placeholder_147[((((i1_29 * 7168) +
                                               (i2_39 * 512)) + i3_40) -
                                             7680)],
                            T.int16(0),
                            dtype="int16")
    # 3x3 accumulation: each channel c_9 only reads its own channel of the
    # padded input and of the weights.
    for i_9, j_9, c_9 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[(((i_9 * 7168) + (j_9 * 512)) + c_9)] = 0
        for di_9, dj_9 in T.grid(3, 3):
            DepthwiseConv2d_9[(((i_9 * 7168) + (j_9 * 512)) + c_9)] = (
                DepthwiseConv2d_9[(((i_9 * 7168) + (j_9 * 512)) + c_9)] +
                (PaddedInput_22[(((((i_9 * 8192) + (di_9 * 8192)) +
                                   (j_9 * 512)) +
                                  (dj_9 * 512)) + c_9)].astype("int32") *
                 placeholder_148[(((di_9 * 1536) +
                                   (dj_9 * 512)) + c_9)].astype("int32")))
    # Add the per-channel int32 term in place.
    for ax1_27, ax2_28, ax3_30 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[(((ax1_27 * 7168) + (ax2_28 * 512)) + ax3_30)] = (
            DepthwiseConv2d_9[(((ax1_27 * 7168) + (ax2_28 * 512)) + ax3_30)] +
            placeholder_149[ax3_30])
    # Fixed-point rescale in place.
    for i1_30, i2_40, i3_41 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[(((i1_30 * 7168) + (i2_40 * 512)) +
                           i3_41)] = T.q_multiply_shift(
                               DepthwiseConv2d_9[(((i1_30 * 7168) +
                                                   (i2_40 * 512)) + i3_41)],
                               1269068532,
                               31,
                               -4,
                               dtype="int32")
    # Clip in place to [0, 255]: T.min caps the value at 255 first, then
    # T.max raises negatives to 0, so the uint8 cast below cannot wrap.
    for i1_31, i2_41, i3_42 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[(((i1_31 * 7168) + (i2_41 * 512)) + i3_42)] = T.max(
            T.min(
                DepthwiseConv2d_9[(((i1_31 * 7168) + (i2_41 * 512)) + i3_42)],
                255), 0)
    # Narrow to uint8. The padded-input buffer is no longer needed as input
    # and is reused here as the destination (indexed with the 7168 stride).
    for ax1_28, ax2_29, ax3_31 in T.grid(14, 14, 512):
        PaddedInput_22[(((ax1_28 * 7168) + (ax2_29 * 512)) +
                        ax3_31)] = DepthwiseConv2d_9[(((ax1_28 * 7168) +
                                                       (ax2_29 * 512)) +
                                                      ax3_31)].astype("uint8")
    # Final cast to the int16 output buffer.
    for ax1_29, ax2_30, ax3_32 in T.grid(14, 14, 512):
        T_cast_49[(((ax1_29 * 7168) + (ax2_30 * 512)) +
                   ax3_32)] = PaddedInput_22[(((ax1_29 * 7168) +
                                               (ax2_30 * 512)) +
                                              ax3_32)].astype("int16")
示例#4
0
def primfunc_local_allocates(placeholder_162: T.handle,
                             placeholder_163: T.handle,
                             placeholder_164: T.handle,
                             T_cast_76: T.handle) -> None:
    # TIR kernel that allocates a separate scratch buffer for each
    # intermediate stage, some of them scoped with `with T.allocate(...)`:
    # zero-pad the int16 input, run a 3x3 per-channel (depthwise)
    # accumulation in int32, add a per-channel term, rescale with a
    # fixed-point multiply, clamp to [0, 255], then cast through uint8 to
    # the int16 output.
    #
    # placeholder_162: handle bound to the 100352-element int16 input.
    # placeholder_163: handle bound to the 4608-element int16 weights.
    # placeholder_164: handle bound to the 512-element int32 addend.
    # T_cast_76:       handle bound to the 100352-element int16 output.
    # function attr dict
    T.func_attr({
        "global_symbol":
        "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_9",
        "tir.noalias": True
    })
    placeholder_165 = T.match_buffer(placeholder_162, [100352],
                                     dtype="int16",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
    placeholder_166 = T.match_buffer(placeholder_163, [4608],
                                     dtype="int16",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
    placeholder_167 = T.match_buffer(placeholder_164, [512],
                                     dtype="int32",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
    T_cast_77 = T.match_buffer(T_cast_76, [100352],
                               dtype="int16",
                               elem_offset=0,
                               align=128,
                               offset_factor=1)
    # body
    # Padding: 16 x 16 x 512 padded copy (rows of 8192 elements). The
    # one-element border is zero; the interior reads the input (rows of
    # 7168), and -7680 (7168 + 512) maps padded (row, col) to input
    # (row - 1, col - 1).
    PaddedInput_25 = T.allocate([131072], "int16", "global")
    for i1_35, i2_46, i3_47 in T.grid(16, 16, 512):
        PaddedInput_25[(((i1_35 * 8192) + (i2_46 * 512)) +
                        i3_47)] = T.if_then_else(
                            ((((1 <= i1_35) and (i1_35 < 15)) and
                              (1 <= i2_46)) and (i2_46 < 15)),
                            placeholder_165[((((i1_35 * 7168) +
                                               (i2_46 * 512)) + i3_47) -
                                             7680)],
                            T.int16(0),
                            dtype="int16")
    # T_add_11 receives the accumulation plus the per-channel term; the raw
    # accumulator DepthwiseConv2d_11 is only referenced inside the `with`.
    T_add_11 = T.allocate([100352], "int32", "global")
    with T.allocate([100352], "int32", "global") as DepthwiseConv2d_11:
        # 3x3 accumulation: each channel c_11 only reads its own channel of
        # the padded input and of the weights.
        for i_11, j_11, c_11 in T.grid(14, 14, 512):
            DepthwiseConv2d_11[(((i_11 * 7168) + (j_11 * 512)) + c_11)] = 0
            for di_11, dj_11 in T.grid(3, 3):
                DepthwiseConv2d_11[(((i_11 * 7168) + (j_11 * 512)) + c_11)] = (
                    DepthwiseConv2d_11[(((i_11 * 7168) +
                                         (j_11 * 512)) + c_11)] +
                    (PaddedInput_25[(((((i_11 * 8192) + (di_11 * 8192)) +
                                       (j_11 * 512)) +
                                      (dj_11 * 512)) + c_11)].astype("int32") *
                     placeholder_166[(
                         ((di_11 * 1536) +
                          (dj_11 * 512)) + c_11)].astype("int32")))
        # Add the per-channel int32 term.
        for ax1_44, ax2_45, ax3_47 in T.grid(14, 14, 512):
            T_add_11[(((ax1_44 * 7168) + (ax2_45 * 512)) +
                      ax3_47)] = (DepthwiseConv2d_11[((
                          (ax1_44 * 7168) + (ax2_45 * 512)) + ax3_47)] +
                                  placeholder_167[ax3_47])
    # compute_22 receives the rescaled values; T_cast_78 is an element-wise
    # copy of T_add_11 that is only referenced inside the `with`.
    compute_22 = T.allocate([100352], "int32", "global")
    with T.allocate([100352], "int32", "global") as T_cast_78:
        for ax1_45, ax2_46, ax3_48 in T.grid(14, 14, 512):
            T_cast_78[(((ax1_45 * 7168) + (ax2_46 * 512)) +
                       ax3_48)] = T_add_11[(((ax1_45 * 7168) +
                                             (ax2_46 * 512)) + ax3_48)]
        # Fixed-point rescale.
        for i1_36, i2_47, i3_48 in T.grid(14, 14, 512):
            compute_22[(((i1_36 * 7168) + (i2_47 * 512)) +
                        i3_48)] = T.q_multiply_shift(
                            T_cast_78[(((i1_36 * 7168) + (i2_47 * 512)) +
                                       i3_48)],
                            1948805937,
                            31,
                            -5,
                            dtype="int32")
    # T_cast_79 receives the uint8 values; the clipped int32 values in
    # compute_23 are only referenced inside the `with`.
    T_cast_79 = T.allocate([100352], "uint8", "global")
    with T.allocate([100352], "int32", "global") as compute_23:
        # Clip to [0, 255]: T.min caps the value at 255 first, then T.max
        # raises negatives to 0, so the uint8 cast below cannot wrap.
        for i1_37, i2_48, i3_49 in T.grid(14, 14, 512):
            compute_23[(((i1_37 * 7168) + (i2_48 * 512)) + i3_49)] = T.max(
                T.min(compute_22[(((i1_37 * 7168) + (i2_48 * 512)) + i3_49)],
                      255), 0)
        for ax1_46, ax2_47, ax3_49 in T.grid(14, 14, 512):
            T_cast_79[(((ax1_46 * 7168) + (ax2_47 * 512)) +
                       ax3_49)] = compute_23[(((ax1_46 * 7168) +
                                               (ax2_47 * 512)) +
                                              ax3_49)].astype("uint8")
    # Final cast to the int16 output buffer.
    for ax1_47, ax2_48, ax3_50 in T.grid(14, 14, 512):
        T_cast_77[(((ax1_47 * 7168) + (ax2_48 * 512)) +
                   ax3_50)] = T_cast_79[(((ax1_47 * 7168) + (ax2_48 * 512)) +
                                         ax3_50)].astype("int16")