Пример #1
0
def globalavgpool(n, c, h, w, pool_type, attrs, kernel_name="global_pool"):
    r"""
    Build a global pooling kernel for a float16 input of shape (n, c, h, w).

    For average pooling each feature map is reduced as
    ``res = (1 / (W * H)) * sum_{i,j} X[i, j]``.

    Note:
        The input tensor is created here with akg.tvm.placeholder.

    Args:
        n (int): input batch size.
        c (int): input channel count.
        h (int): input height.
        w (int): input width.
        pool_type (str): pooling mode, forwarded to akg.topi.nn.global_pool.
        attrs: attributes forwarded unchanged to akg.build.
        kernel_name (str): name given to the built kernel.

    Returns:
        The module produced by akg.build.
    """
    in_shape = (n, c, h, w)
    # The placeholder keeps the tensor name 'input'; only the Python local is
    # renamed so it no longer shadows the builtin.
    data = akg.tvm.placeholder(in_shape, name='input', dtype="float16")
    pooled = akg.topi.nn.global_pool(data, pool_type=pool_type)
    sch = akg.tvm.create_schedule(pooled.op)

    build_kwargs = dict(name=kernel_name, attrs=attrs, polyhedral=True)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [data, pooled], "cce", **build_kwargs)
    return mod
Пример #2
0
def Gather(params_shape,
           indices_shape,
           params_dtype,
           indices_dtype,
           axis,
           kernel_name,
           cce_path="./",
           target=utils.CCE):
    """
    Build a gather kernel and write its generated source via create_code.

    The output has shape ``(indices_shape[0], params_shape[1])``. The kernel
    body is produced by ``kernel_ir`` (defined elsewhere).

    Args:
        params_shape: shape of the params tensor (checked with length=2).
        indices_shape: shape of the indices tensor (checked with length=1).
        params_dtype (str): dtype of params and of the result.
        indices_dtype (str): dtype of indices (checked against INT32).
        axis (int): must equal 0.
        kernel_name (str): name of the built kernel and of the dumped source.
        cce_path (str): directory passed to create_code.
        target: not used by this function.

    Returns:
        The module produced by akg.build.
    """
    # Input validation, in the same order as the checks are documented above.
    utils.check_shape(params_shape, length=2)
    utils.check_shape(indices_shape, length=1)
    utils.ops_dtype_check(params_dtype, utils.DtypeForDavinci.ALL_TYPES)
    utils.ops_dtype_check(indices_dtype, utils.DtypeForDavinci.INT32)
    utils.check_equal("axis", "zero", axis, 0)

    # Compute definition: an extern op whose IR comes from kernel_ir.
    out_shape = (indices_shape[0], params_shape[1])
    params = akg.tvm.placeholder(params_shape, dtype=params_dtype, name="X")
    indices = akg.tvm.placeholder(indices_shape, dtype=indices_dtype, name="Y")

    def _emit(ins, outs):
        return kernel_ir(outs[0], ins[0], ins[1])

    gathered = akg.tvm.extern(out_shape, [params, indices], _emit,
                              name="res", dtype=params_dtype)
    sch = akg.tvm.create_schedule(gathered.op)

    # Build with multicore disabled.
    build_attrs = {"enable_multicore": False}
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [params, indices, gathered], "cce",
                        name=kernel_name, attrs=build_attrs)

    # Dump the generated source next to the kernel.
    create_code(kernel_name, cce_path, mod.imported_modules[0].get_source())
    return mod
Пример #3
0
def matmul_ad(data_shape, weight_shape, dtype, attrs=None):
    """
    Build the autodiff kernel of a 2-D matmul with respect to its first input.

    Args:
        data_shape: shape (m, k) of the first operand.
        weight_shape: shape (k, n) of the second operand.
        dtype (str): element type; only "float16" (case-insensitive) passes.
        attrs: attributes forwarded unchanged to akg.build.

    Returns:
        The module produced by akg.build.

    Raises:
        RuntimeError: if dtype is not supported.
        AssertionError: if either shape is not 2-D or the inner dims differ.
    """
    supported = ["float16"]
    if dtype.lower() not in supported:
        raise RuntimeError("matmul test only support %s while dtype is %s" %
                           (",".join(supported), dtype))
    assert len(data_shape) == 2
    assert len(weight_shape) == 2
    assert data_shape[1] == weight_shape[0]

    rows, depth = data_shape
    cols = weight_shape[1]

    # Forward computation: c[i, j] = sum_kk a[i, kk] * b[kk, j]
    a = akg.tvm.placeholder((rows, depth), name='a', dtype=dtype)
    b = akg.tvm.placeholder((depth, cols), name='b', dtype=dtype)
    kk = akg.tvm.reduce_axis((0, depth), name='kk')
    c = akg.tvm.compute(
        (rows, cols),
        lambda i, j: akg.lang.ascend.mmad(a[i, kk] * b[kk, j], axis=kk),
        name="c")

    # Differentiate c w.r.t. a, seeded by the incoming gradient `head`.
    head = akg.tvm.placeholder(c.shape, name="Head", dtype='float16')
    grad_a = list(akg.differentiate(c, [a], head))[0]
    sch = akg.tvm.create_schedule([grad_a.op])
    build_args = [head, b, grad_a]

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, build_args, "cce", name="test2",
                        attrs=attrs, polyhedral=True)
    return mod
Пример #4
0
def topk(shape, k, dtype, kernel_name, attrs, target="cce"):
    """
    Build the top-k test kernel.

    Args:
        shape: requested input shape; only used for validating ``k``.
        k (int): number of elements requested; must not exceed ``shape[-1]``.
        dtype (str): requested dtype; must be "float16" or "int32"
            (case-insensitive).
        kernel_name (str): name given to the built kernel.
        attrs: attributes forwarded unchanged to akg.build.
        target (str): not used by this function.

    Returns:
        The module produced by akg.build.

    Raises:
        RuntimeError: if dtype is unsupported or k is larger than shape[-1].
    """
    supported = ["float16", "int32"]
    if dtype.lower() not in supported:
        # The message previously named an unrelated op ("tile_cce").
        raise RuntimeError("topk only support %s while dtype is %s" %
                           (",".join(supported), dtype))
    if k > shape[-1]:
        raise RuntimeError("k should not be greater than shape[-1]")

    # NOTE(review): the caller's shape/dtype are replaced by fixed 16x16
    # float16 tensors below, and k is not passed on; presumably the helper
    # kernels only support this configuration - confirm before generalizing.
    shape = (16, 16)
    out_shape = (16, 16)
    temp_shape = (16, 16 * 18)
    inputs = akg.tvm.placeholder(shape, name="input", dtype="float16")
    output = akg.tvm.placeholder(out_shape, name="output", dtype="float16")
    temp = akg.tvm.placeholder(temp_shape, name="temp", dtype="float16")

    values = compute_topk(output, inputs, temp)
    values1 = compute_get_last(values, temp)

    s = akg.tvm.create_schedule([values1.op])
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [inputs, values1],
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)
    return mod
Пример #5
0
def case_1(data_shape, dtype, kernel_name, attrs):
    """
    Element-wise chain, case 1: build ``E[i, j] = A[i, j] * (B[j] + C[i, j])``.

    Args:
        data_shape: 2-D shape (m, k) shared by A, C and E; B has shape (k,).
        dtype (str): element type, checked against DtypeForDavinci.FLOAT16.
        kernel_name (str): base name handed to gen_name_kernel.
        attrs: attributes forwarded unchanged to akg.build.

    Returns:
        The module produced by akg.build.
    """
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT16)
    utils.check_shape_length_equal("data", data_shape, 2)

    rows, cols = data_shape
    full_shape = (rows, cols)

    lhs = akg.tvm.placeholder(full_shape, name='A', dtype=dtype)
    row_vec = akg.tvm.placeholder((cols, ), name='B', dtype=dtype)
    addend = akg.tvm.placeholder(full_shape, name='C', dtype=dtype)

    def _chain(i, j):
        return lhs[i, j] * (row_vec[j] + addend[i, j])

    result = akg.tvm.compute(full_shape, _chain, name="E")

    sch = akg.tvm.create_schedule(result.op)
    tensors = [lhs, row_vec, addend, result]
    # Lowering is run once up front; its result is not used further here.
    akg.lower(sch, tensors, simple_mode=True, polyhedral=True)

    # NOTE(review): the generated name is not passed to akg.build below,
    # which uses the fixed name "test" - confirm whether that is intended.
    kernel_name = gen_name_kernel(kernel_name, dtype, data_shape)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, tensors, "cce", name="test",
                        attrs=attrs, polyhedral=True)
    return mod
Пример #6
0
def invert_permutation_run(shape, dtype, attrs):
    """
    Build, launch and verify the invert_permutation kernel.

    A random permutation of ``range(shape[0])`` is generated as input and the
    expected inverse permutation is computed on the host.

    Args:
        shape: input shape; ``shape[0]`` is the permutation length.
        dtype (str): must be "int32" (case-insensitive).
        attrs: attributes forwarded unchanged to akg.build.

    Returns:
        tuple: ``((input_data,), output, expect, compare_result)``.

    Raises:
        RuntimeError: if dtype is not int32.
    """
    # check shapes
    vc_util.check_shape(shape)

    # Compare for equality: the previous `in "int32"` was a substring test
    # and accepted values such as "int", "3" or "".
    if dtype.lower() != "int32":
        raise RuntimeError(
            "indices_dtype only support int32 while dtype is %s" % dtype)

    A = akg.tvm.placeholder(shape, dtype, name="A")
    op = invert_permutation.invert_permutation(A)
    s = akg.tvm.create_schedule(op.op)

    kernel_name = utils.gen_name_kernel("invert_permutation", dtype, shape)
    with akg.build_config(add_lower_pass=utils.debug_mode(0),
                          dump_pass_ir=True):
        mod = akg.build(s, [A, op],
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)

    input_data = np.random.permutation(np.arange(shape[0])).astype(np.int32)
    # Host reference: expect[p[i]] = i for every position i.
    expect = np.full([shape[0]], 0, np.int32)
    for i, e in enumerate(input_data):
        expect[e] = i

    output = np.full([shape[0]], 0, np.int32)
    output = utils.mod_launch(mod, (input_data, output), expect=expect)

    return (input_data, ), output, expect, compare_tensor(output,
                                                          expect,
                                                          rtol=5e-03,
                                                          equal_nan=True)
Пример #7
0
def roipool(shape,
            roibox,
            pooled_shape,
            dtype,
            kernel_name="roipool_forward_output",
            attrs=None,
            target="cce"):
    """
    Build a max ROI-pooling kernel over a single region of a 4-D input.

    The region is cropped from the input, split into ``p_h x p_w`` windows of
    size ``ceil(roi_h / p_h) x ceil(roi_w / p_w)`` (positions past the region
    are filled with 0), and each window is max-reduced.

    Args:
        shape: 4-D input shape (n, c, h, w).
        roibox: (top, bottom, left, right) bounds of the region.
        pooled_shape: (p_h, p_w) output spatial size.
        dtype (str): only "float16" (case-insensitive) is accepted.
        kernel_name (str): base name handed to utils.gen_name_kernel.
        attrs: attributes forwarded unchanged to akg.build.
        target (str): not used by this function.

    Returns:
        tuple: ``(mod, output_shape)`` with output_shape = [n, c, p_h, p_w].

    Raises:
        RuntimeError: if dtype is unsupported.
        AssertionError: if the shape/box/pooled-shape constraints are violated.
    """
    supported = ["float16"]
    if dtype.lower() not in supported:
        # The message previously named an unrelated op ("tile_cce").
        raise RuntimeError("roipool only support %s while dtype is %s" %
                           (",".join(supported), dtype))
    utils.check_shape(shape)
    assert len(shape) == 4
    assert len(roibox) == 4
    assert len(pooled_shape) == 2

    a_n, a_c, a_h, a_w = shape
    roi_t, roi_b, roi_l, roi_r = roibox
    assert roi_t >= 0 and roi_t < roi_b and roi_b < a_h
    assert roi_l >= 0 and roi_l < roi_r and roi_r < a_w

    roi_h = roi_b - roi_t
    roi_w = roi_r - roi_l

    a = akg.tvm.placeholder(shape, name="a", dtype=dtype)
    Crop = akg.tvm.compute([a_n, a_c, roi_h, roi_w],
                           lambda n, c, h, w: a[n, c, roi_t + h, roi_l + w])

    # Window size is the ceiling of region extent over pooled extent.
    p_h, p_w = pooled_shape
    win_h = roi_h // p_h + (1 if roi_h % p_h > 0 else 0)
    win_w = roi_w // p_w + (1 if roi_w % p_w > 0 else 0)

    assert p_h <= roi_h and p_w <= roi_w

    # Expand every output cell into its window; out-of-region taps become 0.
    Unpooled = akg.tvm.compute(
        [a_n, a_c, p_h, p_w, win_h, win_w],
        lambda n, c, h, w, wh, ww: akg.tvm.expr.Select(
            akg.tvm.all(h * win_h + wh < roi_h, w * win_w + ww < roi_w),
            Crop[n, c, h * win_h + wh, w * win_w + ww],
            akg.tvm.const(0, a.dtype)))

    rh = akg.tvm.reduce_axis((0, win_h))
    rw = akg.tvm.reduce_axis((0, win_w))
    output_shape = [a_n, a_c, p_h, p_w]
    res = akg.tvm.compute(
        output_shape,
        lambda n, c, h, w: akg.tvm.max(Unpooled[n, c, h, w, rh, rw],
                                       axis=[rh, rw]))
    s = akg.tvm.create_schedule(res.op)
    s[Crop].compute_inline()
    s[Unpooled].compute_inline()
    kernel_name = utils.gen_name_kernel(kernel_name, dtype, shape)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [a, res],
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)
    return mod, output_shape
Пример #8
0
def div_mod_issue(data_shape, weight_shape, case_number):
    """
    Build one of two kernels whose index expression divides the channel index.

    Case 0 builds ``stage1 = A[n, c / 2, h, w] + 1`` as kernel "test1".
    Any other case builds ``stage2 = stage1[0, c, 0, 0] + B[n, c, h, w]``
    (with a divisor of 3 in stage1) as kernel "test2".

    Args:
        data_shape: 4-D shape of input A and of stage1.
        weight_shape: 4-D shape of input B and of stage2 (unused in case 0).
        case_number (int): 0 selects the single-stage kernel.

    Returns:
        The module produced by akg.build.
    """
    src = akg.tvm.placeholder(data_shape, dtype='float16', name='input0')

    def _divided_channel_stage(divisor):
        # Reads A at channel index c / divisor and adds one.
        return akg.tvm.compute(
            data_shape,
            lambda n, c, h, w: src[n, c / divisor, h, w] + 1,
            name="stage1")

    if case_number == 0:
        result = _divided_channel_stage(2)
        tensors = [src, result]
        kernel = "test1"
    else:
        weight = akg.tvm.placeholder(weight_shape, dtype='float16',
                                     name='input1')
        first = _divided_channel_stage(3)
        result = akg.tvm.compute(
            weight_shape,
            lambda n, c, h, w: first[0, c, 0, 0] + weight[n, c, h, w],
            name="stage2")
        tensors = [src, weight, result]
        kernel = "test2"

    sch = akg.tvm.create_schedule([result.op])
    akg.lower(sch, tensors, simple_mode=True, polyhedral=True)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, tensors, "cce", name=kernel, polyhedral=True)
    return mod
Пример #9
0
def fc(fMapBatch, weight, fc_dtype, block_size, attrs, kernel_name="Fully_Connected"):
    """
    Build a fully-connected kernel expressed as a full-window reduction.

    Only the ``.shape`` of the two inputs is read; fresh placeholders in
    NC1HWC0 (feature map) and fractal (weight) layouts are created here.

    Args:
        fMapBatch: object with a 4-element ``shape`` (n, c, h, w).
        weight: object with a 4-element ``shape`` (n, c, h, w); c/h/w must
            match fMapBatch and n must be at least 32.
        fc_dtype (str): dtype of both placeholders.
        block_size (int): channel block size used for both layouts.
        attrs: attributes forwarded unchanged to akg.build.
        kernel_name (str): name given to the built kernel.

    Returns:
        The module produced by akg.build.

    Raises:
        RuntimeError: if the two shapes are incompatible or w_n < 32.
    """
    # Both shapes are interpreted as NCHW.
    f_n, f_c, f_h, f_w = fMapBatch.shape
    w_n, w_c, w_h, w_w = weight.shape

    if f_c != w_c or f_h != w_h or f_w != w_w or w_n < 32:
        raise RuntimeError("invalid input shape")

    fmap_5d = (f_n, f_c // block_size, f_h, f_w, block_size)
    weight_fractal = (w_c // block_size * w_h * w_w, w_n // block_size,
                      block_size, block_size)

    fmap = akg.tvm.placeholder(fmap_5d, dtype=fc_dtype, name='fmap')
    wgt = akg.tvm.placeholder(weight_fractal, dtype=fc_dtype, name='weight')

    out_5d = (f_n, w_n // block_size, 1, 1, block_size)

    # Reduction extents come from the weight viewed as NC1HWC0.
    k_c1 = w_c // block_size
    k_h = w_h
    k_w = w_w
    k_c0 = block_size

    kc1 = akg.tvm.reduce_axis((0, k_c1), name='kc1')
    kh = akg.tvm.reduce_axis((0, k_h), name='kh')
    kw = akg.tvm.reduce_axis((0, k_w), name='kw')
    kc0 = akg.tvm.reduce_axis((0, k_c0), name='kc0')

    res = akg.tvm.compute(
        out_5d,
        lambda n, c1, h, w, c0: akg.lang.ascend.mmad(
            fmap[n, kc1, (h + kh), (w + kw), kc0]
            * wgt[(kc1 * k_h + kh) * k_w + kw, c1, c0, kc0],
            axis=[kc1, kh, kw, kc0]),
        name="res")

    sch = akg.tvm.create_schedule(res.op)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [fmap, wgt, res], "cce", name=kernel_name,
                        attrs=attrs, polyhedral=True)

    return mod
Пример #10
0
def concat_ad_run(shapes, dtype, axis, attrs):
    """
    Build the concat autodiff kernel and either return it for tuning or run it.

    Args:
        shapes: sequence of input shapes, one placeholder is created per shape.
        dtype (str): element type; lower-cased before use.
        axis (int): concatenation axis handed to concat_ad.
        attrs (dict): build attributes. If it contains the key 'tuning', the
            kernel is built through utils.op_build_test instead of akg.build.

    Returns:
        * tuning key present and truthy: ``(mod, expect, args_tuple)``
        * tuning key present and falsy: ``mod``
        * otherwise: ``(inputs + (head_data,), output, expect, compare_result)``
    """
    # One placeholder per input shape.
    inp_dtype = dtype.lower()
    data = [
        akg.tvm.placeholder(shape, name="data_%d" % idx, dtype=inp_dtype)
        for idx, shape in enumerate(shapes)
    ]

    kernel_name = utils.genKernelName("concat", inp_dtype, shapes)
    res, head = concat_ad.concat_ad(data, axis)

    opvars = [head] + data + [res]
    sch = akg.tvm.create_schedule(res.op)
    op_attrs = [axis]

    if 'tuning' in attrs:
        tuning = attrs.get("tuning", False)
        kernel_name = attrs.get("kernel_name", False)
        mod = utils.op_build_test(concat_ad.concat_ad, [shapes],
                                  [inp_dtype],
                                  op_attrs,
                                  kernel_name=kernel_name,
                                  attrs=attrs,
                                  tuning=tuning)
        if not tuning:
            return mod
        args, expect, _, _ = gen_data(dtype, head, shapes)
        return mod, expect, tuple(args)

    # Regular path: build the cce kernel, print its source, launch and compare.
    with akg.build_config(add_lower_pass=utils.debug_mode(0),
                          dump_pass_ir=True):
        mod = akg.build(sch, opvars, "cce", name=kernel_name,
                        attrs=attrs, polyhedral=True)
    print(mod.imported_modules[0].get_source())

    args, expect, head_data, inputs = gen_data(dtype, head, shapes)
    output = utils.mod_launch(mod, tuple(args), expect=expect)
    verdict = compare_tensor(output, expect, rtol=5e-03, equal_nan=True)
    return tuple(inputs) + (head_data, ), output, expect, verdict
Пример #11
0
def elemwise_sum_manual_schedule(input_shape, polyhedral=False, attrs=None):
    """
    Build an element-wise float16 sum with a hand-written schedule.

    Both inputs are cache-read into "local.UB", the result is cache-written
    to "local.UB", and the result stage's scope is set to "local.UB".

    Args:
        input_shape: shape shared by both inputs and the output.
        polyhedral (bool): forwarded to akg.build.
        attrs: attributes forwarded unchanged to akg.build.

    Returns:
        The module produced by akg.build.
    """
    ub_scope = "local.UB"
    lhs = akg.tvm.placeholder(input_shape, dtype='float16', name="b")
    rhs = akg.tvm.placeholder(input_shape, dtype='float16', name="c")
    total = akg.tvm.compute(input_shape,
                            lambda *indices: lhs(*indices) + rhs(*indices))

    sch = akg.tvm.create_schedule([total.op])
    for operand in (lhs, rhs):
        sch.cache_read(operand, ub_scope, [total])
    sch.cache_write(total, ub_scope)
    sch[total].set_scope(ub_scope)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [lhs, rhs, total], "cce",
                        name="test_manual_schedule",
                        attrs=attrs,
                        polyhedral=polyhedral)
    return mod
Пример #12
0
def floormod(shape, dtype, kernel_name, attrs, target="cce"):
    """
    Build an element-wise floor-mod kernel: ``res = a - floor(a / b) * b``.

    The division is done by multiplying with a reciprocal of ``b`` obtained
    from vrec and refined by three Newton iterations.

    Args:
        shape (list): shape shared by both inputs.
        dtype (str): element type; checked against ALL_FLOAT and INT32.
        kernel_name (str): name given to the built kernel.
        attrs: attributes forwarded unchanged to akg.build.
        target (str): not used by this function.

    Returns:
        The module produced by akg.build.
    """
    allowed = [utils.DtypeForDavinci.ALL_FLOAT, utils.DtypeForDavinci.INT32]
    utils.ops_dtype_check(dtype, allowed)
    utils.check_shape(shape)

    ops = akg.lang.ascend
    a = akg.tvm.placeholder(shape=shape, name="a", dtype=dtype)
    b = akg.tvm.placeholder(shape=shape, name="b", dtype=dtype)

    # Reciprocal of b, refined with Newton's method:
    #   recip <- recip * (2 - b * recip)
    recip = ops.vrec(b)
    for _ in range(3):
        scaled = ops.vmul(b, recip)
        negated = ops.vmuls(scaled, -1)
        correction = ops.vadds(negated, 2)
        recip = ops.vmul(correction, recip)

    # a - floor(a * recip) * b
    quotient = ops.floor(ops.vmul(a, recip))
    res = ops.vsub(a, ops.vmul(quotient, b))

    sch = akg.tvm.create_schedule(res.op)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [a, b, res], "cce", name=kernel_name,
                        attrs=attrs, polyhedral=True)
    return mod
Пример #13
0
def test_quant(fmap_shape):
    """
    Build a float16 -> int8 quantize kernel and print its generated source.

    The input is a float16 NC1HWC0 tensor with C0 = 16; the output is an int8
    tensor with C0 = 32 computed as ``FMap * ScaleQ[0] + OffsetQ[0]``.

    Args:
        fmap_shape: NCHW shape of the feature map; C must be a multiple of 32.
    """
    # NCHW -> NC1HWC0 with a channel block of 16.
    batch, channels, height, width = fmap_shape
    assert channels % 32 == 0
    in_shape_5d = (batch, channels // 16, height, width, 16)

    fmap = akg.tvm.placeholder(in_shape_5d, dtype='float16', name='FMap')
    scale = akg.tvm.placeholder((16, ), dtype='float16', name='ScaleQ')
    offset = akg.tvm.placeholder((16, ), dtype='float16', name='OffsetQ')

    # The int8 output uses a channel block of 32.
    out_shape_5d = (batch, channels // 32, height, width, 32)
    print(out_shape_5d)

    def _quantize(n, c1, h, w, c0):
        scaled = fmap[n, c1 + c0 // 16, h, w, c0 % 16] * scale[0] + offset[0]
        return scaled.astype('int8')

    quant = akg.tvm.compute(out_shape_5d, _quantize, name='output')

    # NOTE(review): all four tiling entries target axis 0; presumably
    # distinct axes were intended - confirm against dim.Dim semantics.
    info = dim.Dim()
    for tile in (2, 32, 32, 16):
        info.setdim(index=0, axis=0, tilel1=tile, tilel0=0)

    sch = akg.tvm.create_schedule(quant.op)
    with akg.build_config(add_lower_pass=utils.debug_mode(0),
                          dump_pass_ir=True):
        mod = akg.build(sch, [fmap, scale, offset, quant], 'cce',
                        name='cce_quant',
                        attrs={'dim': str(info)},
                        polyhedral=True)

    print(mod.imported_modules[0].get_source())
Пример #14
0
def my_dsl(dtype, kernel_name, attrs):
    """
    Build a 1-D dynamic-shape kernel for the instruction selected by ``insn``.

    ``insn`` and ``insnType`` are not parameters; they are resolved from the
    enclosing scope (defined outside this block).

    Args:
        dtype (str): element type of the placeholders.
        kernel_name (str): name given to the built kernel.
        attrs: attributes forwarded unchanged to akg.build.

    Returns:
        The module produced by akg.build.

    Raises:
        ValueError: if ``insn`` is not one of the supported instructions.
    """
    m = tvm.var("M")
    A = tvm.placeholder((m,), name="A", dtype=dtype)
    B = tvm.placeholder((m,), name="B", dtype=dtype)

    # Binary instructions.
    if insn == "add":
        C = topi.add(A, B)
    elif insn == "sub":
        C = topi.subtract(A, B)
    elif insn == "mul":
        C = topi.multiply(A, B)
    elif insn == "div":
        C = topi.divide(A, B)
    elif insn == "max":
        C = topi.maximum(A, B)
    elif insn == "min":
        C = topi.minimum(A, B)
    # Unary instructions.
    elif insn == "abs":
        C = tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C')
    elif insn == "exp":
        C = topi.exp(A)
    elif insn == "log":
        C = topi.log(A)
    elif insn == "sqrt":
        # Previously this branch was immediately overwritten with topi.log.
        C = topi.sqrt(A)
    # Tensor-scalar instructions.
    elif insn == "adds":
        C = A + tvm.const(2, dtype)
    elif insn == "muls":
        C = A * tvm.const(2, dtype)
    else:
        # Fail explicitly instead of hitting an unbound `C` below.
        raise ValueError("unsupported insn: %s" % insn)

    s = tvm.create_schedule([C.op])
    # Binary kernels take both inputs; everything else only reads A.
    build_args = [A, B, C] if insnType == "binary" else [A, C]
    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, build_args, "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
Пример #15
0
def add_a_conv(fmap_shape,
               filter_shape,
               pad_,
               stride_,
               dilation_,
               tile_hh=0,
               tile_coco=0,
               tile_mm=0,
               tile_kk=0,
               tile_nn=0,
               bypass_l1=False,
               use_bias=False,
               block_size=16,
               conv_dtype='float16'):
    """
    Build the kernel defined by add_a_conv_compute and dump its source.

    All arguments are forwarded positionally to add_a_conv_compute, which
    returns the output tensor, the input tensors, the kernel name and the
    tiling info. The generated source is written to the current directory
    through utils.create_code.

    Args:
        use_bias (bool): when true the bias tensor is added to the build
            arguments.
        (others): passed through to add_a_conv_compute unchanged.

    Returns:
        The module produced by akg.build.
    """
    conv, a_value, b_value, bias_value, kernel_name, dim_info = add_a_conv_compute(
        fmap_shape, filter_shape, pad_, stride_, dilation_, tile_hh, tile_coco,
        tile_mm, tile_kk, tile_nn, bypass_l1, use_bias, block_size, conv_dtype)

    sch = akg.tvm.create_schedule(conv.op)
    print(conv, a_value, b_value, bias_value)

    build_attrs = {"pragma_rmselfdep": False, 'dim': dim_info}

    # The bias tensor is only a kernel argument when bias is enabled.
    if use_bias:
        tensors = [a_value, b_value, bias_value, conv]
    else:
        tensors = [a_value, b_value, conv]

    with akg.build_config(add_lower_pass=utils.debug_mode(0),
                          dump_pass_ir=True):
        mod = akg.build(sch, tensors, "cce", name=kernel_name,
                        attrs=build_attrs, polyhedral=True)

    utils.create_code(kernel_name, '.', mod.imported_modules[0].get_source())
    return mod
Пример #16
0
def conv_relu(fmap_shape,
              filter_shape,
              pad_,
              stride_,
              dilation_,
              tile_hh=0,
              tile_coco=0,
              tile_mm=0,
              tile_kk=0,
              tile_nn=0,
              bypass_l1=False,
              use_bias=False,
              block_size=16,
              conv_dtype='float16'):
    """
    Build a convolution followed by a (leaky) ReLU with slope 0.

    The convolution comes from add_a_conv_compute; the activation is
    ``max(x, negative_slope * x)`` with ``negative_slope = 0.0``.

    Args:
        use_bias (bool): when true the bias tensor is added to the build
            arguments.
        (others): passed through to add_a_conv_compute unchanged.

    Returns:
        The module produced by akg.build.
    """
    conv, a_value, b_value, bias_value, kernel_name, dim_info = add_a_conv_compute(
        fmap_shape, filter_shape, pad_, stride_, dilation_, tile_hh, tile_coco,
        tile_mm, tile_kk, tile_nn, bypass_l1, use_bias, block_size, conv_dtype)

    # Leaky ReLU: max(x, negative_slope * x)
    negative_slope = 0.0
    slope_tmp = akg.tvm.const(negative_slope, dtype=conv_dtype)
    out = akg.lang.ascend.vmuls(conv, slope_tmp)
    out = akg.lang.ascend.vmax(out, conv)

    # Schedule and build the activation output. Previously `conv` was used
    # here, which left the ReLU above as dead code and built a plain conv.
    # NOTE(review): dim_info was generated for the conv alone - confirm the
    # tiling is still valid with the fused activation.
    s = akg.tvm.create_schedule(out.op)

    if use_bias:
        tensors = [a_value, b_value, bias_value, out]
    else:
        tensors = [a_value, b_value, out]

    with akg.build_config(add_lower_pass=utils.debug_mode(0),
                          dump_pass_ir=True):
        mod = akg.build(s, tensors,
                        "cce",
                        name=kernel_name,
                        attrs={"dim": dim_info},
                        polyhedral=True)
    return mod
Пример #17
0
def op_build_to_func(opnames, computes, args, custom_schedule, device,
                     kernel_name, attrs):
    """
    Schedule the given compute tensors and lower them with akg.build_to_func.

    Args:
        opnames: operator names, used for logging only.
        computes: output tensors; their ``.op`` attributes seed the schedule.
        args: argument list forwarded to akg.build_to_func.
        custom_schedule: optional callable applied to the schedule; when
            given, polyhedral scheduling is turned off.
        device (str): "aicore" or "aicpu".
        kernel_name (str): name given to the built function.
        attrs (dict): optional build attributes. The BINDS entry, if any, is
            popped from this dict (the caller's dict is modified) and passed
            separately as ``binds``.

    Returns:
        The result of akg.build_to_func, or None if the device is invalid or
        any exception is raised while scheduling/building (the traceback is
        logged).
    """
    if device not in ("aicore", "aicpu"):
        logging.error("Device %s is not in [aicore, aicpu].", device)
        return None
    # The format string needs a placeholder for its argument; without it the
    # log record fails to format.
    logging.debug("op_build_to_func for %s", opnames)

    polyhedral = True
    dump_ir = os.getenv(get_dump_ir_flag()) == "on"

    try:
        tmp_outputs = [x.op for x in computes]
        s = akg.tvm.create_schedule(tmp_outputs)
        if custom_schedule:
            polyhedral = False
            custom_schedule(s)

        with akg.build_config(add_lower_pass=debug_mode(0),
                              dump_pass_ir=dump_ir):
            build_kwargs = {}
            if attrs:
                # attrs/binds are only passed when attrs is non-empty so the
                # callee's defaults apply otherwise.
                build_kwargs["binds"] = attrs.pop(BINDS, None)
                build_kwargs["attrs"] = attrs
            rst = akg.build_to_func(s,
                                    args,
                                    name=kernel_name,
                                    polyhedral=polyhedral,
                                    target=_get_target(device),
                                    **build_kwargs)

    except Exception:
        logging.error(traceback.format_exc())
        return None
    return rst
Пример #18
0
def focalloss_ad_run2(shape, dtype, attrs):
    """
    Build the focal-loss autodiff kernel and launch it on generated data.

    Args:
        shape: shape of the logits and labels placeholders.
        dtype (str): dtype of the logits placeholder and of the output buffer
            (labels are always int32).
        attrs: attributes forwarded unchanged to akg.build.

    Returns:
        tuple: ``((input_np, head_np), output, expect, compare_result)``.

    NOTE(review): as written this function cannot run to completion - see the
    inline notes on ``expect``, ``batchsize`` and ``input_np`` below.
    """
    logits_pld = akg.tvm.placeholder(shape, dtype=dtype, name='logits')
    labels_pld = akg.tvm.placeholder(shape, dtype='int32', name='labels')
    d_labels, d_logits, head = focalloss_ad.focalloss_ad(labels_pld, logits_pld)
    # Dump both gradient expressions for inspection.
    print("autodiff d_logits:\n", akg.tvm.PrintTensorRecursively(d_logits))
    print("autodiff d_labels:\n", akg.tvm.PrintTensorRecursively(d_labels))

    # build autodiff kernels
    io = [labels_pld, logits_pld, head, d_labels, d_logits]
    s = akg.tvm.create_schedule([e.op for e in io])
    kernel_name = utils.gen_name_kernel("focalloss_ad", dtype, (shape[0], shape[1],))
    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, io, "cce", name=kernel_name, attrs=attrs, polyhedral=True)

    # NOTE(review): `batchsize` is not defined in this function; presumably a
    # module-level name or meant to be shape[0] - confirm.
    labels_np = RANGEFILL((batchsize,))
    logits_np = RANGEFILL((batchsize,), 2)
    head_np = RANGEFILL((batchsize,), 2)
    # NOTE(review): `expect` is read here but only assigned two lines below,
    # so this line raises UnboundLocalError.
    output = np.full(expect.shape, np.nan, dtype)
    # NOTE(review): the kernel was built with five tensors but only four
    # arrays are passed here - confirm the intended outputs.
    output = utils.mod_launch(mod, (labels_np, logits_np, head_np, output), expect=output)
    expect = output  # hack

    # NOTE(review): `input_np` is not defined anywhere in this function.
    return (input_np, head_np), output, expect, compare_tensor(output, expect, atol=0.1)
Пример #19
0
def logsoftmax_ad(shape, dtype, axis, kernel_name, attrs):
    """
    Build the gradient kernel of logsoftmax using automatic differentiation.

    The input shape is flattened to 2-D by folding every leading dimension
    into the second-to-last one before the forward op is created.

    Args:
        shape: input shape.
        dtype (str): only "float16" (case-insensitive) is accepted.
        axis (int): reduction axis; negative values count from the end. Only
            the last axis is supported.
        kernel_name (str): not used; the kernel is built under the name
            "test2".
        attrs: attributes forwarded unchanged to akg.build.

    Returns:
        The module produced by akg.build.

    Raises:
        RuntimeError: for an unsupported dtype or axis.
    """
    supported = ["float16"]
    if dtype.lower() not in supported:
        raise RuntimeError(
            "logsoftmax test only support %s while dtype is %s" %
            (",".join(supported), dtype))

    rank = len(shape)
    if axis < 0:
        axis = rank + axis
    if axis >= rank:
        raise RuntimeError("axis should be less than dimension")
    if axis != rank - 1:
        raise RuntimeError("Only support the last axis currently")

    # Collapse all leading dimensions into the row count.
    rows = shape[-2]
    cols = shape[-1]
    for lead in shape[:-2]:
        rows = rows * lead
    shape = [rows, cols]

    a_up = akg.tvm.placeholder(shape, dtype=dtype, name="input")
    b_up = logsoftmax.logsoftmax_op(a_up, shape, axis)

    head = akg.tvm.placeholder(b_up.shape, name="head", dtype=dtype)
    grad = list(akg.differentiate(b_up, [a_up], head))[0]
    sch = akg.tvm.create_schedule([grad.op])
    # Inline the second input stage of the gradient op.
    sch[grad.op.input_tensors[1]].compute_inline()
    build_args = [head, a_up, grad]

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, build_args, "cce", name="test2",
                        attrs=attrs, polyhedral=True)
    return mod
Пример #20
0
def range_run(start, limit, delta, dtype, attrs):
    """
    Build, launch and verify the range kernel.

    Args:
        start (int): first value of the sequence.
        limit (int): exclusive end of the sequence.
        delta (int): step between consecutive values.
        dtype (str): element type of the kernel output buffer.
        attrs: attributes forwarded unchanged to akg.build.

    Returns:
        tuple: ``((), output, expect, compare_result)``.
    """
    t_range = tvm_range.range_value(start, limit, delta, dtype)
    # Create module
    sch = akg.tvm.create_schedule(t_range.op)
    kernel_name = "range"
    with akg.build_config(add_lower_pass=utils.debug_mode(0),
                          dump_pass_ir=True):
        mod = akg.build(sch, [t_range],
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)
        print(mod.imported_modules[0].get_source())
    # Generate data for testing the op
    expect = np.asarray(list(range(start, limit, delta)))

    # Size the output buffer from the reference: the previous
    # `(limit - start) / delta` yields a float under Python 3 (rejected by
    # np.full) and undercounts when the span is not a multiple of delta.
    output = np.full(expect.shape, np.nan, dtype)
    output = utils.mod_launch(mod, (output, ), expect=expect)

    return tuple(), output, expect, compare_tensor(output,
                                                   expect,
                                                   rtol=5e-03,
                                                   equal_nan=True)
Пример #21
0
def vector_matmul(data_m, data_n, data_k, trans_a, trans_b, dtype, kernel_name, attrs):
    """
    Build a matmul kernel written as explicit loops in hybrid script.

    Args:
        data_m (int): rows of the output.
        data_n (int): columns of the output.
        data_k (int): reduction length.
        trans_a (bool): whether the first operand is stored transposed.
        trans_b (bool): whether the second operand is stored transposed.
        dtype (str): "float16" or "float32".
        kernel_name (str): name of the built kernel and of the dumped source.
        attrs: attributes forwarded unchanged to akg.build.

    Returns:
        tuple: ``(mod, output_shape)`` with output_shape = (m, n).

    Raises:
        TypeError: if dtype is not supported.
        ValueError: if both operands are transposed.
    """
    check_list = ["float16", "float32"]
    if dtype not in check_list:
        # The message previously referred to an unrelated "softmax test".
        raise TypeError("vector_matmul only support %s while dtype is %s" % (",".join(check_list), dtype))

    m = data_m
    n = data_n
    k = data_k
    data_shape, weight_shape = get_shape(m, n, k, trans_a, trans_b)
    output_shape = (m, n)

    A = akg.tvm.placeholder(data_shape, name='A', dtype=dtype)
    B = akg.tvm.placeholder(weight_shape, name='B', dtype=dtype)

    ZERO = akg.tvm.const(0.0, dtype=dtype)

    # a: (m, k), b: (k, n)
    @script
    def matmul_hybrid_f_f(a, b, zero):
        t_1 = allocate((m, k, n), a.dtype, 'local')
        t_2 = output_tensor((m, n), a.dtype)
        for i_m in range(0, m):
            for i_k in range(0, k):
                for i_n in range(0, n):
                    t_1[i_m, i_k, i_n] = a[i_m, i_k] * b[i_k, i_n]
            for i1_n in range(0, n):
                t_2[i_m, i1_n] = zero
            for i1_k in range(0, k):
                for i1_n in range(0, n):
                    t_2[i_m, i1_n] = t_2[i_m, i1_n] + t_1[i_m, i1_k, i1_n]
        return t_2

    # a: (m, k), b: (n, k)
    @script
    def matmul_hybrid_f_t(a, b, zero):
        t_1 = allocate((m, n, k), a.dtype, 'local')
        t_2 = output_tensor((m, n), a.dtype)
        for i_m in range(0, m):
            for i_n in range(0, n):
                t_2[i_m, i_n] = zero
                for i_k in range(0, k):
                    t_1[i_m, i_n, i_k] = a[i_m, i_k] * b[i_n, i_k]
                    t_2[i_m, i_n] = t_1[i_m, i_n, i_k] + t_2[i_m, i_n]
        return t_2

    # a: (k, m), b: (k, n)
    @script
    def matmul_hybrid_t_f(a, b, zero):
        t_1 = allocate((m, k, n), a.dtype, 'local')
        t_2 = output_tensor((m, n), a.dtype)
        for i_m in range(0, m):
            for i_k in range(0, k):
                for i_n in range(0, n):
                    t_1[i_m, i_k, i_n] = a[i_k, i_m] * b[i_k, i_n]
            for i1_n in range(0, n):
                t_2[i_m, i1_n] = zero
            for i1_k in range(0, k):
                for i1_n in range(0, n):
                    t_2[i_m, i1_n] = t_2[i_m, i1_n] + t_1[i_m, i1_k, i1_n]
        return t_2

    # Equality comparisons are kept so that 0/1 flags select the same
    # variant as before and any other value falls through to the error.
    if trans_a == False and trans_b == False:
        C = matmul_hybrid_f_f(A, B, ZERO)
    elif trans_a == False and trans_b == True:
        C = matmul_hybrid_f_t(A, B, ZERO)
    elif trans_a == True and trans_b == False:
        C = matmul_hybrid_t_f(A, B, ZERO)
    else:
        raise ValueError('Not support both transpose yet')

    forward_s = akg.tvm.create_schedule(C.op)
    op_vars = [A, B, C]

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(forward_s, op_vars, "cce", name=kernel_name, attrs=attrs, polyhedral=True)
        source_code = mod.imported_modules[0].get_source()
        create_code(kernel_name, "./", source_code)
    return mod, output_shape
Пример #22
0
def group_conv_ad(_n, _h, _w, _c_i, _c_o, group, _k_h, _k_w, pad_h, pad_w, _s_h, _s_w,
                  cut_h, cut_co, cut_m, cut_k, cut_n, block_size, use_bias=False, kernel_name='group_conv'):
    """
    Build the auto-diff kernels of a grouped convolution plus the helper kernels
    that prepare their inputs.

    Args:
        _n, _h, _w, _c_i, _c_o: batch, height, width, input channels and output
            channels used to size the placeholders.
        group: number of convolution groups.
        _k_h, _k_w: kernel height and width.
        pad_h, pad_w: padding along height and width.
        _s_h, _s_w: stride along height and width.
        cut_h, cut_co, cut_m, cut_k, cut_n: cut sizes forwarded to group_conv_forward.
        block_size: size of the innermost channel block of the 5D / fractal layouts.
        use_bias: not read by this function.
        kernel_name: not read by this function.

    Returns:
        Tuple (mod_ad_data, mod_ad_weights, mod_b_group_flip, mod_head_strided,
        mod_transposed_nc, mod_transposed_convert) of built modules.
    """
    dtype = 'float16'

    def four_axis_tiling(axis0_tile):
        # Tiling of axes 0..3 in band 0; only axis 0 ever uses a tile other than 1.
        tiling = dim.Dim()
        for axis, tile in enumerate((axis0_tile, 1, 1, 1)):
            tiling.setdim(index=0, axis=axis, tilel1=tile, tilel0=tile)
        return tiling

    def round_up_to_block(value):
        # Smallest multiple of block_size that is >= value.
        return ((value + block_size - 1) // block_size) * block_size

    ci_blocks = _c_i // block_size
    co_blocks = _c_o // block_size

    # Forward operands: 5D feature map and fractal weight.
    fmap = akg.tvm.placeholder((_n, ci_blocks, _h, _w, block_size), name="input0", dtype=dtype)
    weight = akg.tvm.placeholder(
        (_c_i // group // block_size * _k_h * _k_w, co_blocks, block_size, block_size),
        name="input1", dtype=dtype)

    # Despite the callee's name this is used as a tensor (its .shape / .op are read below).
    forward = group_conv_forward(_n, _h, _w, _c_i, _c_o, group, _k_h, _k_w, fmap, weight, None,
                                 pad_h, pad_w, _s_h, _s_w, cut_h, cut_co, cut_m, cut_k, cut_n, block_size)
    out_h = forward.shape[2].value
    out_w = forward.shape[3].value
    strided_h = (out_h - 1) * _s_h + 1
    strided_w = (out_w - 1) * _s_w + 1

    head = akg.tvm.placeholder(forward.shape, name="head", dtype=dtype)

    # (_n,_c_o,_o_h,_o_w) --stride--> (_n,_c_o,strided_h,strided_w) --5d--> (_n,_c_o/16,strided_h,strided_w,16)
    head_strided_pld = akg.tvm.placeholder(
        (_n, co_blocks, strided_h, strided_w, block_size),
        name="head_strided_5d", dtype=dtype)

    # (_c_o,_c_i//group,_k_h,_k_w) --flip--> (_c_i,_c_o//group,_k_h,_k_w)
    # --Fractal--> ((_c_o//group)/16*_k_h*_k_w, _c_i/16, 16, 16)
    weight_flipped_pld = akg.tvm.placeholder(
        (_c_o // group // block_size * _k_h * _k_w, ci_blocks, block_size, block_size),
        name="b_flip", dtype=dtype)

    # ---- helper kernel: group-wise weight flip (fractal in, fractal out) ----
    weight_flipped = group_flip_weight(_b=weight, _k_h=_k_h, _k_w=_k_w, group=group,
                                       _c_o=_c_o // group // block_size,
                                       _c_i=_c_i // group // block_size,
                                       block_size=block_size) \
        if False else group_flip_weight(weight, _k_h, _k_w, group, _c_o // group // block_size,
                                        _c_i // group // block_size, block_size)
    flip_schedule = akg.tvm.create_schedule([weight_flipped.op])
    unit_tiling = four_axis_tiling(1)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=False):
        mod_b_group_flip = akg.build(flip_schedule, [weight, weight_flipped], "cce",
                                     name="b_group_flip", attrs={"dim": str(unit_tiling)},
                                     polyhedral=True)

    # ---- helper kernel: strided head (reuses the same unit tiling object) ----
    head_strided = strided_head(head, _s_h, _s_w)
    stride_schedule = akg.tvm.create_schedule(head_strided.op)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=False):
        mod_head_strided = akg.build(stride_schedule, [head, head_strided], "cce",
                                     name="h_strided", attrs={"dim": str(unit_tiling)},
                                     polyhedral=True)

    # ---- helper kernel: transpose + regroup of the feature map ----
    fmap_transposed = transpose_regroup(fmap, block_size, group)
    transpose_schedule = akg.tvm.create_schedule(fmap_transposed.op)
    transpose_tiling = four_axis_tiling(16)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_transposed_nc = akg.build(transpose_schedule, [fmap, fmap_transposed], "cce",
                                      name="a_transposed", attrs={"dim": str(transpose_tiling)},
                                      polyhedral=True)

    # ---- helper kernel: transpose + fractal conversion of the head ----
    head_converted = transpose_convert_head(head, block_size)
    convert_schedule = akg.tvm.create_schedule(head_converted.op)
    convert_tiling = four_axis_tiling(1)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        # NOTE(review): this kernel shares the name "a_transposed" with the one above;
        # kept as in the original, confirm whether the collision is intended.
        mod_transposed_convert = akg.build(convert_schedule, [head, head_converted], "cce",
                                           name="a_transposed", attrs={"dim": str(convert_tiling)},
                                           polyhedral=True)

    # ---- AD kernel: gradient with respect to the input data ----
    ad_attrs = {"ad_conv_enable": 1}
    data_grads = list(akg.differentiate(forward, [fmap], head, ad_attrs,
                                        [head_strided_pld, weight_flipped_pld, None]))

    data_cut_h = strided_h + 2 * (_k_h - 1 - pad_h)
    data_cut_m = round_up_to_block(_h * _w)
    data_dims = set_dims_group(data_cut_h, 16, data_cut_m, 48, 16,
                               expr_to_int(fmap.shape), _c_o, _c_i, group, _k_h, _k_w, _s_h, block_size)

    data_schedule = akg.tvm.create_schedule([data_grads[0].op])
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=False):
        mod_ad_data = akg.build(data_schedule, [head_strided_pld, weight_flipped_pld, data_grads[0]],
                                "cce", name="conv_ad_data", attrs={"dim": data_dims}, polyhedral=True)

    # ---- AD kernel: gradient with respect to the weights ----
    # (_n,_c_i,_h,_w) --trans--> (_c_i,_n,_h,_w) --regroup--> (_c_i//group,_n*group,_h,_w)
    # --5d--> (_c_i//group,(_n*group)/16,_h,_w,16)
    fmap_trans_pld = akg.tvm.placeholder(
        (_c_i // group, (_n * group) // block_size, _h, _w, block_size),
        name="x_trans_5d", dtype=dtype)

    # (_n,_c_o,_o_h,_o_w) --trans--> (_c_o,_n,_o_h,_o_w) --Fractal--> (_n/16*_o_h*_o_w, _c_o/16, 16, 16)
    head_trans_pld = akg.tvm.placeholder(
        (_n // block_size * out_h * out_w, co_blocks, block_size, block_size),
        name="head_trans_convert", dtype=dtype)

    weight_grads = list(akg.differentiate(forward, [weight], head, ad_attrs,
                                          [fmap_trans_pld, head_trans_pld, None]))

    weight_cut_h = _h + 2 * pad_h
    weight_cut_m = round_up_to_block(_k_h * _k_w)
    weight_dims = set_dims_group(weight_cut_h, 16, weight_cut_m, 48, 16,
                                 (_c_i // group, co_blocks, _k_h, _k_w, block_size),
                                 _n * group, _c_o, group, out_h, out_w, 1, block_size)

    weight_schedule = akg.tvm.create_schedule([weight_grads[0].op])
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_ad_weights = akg.build(weight_schedule, [fmap_trans_pld, head_trans_pld, weight_grads[0]],
                                   "cce", name="conv_ad_weights", attrs={"dim": weight_dims},
                                   polyhedral=True)

    # ---- shape report ----
    shape_report = (
        ("Forward input data shape: ", fmap),
        ("Forward input weight shape: ", weight),
        ("Forward output shape: ", forward),
        ("Backward wrt. DATA input data shape: ", head_strided_pld),
        ("Backward wrt. DATA input weight shape: ", weight_flipped_pld),
        ("Backward wrt. DATA output shape: ", data_grads[0]),
        ("Backward wrt. WEIGHT input data shape: ", fmap_trans_pld),
        ("Backward wrt. WEIGHT input weight shape: ", head_trans_pld),
        ("Backward wrt. WEIGHT output shape: ", weight_grads[0]),
    )
    for label, tensor in shape_report:
        print(label, tensor.shape)

    return mod_ad_data, mod_ad_weights, mod_b_group_flip, mod_head_strided, mod_transposed_nc, mod_transposed_convert
Пример #23
0
def conv_01(fmap_shape,
            filter_shape,
            pad_,
            stride_,
            dilation_,
            tile_hh=0,
            tile_coco=0,
            tile_mm=0,
            tile_kk=0,
            tile_nn=0,
            use_bias=False,
            block_size=16,
            conv_dtype='float16'):
    """
    Build the two backward kernels of a convolution through akg.differentiate.

    Args:
        fmap_shape: 4-element feature-map shape, unpacked as (n, c, h, w).
        filter_shape: 4-element filter shape, unpacked as (n, c, h, w).
        pad_: padding, forwarded to Conv.
        stride_: 2-element stride, unpacked as (stride_h, stride_w).
        dilation_: dilation, forwarded to Conv.
        tile_hh, tile_coco, tile_mm, tile_kk, tile_nn: tiling hints forwarded to set_dims.
        use_bias: when true an extra "input2" placeholder is appended to the Conv inputs.
        block_size: channel block size; channel counts are rounded up to a multiple of it.
        conv_dtype: dtype of the forward placeholders (backward placeholders use 'float16').

    Returns:
        Tuple (mod_backward, mod_backward2): the module built from the gradient with
        respect to the input ("conv_ad") and the module built from the gradient with
        respect to the weight ("conv_backward_weight").
    """

    def _pad_to_block(channels):
        # Round a channel count up to the next multiple of block_size.
        return (channels + block_size - 1) // block_size * block_size

    # NOTE: the lambda parameter names below (i0.., j0..) are kept as in the
    # original on purpose, so the restyle cannot alter anything derived from them.

    def _flip_weight(weight, padded_k_c, taps, shift):
        # Fractal in, fractal out.
        flipped_shape = (weight.shape[1].value * taps, padded_k_c // block_size, block_size,
                         block_size)
        return akg.tvm.compute(
            flipped_shape,
            lambda i0, i1, i2, i3: weight[i1 * taps + shift - truncmod(
                i0, taps),
                                          floordiv(i0, taps), i3, i2],
            name=weight.name + "_flipped")

    def _stride_head(grad, step_h, step_w):
        # Positions that are not multiples of the step are filled with 0.
        n, c1, h, w, c0 = grad.shape
        strided_shape = (n, c1, (h - 1) * step_h + 1, (w - 1) * step_w + 1, c0)
        return akg.tvm.compute(
            strided_shape,
            lambda i0, i1, i2, i3, i4: akg.tvm.expr.Select(
                akg.tvm.any(truncmod(i2, step_h) != 0,
                            truncmod(i3, step_w) != 0),
                akg.tvm.const(0.0, dtype="float16"), grad[i0, i1,
                                                          floordiv(i2, step_h),
                                                          floordiv(i3, step_w), i4]),
            name=grad.name + "_strided")

    def _transpose_data(fmap_5d):
        transposed_shape = (fmap_5d.shape[1] * block_size, truncdiv(fmap_5d.shape[0], block_size),
                            fmap_5d.shape[2], fmap_5d.shape[3], block_size)
        return akg.tvm.compute(
            transposed_shape,
            lambda j0, j1, j2, j3, j4: fmap_5d[j1 * block_size + j4,
                                               truncdiv(j0, block_size), j2, j3,
                                               truncmod(j0, block_size)],
            name=fmap_5d.name + "_transposed")

    def _transpose_convert_head(grad):
        # 5D input, fractal output.
        n_blocks = floordiv(grad.shape[0].value, block_size)
        c1 = grad.shape[1].value
        h = grad.shape[2].value
        w = grad.shape[3].value
        fractal_shape = (n_blocks * h * w, c1, block_size, block_size)
        shape_6d = (n_blocks, block_size, c1, h, w, block_size)

        grad_6d = akg.topi.reshape(grad, shape_6d)
        # (N//bN, bN, C//bC, H, W, bC) -> (N//bN, H, W, C//bC, bC, bN)
        grad_6d_t = akg.topi.transpose(grad_6d, (0, 3, 4, 2, 5, 1))
        return akg.topi.reshape(grad_6d_t, fractal_shape)

    # ---- forward operands -------------------------------------------------
    # input shape (NCHW -> NC1HWC0)
    fm_n, fm_c, fm_h, fm_w = fmap_shape
    fm_c = _pad_to_block(fm_c)
    # kernel shape (NCHW -> NC1HWC0 -> Fractal)
    k_n, k_c, k_h, k_w = filter_shape
    k_c = _pad_to_block(k_c)
    k_n = _pad_to_block(k_n)

    fmap = akg.tvm.placeholder((fm_n, fm_c // block_size, fm_h, fm_w, block_size),
                               dtype=conv_dtype,
                               name="input0")
    weight = akg.tvm.placeholder((k_c // block_size * k_h * k_w, k_n // block_size,
                                  block_size, block_size),
                                 dtype=conv_dtype,
                                 name="input1")
    conv_inputs = [fmap, weight]
    if use_bias:
        conv_inputs.append(
            akg.tvm.placeholder((1, k_n // block_size, 1, 1, block_size),
                                dtype=conv_dtype,
                                name="input2"))

    conv, _ = Conv(conv_inputs, fmap_shape, filter_shape, pad_, stride_, dilation_,
                   use_bias)

    taps = k_h * k_w
    weight_flipped_pld = akg.tvm.placeholder(
        _flip_weight(weight, k_c, taps, taps - 1).shape,
        name="inp1_flipped",
        dtype='float16')
    head = akg.tvm.placeholder(conv.shape, name="Head", dtype='float16')

    tiles = (tile_hh, tile_coco, tile_mm, tile_kk, tile_nn)
    padded_filter = (k_c, k_n, k_h, k_w)

    # The original computes this tiling and then overwrites it before use; the
    # call is kept (result discarded) so behaviour stays identical.
    hd_n, hd_c1, hd_h, hd_w, hd_c0 = head.shape
    set_dims((hd_n.value, hd_c1.value * hd_c0.value, hd_h.value, hd_w.value),
             padded_filter, (2, 2), (1, 1), (1, 1), *tiles, block_size)

    # ---- gradient with respect to the input data ---------------------------
    stride_h, stride_w = stride_
    if stride_h == 1 and stride_w == 1:
        grad_input = head
    else:
        grad_input = akg.tvm.placeholder(_stride_head(head, stride_h, stride_w).shape,
                                         name="head_strided",
                                         dtype='float16')

    data_grads = list(
        akg.differentiate(conv, [fmap], head,
                          {"ad_conv_enable": 1, "ad_conv_reuse_conv": 1},
                          [grad_input, weight_flipped_pld, None]))
    data_schedule = akg.tvm.create_schedule([data_grads[0].op])
    data_args = [grad_input, weight_flipped_pld, data_grads[0]]
    gi_n, gi_c1, gi_h, gi_w, gi_c0 = grad_input.shape
    data_tiling = set_dims(
        (gi_n.value, gi_c1.value * gi_c0.value, gi_h.value, gi_w.value),
        padded_filter, (k_h - 1, k_w - 1), (1, 1), (1, 1), *tiles, block_size)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_backward = akg.build(data_schedule,
                                 data_args,
                                 "cce",
                                 name='conv_ad',
                                 attrs={"dim": str(data_tiling)},
                                 polyhedral=True)

    # ---- gradient with respect to the weight -------------------------------
    fmap_transposed_pld = akg.tvm.placeholder(_transpose_data(fmap).shape,
                                              name="inp0_transposed",
                                              dtype='float16')

    head_for_weight = _stride_head(head, stride_h, stride_w) \
        if (stride_h > 1) or (stride_w > 1) else head
    sh_n, sh_c1, sh_h, sh_w, sh_c0 = head_for_weight.shape
    head_fractal = _transpose_convert_head(head_for_weight)

    # Schedule created and discarded, exactly as in the original.
    akg.tvm.create_schedule(head_fractal.op)

    head_fractal_pld = akg.tvm.placeholder(head_fractal.shape,
                                           name="head_transposed",
                                           dtype='float16')
    weight_grads = list(
        akg.differentiate(conv, [weight], head,
                          {"ad_conv_enable": 1, "ad_conv_reuse_conv": 1},
                          [fmap_transposed_pld, head_fractal_pld, None]))
    weight_schedule = akg.tvm.create_schedule([weight_grads[0].op])
    weight_args = [head, fmap_transposed_pld, head_fractal_pld, weight_grads[0]]

    a_n, a_c1, a_h, a_w, a_c0 = fmap.shape
    weight_tiling = set_dims(
        (a_c1.value * a_c0.value, a_n.value, a_h.value, a_w.value),
        (sh_c1.value * sh_c0.value, sh_n.value, sh_h.value, sh_w.value),
        (0, 0), (1, 1), (1, 1), *tiles, block_size)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_backward2 = akg.build(weight_schedule,
                                  weight_args,
                                  "cce",
                                  name="conv_backward_weight",
                                  attrs={"dim": str(weight_tiling)},
                                  polyhedral=True)

    return mod_backward, mod_backward2
Пример #24
0
def psroialign_compute(fm_shape, roi_shape, class_num, group_size, sample_h,
                       sample_w, scale):
    """
    Build a "psroialign" kernel: scale the ROI coordinates, derive per-sample
    bilinear weights, max-pool the samples of each bin and average over the bins.

    Note:
        The feature-map gather inside fetch_data is currently commented out and
        replaced by a constant, so the built kernel does not read fm_data values.

    Args:
        fm_shape: feature-map shape (n, c_dim, h, w), where
            c_dim = group_size * group_size * (class_num + 1).
        roi_shape: ROI shape (roi_num, 16, 1, 1). The C axis holds 5 values
            (score, x1, y1, x2, y2); the other 11 entries are padding.
        class_num: number of classes; the kernel loops over class_num + 1 channels.
        group_size: number of bins along each side of an ROI
            (bin_num = group_size * group_size).
        sample_h: number of samples along the height of a bin.
        sample_w: number of samples along the width of a bin.
        scale: factor multiplied into the ROI coordinates.

    Returns:
        The module returned by akg.build for [fm_data, roi_data, out].
    """

    dtype = "float16"

    fm_data = akg.tvm.placeholder(fm_shape, name="fm_data", dtype=dtype)
    roi_data = akg.tvm.placeholder(roi_shape, name="roi_data", dtype=dtype)
    scale_const = akg.tvm.const(scale, dtype=dtype)

    sample_h_const = akg.tvm.const(sample_h, "int32")
    sample_w_const = akg.tvm.const(sample_w, "int32")
    two_const = akg.tvm.const(2, "float16")
    one_const = akg.tvm.const(1, "float16")
    group_size_const = akg.tvm.const(group_size, "int32")

    bin_num = group_size * group_size

    # ==============================================================
    # step 1: scale coordinates size in original image to size in feature map
    # ==============================================================

    COSIZE = 16
    roi_num = roi_shape[0]
    aligned_roi_num = do_align(roi_num, COSIZE)

    # 4 means x1, y1, x2, y2
    # roi_shape[0] must be equal to COSIZE
    # scaled_coors[n] reads roi_data channel 1 + n, i.e. it skips the score entry.
    scaled_coors = akg.tvm.compute(
        (4, aligned_roi_num, 1, 1),
        lambda n, c, h, w: roi_data[c, 1 + n, h, w] * scale_const,
        name='scaled_coors')

    # ==============================================================
    # step 2: compute the width and height of roi
    # ==============================================================

    # 2 stands for width and height
    width_height_shape = (2, aligned_roi_num, 1, 1)
    # NOTE(review): width_height_of_rois is not referenced again in this function.
    width_height_of_rois = akg.tvm.compute(
        width_height_shape,
        lambda n, c, h, w: scaled_coors[n + 2, c, h, w] - scaled_coors[n, c, h,
                                                                       w],
        name='width_height_of_rois')

    width_shape = (aligned_roi_num, )
    # width = x2 - x1
    width_of_rois = akg.tvm.compute(
        width_shape,
        lambda n: scaled_coors[2, n, 0, 0] - scaled_coors[0, n, 0, 0],
        name='width_of_rois')
    width_shape = (aligned_roi_num, )
    # NOTE(review): this is y1 - y2 (index 1 minus index 3), the opposite order
    # to width_of_rois (x2 - x1) - confirm the sign is intended.
    height_of_rois = akg.tvm.compute(
        width_shape,
        lambda n: scaled_coors[1, n, 0, 0] - scaled_coors[3, n, 0, 0],
        name='height_of_rois')

    # ==============================================================
    # step 3: compute the bias of the coordinates of all samples
    # ==============================================================

    # samples_shape = (aligned_roi_num, bin_num, sample_h, sample_w)

    # unit_nums = akg.tvm.compute((2,), lambda i: two_const * group_size_const \
    #                                         * akg.tvm.expr.Select(i == 0, sample_w_const, sample_h_const), name = 'uint_nums')

    # width_height_shape(0, x, x, x) indicates the width of a single unit which is separated by samples
    # and width_height_shape(1, x, x, x) the height
    # unit_lengths = akg.tvm.compute(width_height_shape, lambda n, c, h, w: width_height_of_rois(n, c, h, w) / unit_nums(n), \
    #                            name = 'uint_lengths')

    # NOTE(review): by operator precedence these evaluate as
    # (length / sample_const) * group_size_const; if a division by
    # (sample_const * group_size_const) was intended, parentheses are missing.
    unit_w_lengths = akg.tvm.compute(
        width_shape,
        lambda n: width_of_rois(n) / sample_w_const * group_size_const,
        name='uint_w_lengths')
    unit_h_lengths = akg.tvm.compute(
        width_shape,
        lambda n: height_of_rois(n) / sample_h_const * group_size_const,
        name='uint_h_lengths')

    # samples_coors_x_shape = (aligned_roi_num, 1, group_size * sample_h, group_size * sample_w)
    # samples_x_coors_bias = akg.tvm.compute(samples_coors_x_shape, lambda n, c, h, w: unit_w_lengths[n] * \
    #                                         (one_const + w * two_const), name = 'samples_x_coors_bias')
    #
    # samples_y_coors_bias = akg.tvm.compute(samples_coors_x_shape, lambda n, c, h, w: unit_h_lengths[n] * \
    #                                         (one_const + w * two_const), name = 'samples_y_coors_bias')
    #
    # samples_x_coors = akg.tvm.compute(samples_coors_x_shape, lambda n, c, h, w: \
    #     samples_x_coors_bias(n, c, h, w) + scaled_coors(1, c, 1, 1), name = 'samples_x_coors')
    # samples_y_coors = akg.tvm.compute(samples_coors_x_shape, lambda n, c, h, w: \
    #     samples_y_coors_bias(n, c, h, w) + scaled_coors(2, c, 1, 1), name = 'samples_y_coors')

    sample_w_bias_shape = (1, group_size, sample_w, aligned_roi_num)
    # sample_w_bias = akg.tvm.compute(sample_w_bias_shape, lambda n, c, h, w: unit_w_lengths[w] * \
    #                                 (one_const + two_const * (c * sample_w_const + h)), name = 'samples_w_bias')
    # sample_w_bias = akg.tvm.compute(sample_w_bias_shape, lambda n, c, h, w: unit_w_lengths[w] * \
    #                                   (one_const + two_const * (sample_w_const)), name = 'samples_w_bias')

    sample_h_bias_shape = (1, group_size, sample_h, aligned_roi_num)
    # sample_h_bias = akg.tvm.compute(sample_h_bias_shape, lambda n, c, h, w: unit_h_lengths[w] * \
    #                                 (one_const + two_const * (c * sample_h_const + h)), name = 'samples_h_bias')
    # sample_h_bias = akg.tvm.compute(sample_h_bias_shape, lambda n, c, h, w: unit_h_lengths[w] * \
    #                                   (one_const + two_const * (sample_h_const)), name = 'samples_h_bias')

    # Hybrid script; group_size and aligned_roi_num are picked up through capture=locals().
    # For every group index c it writes unit_lengths[w] at h == 0 and then adds
    # ratio * unit_lengths[w] for each following h, so every c gets the same values.
    @akg.tvm.hybrid.script(capture=locals())
    def gen_bias(h_value, unit_lengths, ratio):
        output = output_tensor((1, group_size, h_value, aligned_roi_num),
                               'float16')

        # NOTE(review): `strides` is zero-initialised but never read afterwards.
        strides = allocate((aligned_roi_num, ), 'float16', 'local')
        for w in range(0, aligned_roi_num):
            strides[w] = half(0.0)

        for c in range(0, group_size):
            for h in range(0, 1):
                for w in range(0, aligned_roi_num):
                    output[0, c, h, w] = unit_lengths[w]
                    # strides[w] += unit_lengths[w] * ratio * half(h_value)

            for h in range(1, h_value):
                for w in range(0, aligned_roi_num):
                    output[0, c, h, w] = output[0, c, h - 1,
                                                w] + ratio * unit_lengths[w]

        return output

    # The sample counts are passed as the int32 constants, not as Python ints.
    sample_w_bias = gen_bias(sample_w_const, unit_w_lengths, two_const)
    sample_h_bias = gen_bias(sample_h_const, unit_h_lengths, two_const)

    # Absolute sample coordinates: bias plus x1 (scaled_coors[0]) resp. y1 (scaled_coors[1]).
    samples_x_coors = akg.tvm.compute(
        sample_w_bias_shape,
        lambda n, c, h, w: sample_w_bias(n, c, h, w) + scaled_coors(
            0, w, 0, 0),
        name='samples_x_coors')

    samples_y_coors = akg.tvm.compute(
        sample_h_bias_shape,
        lambda n, c, h, w: sample_h_bias(n, c, h, w) + scaled_coors(
            1, w, 0, 0),
        name='samples_y_coors')

    # ==============================================================
    # step 4: compute the low and high coordinates of samples for bilinear
    # ==============================================================
    # samples_x_coors_low = akg.tvm.compute(sample_w_bias_shape, lambda *indices: \
    #     akg.lang.ascend.floor(samples_x_coors(*indices)), name = 'samples_x_coors_low')
    # samples_x_coors_high = akg.tvm.compute(sample_w_bias_shape, lambda *indices: \
    #     akg.lang.ascend.ceil(samples_x_coors(*indices)), name = 'samples_x_coors_high')
    # samples_y_coors_low = akg.tvm.compute(sample_h_bias_shape, lambda *indices: \
    #     akg.lang.ascend.floor(samples_y_coors(*indices)), name = 'samples_y_coors_low')
    # samples_y_coors_high = akg.tvm.compute(sample_h_bias_shape, lambda *indices: \
    #     akg.lang.ascend.ceil(samples_y_coors(*indices)), name = 'samples_y_coors_high')
    samples_x_coors_low = akg.lang.ascend.floor(samples_x_coors)
    samples_x_coors_high = akg.lang.ascend.ceil(samples_x_coors)
    samples_y_coors_low = akg.lang.ascend.floor(samples_y_coors)
    samples_y_coors_high = akg.lang.ascend.ceil(samples_y_coors)

    # samples_x_coors_low = akg.tvm.compute(sample_w_bias_shape, lambda *indices: \
    #     akg.topi.cast(samples_x_coors(*indices), 'int32'), name = 'samples_x_coors_low')
    # samples_x_coors_high = akg.tvm.compute(sample_w_bias_shape, lambda *indices: \
    #     samples_x_coors_low(*indices) + akg.topi.cast(one_const, 'int32'), name = 'samples_x_coors_high')
    # samples_y_coors_low = akg.tvm.compute(sample_h_bias_shape, lambda *indices: \
    #     akg.topi.cast(samples_y_coors(*indices), 'int32'), name = 'samples_y_coors_low')
    # samples_y_coors_high = akg.tvm.compute(sample_h_bias_shape, lambda *indices: \
    #     samples_y_coors_low(*indices) + akg.topi.cast(one_const, 'int32'), name = 'samples_y_coors_high')

    # ==============================================================
    # step 5: compute the weight of low and high coordinates for bilinear
    # ==============================================================
    # wlx = akg.tvm.compute(samples_coors_x_shape, lambda *indices: samples_x_coors_high(*indices) - samples_x_coors(*indices))
    # whx = akg.tvm.compute(samples_coors_x_shape, lambda *indices: one_const - wlx(*indices))
    #
    # wly = akg.tvm.compute(samples_coors_x_shape, lambda *indices: samples_y_coors_high(*indices) - samples_y_coors(*indices))
    # why = akg.tvm.compute(samples_coors_x_shape, lambda *indices: one_const - wly(*indices))
    #
    # wlxXwly = akg.tvm.compute(samples_coors_x_shape, lambda *indices: wlx(*indices) * wly(*indices))
    # whxXwly = akg.tvm.compute(samples_coors_x_shape, lambda *indices: whx(*indices) * wly(*indices))
    # wlxXwhy = akg.tvm.compute(samples_coors_x_shape, lambda *indices: wlx(*indices) * why(*indices))
    # whxXwhy = akg.tvm.compute(samples_coors_x_shape, lambda *indices: whx(*indices) * why(*indices))

    # wlx / wly: distance from the sample to its ceil coordinate;
    # whx / why: one minus that distance.
    wlx = akg.tvm.compute(sample_w_bias_shape,
                          lambda *indices: samples_x_coors_high(*indices) -
                          samples_x_coors(*indices),
                          name='wlx')
    whx = akg.tvm.compute(sample_w_bias_shape,
                          lambda *indices: one_const - wlx(*indices),
                          name='whx')

    wly = akg.tvm.compute(sample_h_bias_shape,
                          lambda *indices: samples_y_coors_high(*indices) -
                          samples_y_coors(*indices),
                          name='wly')
    why = akg.tvm.compute(sample_h_bias_shape,
                          lambda *indices: one_const - wly(*indices),
                          name='why')

    # Products of one x weight and one y weight for every
    # (bin row i, bin column j, sample row m, sample column n, roi k).
    samples_shape = (group_size, group_size, sample_h, sample_w,
                     aligned_roi_num)
    wlxXwly = akg.tvm.compute(
        samples_shape,
        lambda i, j, m, n, k: wlx(0, j, n, k) * wly(0, i, m, k),
        name='wlxXwly')
    whxXwly = akg.tvm.compute(
        samples_shape,
        lambda i, j, m, n, k: whx(0, j, n, k) * wly(0, i, m, k),
        name='whxXwly')
    wlxXwhy = akg.tvm.compute(
        samples_shape,
        lambda i, j, m, n, k: wlx(0, j, n, k) * why(0, i, m, k),
        name='wlxXwhy')
    whxXwhy = akg.tvm.compute(
        samples_shape,
        lambda i, j, m, n, k: whx(0, j, n, k) * why(0, i, m, k),
        name='whxXwhy')

    boundaries_values_shape = (4, sample_h, sample_w, aligned_roi_num)
    bin_values_shape = (1, class_num + 1, bin_num, aligned_roi_num)
    gap_values_shape = (class_num + 1, aligned_roi_num)

    # Hybrid script meant to gather the four neighbouring feature-map values of
    # every sample. The gather is commented out: all four slots are filled with
    # one_value, so fm_in and the coordinate tensors are not read here.
    @akg.tvm.hybrid.script
    def fetch_data(shape, fm_in, c_idx, bin_idx, bin_num, group_size, sample_h,
                   sample_w, roi_num, x_low, x_high, y_low, y_high, one_value):
        boundaries_values = output_tensor(shape, 'float16')

        for i in range(0, sample_h):
            for j in range(0, sample_w):
                for k in range(0, roi_num):
                    # assume batch is 1

                    # w_low_idx =  x_low[0, bin_idx % group_size, j, k]
                    # w_high_idx =  x_high[0, bin_idx % group_size, j, k]
                    #
                    # h_low_idx =  y_low[0, bin_idx // group_size, i, k]
                    # h_high_idx =  y_high[0, bin_idx // group_size, i, k]

                    #x_low, y_low
                    boundaries_values[0, i, j, k] = one_value
                    boundaries_values[1, i, j, k] = one_value
                    boundaries_values[2, i, j, k] = one_value
                    boundaries_values[3, i, j, k] = one_value
                    # boundaries_values[0, i, j, k] = fm_in[0, c_idx * bin_num + bin_idx, h_low_idx, w_low_idx]
                    #
                    # #x_high, y_low
                    # boundaries_values[1, i, j, k] = fm_in[0, c_idx * bin_num + bin_idx, h_low_idx, w_high_idx]
                    #
                    # #x_low, y_high
                    # boundaries_values[2, i, j, k] = fm_in[0, c_idx * bin_num + bin_idx, h_high_idx, w_low_idx]
                    #
                    # #x_high, y_high
                    # boundaries_values[3, i, j, k] = fm_in[0, c_idx * bin_num + bin_idx, h_high_idx, w_high_idx]

        return boundaries_values

    # Hybrid script; shapes, counts and fetch_data are picked up through capture=locals().
    # Per channel c and bin b: weighted sum of the four boundary values per sample,
    # maximum over the samples, then mean over the bins.
    @akg.tvm.hybrid.script(capture=locals())
    def compute_bilinear_maxpool_gap(fm_in, x_low, x_high, y_low, y_high,
                                     wlxXwly_, whxXwly_, wlxXwhy_, whxXwhy_,
                                     one_value):

        bin_values = allocate(bin_values_shape, 'float16', 'local')

        # global average result
        gap_values = output_tensor(gap_values_shape, 'float16')

        for c in range(0, class_num + 1):
            for b in range(0, bin_num):
                boundaries_values = fetch_data(boundaries_values_shape, fm_in,
                                               c, b, bin_num, group_size,
                                               sample_h, sample_w, roi_num,
                                               x_low, x_high, y_low, y_high,
                                               one_value)

                # Column / row of bin b inside the group_size x group_size grid.
                k_w = b % group_size
                k_h = b // group_size

                # The running maximum starts at 0, so results never drop below 0.
                # Only the first roi_num entries are written; the tensors are
                # sized with aligned_roi_num.
                for n in range(0, roi_num):
                    bin_values[0, c, b, n] = half(0.0)

                for h in range(0, sample_h):
                    for w in range(0, sample_w):
                        for n in range(0, roi_num):
                            # bilinear
                            tmp = boundaries_values[0, h, w, n] * wlxXwly_[k_h, k_w, h, w, n] + \
                                boundaries_values[1, h, w, n] * whxXwly_[k_h, k_w, h, w, n] + \
                                boundaries_values[2, h, w, n] * wlxXwhy_[k_h, k_w, h, w, n] + \
                                boundaries_values[3, h, w, n] * whxXwhy_[k_h, k_w, h, w, n]

                            # maxpooling
                            if tmp > bin_values[0, c, b, n]:
                                bin_values[0, c, b, n] = tmp

            # global average pooling
            for j in range(0, roi_num):
                tmp1 = bin_values[0, c, 0, j]
                for k in range(1, bin_num):
                    tmp1 += bin_values[0, c, k, j]

                gap_values[c, j] = tmp1 / bin_num

        return gap_values

    # ==============================================================
    # step 6: compute results of bilinear, maxpooling and global average pooling
    # ==============================================================
    out = compute_bilinear_maxpool_gap(fm_data, samples_x_coors_low,
                                       samples_x_coors_high,
                                       samples_y_coors_low,
                                       samples_y_coors_high, wlxXwly, whxXwly,
                                       wlxXwhy, whxXwhy, one_const)

    # out = wlxXwhy

    # info = dim.Dim()
    # info.setdim(index=0, head = 0, body = 0, tail = 0, tilel1 = 1, tilel0 = 1)
    # info.setdim(index=0, head = 0, body = 0, tail = 0, tilel1 = 1, tilel0 = 1)

    # Built without a "dim" attribute (the manual tiling above is commented out).
    s = akg.tvm.create_schedule(out.op)
    with akg.build_config(add_lower_pass=utils.debug_mode(0),
                          dump_pass_ir=True):
        # mod = akg.tvm.build(s, [fm_data, roi_data, out], "cce", name="psroialign", attrs = {"dim" : str(info)}, polyhedral=True)
        mod = akg.build(s, [fm_data, roi_data, out],
                        "cce",
                        name="psroialign",
                        polyhedral=True)

    return mod
Пример #25
0
def group_conv_forward(_n, _h, _w, _c_i, _c_o, group, _k_h, _k_w,
                       _a, _b, bias_value, pad_h, pad_w, _s_h, _s_w,
                       cut_h, cut_co, cut_m, cut_k, cut_n, block_size,
                       use_bias=False,
                       kernel_name='group_conv'):
    """Describe a grouped convolution, build it for "cce" and return its output tensor.

    The feature map ``_a`` is read as ``[n, c1, h, w, c0]`` and the filter
    ``_b`` as ``[(kc1 * k_h + kh) * k_w + kw, c1, c0, kc0]``. Padding is
    symmetric: ``pad_h`` on top and bottom, ``pad_w`` on left and right.

    Args:
        _n, _c_i, _h, _w: feature-map batch, channel, height and width
            (forwarded as the ``pragma_conv_fm_*`` attributes).
        _c_o, _k_h, _k_w: kernel count, height and width
            (forwarded as the ``pragma_conv_kernel_*`` attributes).
        group: number of convolution groups; must divide ``_c_i`` and ``_c_o``.
        _a, _b: feature-map and filter tensors.
        bias_value: bias tensor, only read when ``use_bias`` is true.
        pad_h, pad_w: padding along height and width.
        _s_h, _s_w: strides along height and width.
        cut_h, cut_co, cut_m, cut_k, cut_n: tiling sizes forwarded as the
            ``pragma_conv_*_cut`` attributes and to ``set_dims_group``.
        block_size: size of the innermost channel block (c0).
        use_bias: when true, a bias-add stage is appended after the convolution.
        kernel_name: name given to the built kernel.

    Returns:
        The output tensor (convolution result, or convolution plus bias).
        The module produced by ``akg.build`` is not returned.
    """
    # Only _n is inspected: when it is not a plain int, every size argument
    # is converted through expr_to_int.
    if not isinstance(_n, int):
        geometry = expr_to_int((_n, _h, _w, _c_i, _c_o, group, _k_h, _k_w))
        _n, _h, _w, _c_i, _c_o, group, _k_h, _k_w = geometry
        pad_h, pad_w, _s_h, _s_w = expr_to_int((pad_h, pad_w, _s_h, _s_w))
        tiling = expr_to_int((cut_h, cut_co, cut_m, cut_k, cut_n, block_size))
        cut_h, cut_co, cut_m, cut_k, cut_n, block_size = tiling

    conv_dtype = 'float16'

    # A cut covering the whole height is widened by the top and bottom padding.
    if cut_h == _h:
        cut_h += pad_h + pad_h

    assert _c_o % group == 0 and _c_i % group == 0
    assert _c_o % block_size == 0 and (_c_i // group) % block_size == 0

    out_h = (_h + 2 * pad_h - _k_h) // _s_h + 1
    out_w = (_w + 2 * pad_w - _k_w) // _s_w + 1

    # Number of channel blocks that belong to one group.
    ci_blocks_per_group = (_c_i // block_size) // group
    co_blocks_per_group = (_c_o // block_size) // group

    kc1 = akg.tvm.reduce_axis((0, ci_blocks_per_group), name='kc1')
    kh = akg.tvm.reduce_axis((0, _k_h), name='kh')
    kw = akg.tvm.reduce_axis((0, _k_w), name='kw')
    kc0 = akg.tvm.reduce_axis((0, block_size), name='kc0')

    pad_top, pad_bottom, pad_left, pad_right = pad_h, pad_h, pad_w, pad_w
    output_name = 'output'
    output_bias_name = 'output_bias'

    conv_attrs = {
        "pragma_conv_kernel_n": _c_o,
        "pragma_conv_kernel_h": _k_h,
        "pragma_conv_kernel_w": _k_w,
        "pragma_conv_padding_top": pad_top,
        "pragma_conv_padding_bottom": pad_bottom,
        "pragma_conv_padding_left": pad_left,
        "pragma_conv_padding_right": pad_right,
        "pragma_conv_bypass_l1": 1,
        "pragma_conv_stride_h": _s_h,
        "pragma_conv_stride_w": _s_w,
        "pragma_conv_fm_n": _n,
        "pragma_conv_fm_c": _c_i,
        "pragma_conv_fm_h": _h,
        "pragma_conv_fm_w": _w,
        "pragma_conv_dilation_h": 1,
        "pragma_conv_dilation_w": 1,
        "pragma_conv_h_cut": cut_h,
        "pragma_conv_w_cut": _w + 2 * pad_w,
        "pragma_conv_co_cut": cut_co,
        "pragma_conv_m_cut": cut_m,
        "pragma_conv_k_cut": cut_k,
        "pragma_conv_n_cut": cut_n,
        "feature": _a.op.name,
        "filter": _b.op.name,
        "bias": 'bias',
        "res": output_name,
        "res_bias": output_bias_name,
    }

    # NOTE: the parameter names (n, c1, h, w, c0) are kept as in the original
    # lambda because the compute axes are named after them.
    def _conv_point(n, c1, h, w, c0):
        in_row = h * _s_h + kh
        in_col = w * _s_w + kw
        in_padding = akg.tvm.any(in_row < pad_top,
                                 in_row > (_h + pad_top - 1),
                                 in_col < pad_left,
                                 in_col > (_w + pad_left - 1))
        # Output block c1 reads the input blocks of the group it belongs to.
        fm_c1 = c1 // co_blocks_per_group * ci_blocks_per_group + kc1
        feature = akg.tvm.if_then_else(
            in_padding,
            akg.tvm.const(0.0, conv_dtype),
            _a[n, fm_c1, (in_row - pad_top), (in_col - pad_left), kc0])
        weight = _b[(kc1 * _k_h + kh) * _k_w + kw, c1, c0, kc0]
        return akg.lang.ascend.mmad(feature * weight,
                                    axis=[kc1, kh, kw, kc0])

    C = akg.tvm.compute((_n, _c_o // block_size, out_h, out_w, block_size),
                        _conv_point,
                        attrs=conv_attrs,
                        name=output_name)

    if use_bias:
        out = akg.tvm.compute(
            C.shape,
            lambda n, c1, h, w, c0:
            C[n, c1, h, w, c0] + bias_value[0, c1, 0, 0, c0],
            name=output_bias_name)
        bufs = [_a, _b, bias_value, out]
    else:
        out = C
        bufs = [_a, _b, out]

    # create schedule for cce
    s = akg.tvm.create_schedule([out.op])

    # set dim
    info = set_dims_group(cut_h, cut_co, cut_m, cut_k, cut_n,
                          expr_to_int(out.shape), _c_i, _c_o, group,
                          _k_h, _k_w, _s_h, block_size)

    # build; only the output tensor is handed back to the caller
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=False):
        akg.build(s, bufs, "cce", name=kernel_name,
                  attrs={"dim": info}, polyhedral=True)

    return out
Пример #26
0
def test_CCE_Conv(fmap_shape, filter_shape, pad_, stride_,
                  tile_hh=0, tile_coco=0, tile_mm=0, tile_kk=0, tile_nn=0, bypass_l1=False,
                  use_bias=False, kernel_name="quant_conv", cce_path='.'):
    """Build a quantize-then-convolve kernel for "cce" and optionally run it.

    The feature map is scaled, offset and cast to int8, then convolved with
    an int8 filter. Relies on module-level names not visible in this block
    (``block_size``, ``conv_dtype``, ``fusion``, ``run_cce``, ``run_conv``,
    ``h_window_cut``).

    Args:
        fmap_shape: feature-map shape ``(n, c, h, w)``.
        filter_shape: filter shape ``(n, c, h, w)``.
        pad_: ``(pad_h, pad_w)``; each value is applied on both sides.
        stride_: ``(stride_h, stride_w)``.
        tile_hh, tile_coco, tile_mm, tile_kk, tile_nn: tiling sizes used for
            the dim description and the ``pragma_conv_*_cut`` attributes.
        bypass_l1: sets ``pragma_conv_bypass_l1`` to 1 when true, else 0.
        use_bias: when true, a bias placeholder is added to the result.
        kernel_name: name given to the built kernel.
        cce_path: unused in this function.

    Returns:
        None.
    """
    # input shape (NCHW -> NC1HWC0)
    in_n, in_c, in_h, in_w = fmap_shape
    in_c1 = in_c // block_size
    input_shape_nc1hwc0 = (in_n, in_c1, in_h, in_w, block_size)

    # kernel shape (NCHW -> NC1HWC0 -> Fractal); the int8 filter layout uses
    # hard-coded blocks of 32 (reduction channels) and 16 (output channels)
    k_n, k_c, k_h, k_w = filter_shape
    k_c1, k_c0 = k_c // 32, 32
    kernel_shape_fractal = (k_c1 * k_h * k_w, k_n // 16, 16, 32)

    # bias shape
    bias_shape_nc1hwc0 = (1, k_n // block_size, 1, 1, block_size)

    # padding ((padding_h, padding_w) -> (top, bottom, left, right))
    p_top, p_bottom, p_left, p_right = pad_[0], pad_[0], pad_[1], pad_[1]

    # stride (stride_h, stride_w)
    s_h, s_w = stride_

    # A placeholder (NC1HWC0)
    A = akg.tvm.placeholder(input_shape_nc1hwc0, dtype=conv_dtype, name='FMap')
    # B placeholder (fractal)
    B = akg.tvm.placeholder(kernel_shape_fractal, dtype='int8', name='Filter')
    ScaleQ = akg.tvm.placeholder((16,), dtype='float16', name='ScaleQ')
    OffsetQ = akg.tvm.placeholder((16,), dtype='float16', name='OffsetQ')

    # Quantization stage: scale, offset, cast to int8 in a 32-wide block layout.
    quant_shape = (in_n, in_c // 32, in_h, in_w, 32)
    _, q_c1, q_h, q_w, q_c0 = quant_shape
    Quant = akg.tvm.compute(
        quant_shape,
        lambda qn, qc1, qh, qw, qc0: (
            A[qn, qc1 + qc0 // 16, qh, qw, qc0 % 16] * ScaleQ[0] + OffsetQ[0]
        ).astype('int8'),
        name='QuantOUT', attrs={'no_inline': 1})

    if use_bias:
        bias_name = 'bias'
        bias_value = akg.tvm.placeholder(bias_shape_nc1hwc0, dtype=conv_dtype, name=bias_name)
    else:
        bias_name = 'None'

    # Create reduction variables
    kc1 = akg.tvm.reduce_axis((0, k_c1), name='kc1')
    kh = akg.tvm.reduce_axis((0, k_h), name='kh')
    kw = akg.tvm.reduce_axis((0, k_w), name='kw')
    kc0 = akg.tvm.reduce_axis((0, k_c0), name='kc0')

    out_h = (in_h + p_top + p_bottom - k_h) // s_h + 1
    tile_out_h = (tile_hh - k_h) // s_h + 1
    out_w = (in_w + p_left + p_right - k_w) // s_w + 1

    out_c1, out_c0 = k_n // block_size, block_size
    out_shape_nc1hwc0 = (in_n, out_c1, out_h, out_w, out_c0)

    c1_cut = tile_coco // block_size if tile_coco > 0 else out_c1

    # set dim: an axis is only described when its extent is larger than 1
    info = dim.Dim()
    # band 0: quantization stage, same tile for L1 and L0
    quant_axes = (
        ("KO", q_c1, q_c1),
        ("C1", q_h, tile_out_h),
        ("C0", q_w, q_w),
        ("KI", q_c0, q_c0),
    )
    for axis, extent, tile in quant_axes:
        if extent > 1:
            info.setdim(index=0, axis=axis, tilel1=tile, tilel0=tile)

    # band 1: convolution stage
    conv_axes = (
        ("C1", out_c1, c1_cut),
        ("H", out_h, tile_out_h),
        ("W", out_w, out_w),
        ("C0", out_c0, out_c0),
        # Floor division keeps the tile size an int; "/" would yield a float
        # on Python 3 and leak e.g. "1.0" into the dim string.
        ("KC1", in_c1, in_c1 // 2),
        ("KH", k_h, k_h),
        ("KW", k_w, k_w),
    )
    for axis, extent, tile in conv_axes:
        if extent > 1:
            info.setdim(index=1, axis=axis, tilel1=tile, tilel0=0)
    info = str(info)

    # Compute the convolution
    output_name = "output0"
    output_bias_name = "output1"

    conv_attrs = {
        "pragma_conv_kernel_n": k_n,
        "pragma_conv_kernel_h": k_h,
        "pragma_conv_kernel_w": k_w,
        "pragma_conv_padding_top": p_top,
        "pragma_conv_padding_bottom": p_bottom,
        "pragma_conv_padding_left": p_left,
        "pragma_conv_padding_right": p_right,
        "pragma_conv_dilation_h": 1,
        "pragma_conv_dilation_w": 1,
        "pragma_conv_bypass_l1": 1 if bypass_l1 else 0,
        "pragma_conv_stride_h": s_h,
        "pragma_conv_stride_w": s_w,
        "pragma_conv_fm_n": in_n,
        "pragma_conv_fm_c": in_c,
        "pragma_conv_fm_h": in_h,
        "pragma_conv_fm_w": in_w,
        # NOTE(review): h_window_cut is not defined in this function; it is
        # presumably a module-level value (tile_out_h looks like the local
        # counterpart) - confirm before relying on it.
        "pragma_conv_h_cut": (h_window_cut - 1) * s_h + k_h,
        "pragma_conv_w_cut": (in_w + p_left + p_right),
        "pragma_conv_co_cut": c1_cut * k_c0,
        "pragma_conv_m_cut": tile_mm,
        "pragma_conv_k_cut": tile_kk,
        "pragma_conv_n_cut": tile_nn,
        "feature": Quant.op.name,
        "filter": B.op.name,
        "bias": bias_name,
        "res": output_name,
        "res_bias": output_bias_name,
    }

    C = akg.tvm.compute(
        out_shape_nc1hwc0,
        lambda n, c1, h, w, c0: akg.tvm.sum(
            akg.tvm.if_then_else(
                akg.tvm.any((h * s_h + kh) < p_top, (h * s_h + kh) > (in_h + p_top - 1),
                            (w * s_w + kw) < p_left, (w * s_w + kw) > (in_w + p_left - 1)),
                akg.tvm.const(0.0, 'int8'),
                Quant[n, kc1, (h * s_h + kh - p_top), (w * s_w + kw - p_left), kc0])
            * B[(kc1 * k_h + kh) * k_w + kw, c1, c0, kc0],
            axis=[kc1, kh, kw, kc0]),
        name=output_name,
        attrs=conv_attrs)

    if use_bias:
        cube = akg.tvm.compute(out_shape_nc1hwc0,
                               lambda n, c1, h, w, c0: C[n, c1, h, w, c0] + bias_value[0, c1, 0, 0, c0],
                               name=output_bias_name)
    else:
        cube = C

    if fusion:
        # leaky relu: max(x, negative_slope * x)
        negative_slope = 0.0
        slope_tmp = akg.tvm.const(negative_slope, dtype=conv_dtype)
        out = akg.lang.ascend.vmuls(cube, slope_tmp)
        out = akg.lang.ascend.vmax(out, cube)
    else:
        out = cube

    # schedule
    s = akg.tvm.create_schedule(out.op)

    # The buffer list is the same with or without fusion; only the optional
    # bias placeholder changes it.
    bufs = [A, B, ScaleQ, OffsetQ]
    if use_bias:
        bufs.append(bias_value)
    bufs.append(out)

    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, bufs, "cce", name=kernel_name,
                        attrs={"dim": info}, polyhedral=True)

    # Fetch the generated source as the original code did; the text itself is
    # not used any further here.
    mod.imported_modules[0].get_source()
    if run_cce:
        run_conv(mod, fmap_shape, filter_shape, pad_[0], stride_[0], use_bias)
Пример #27
0
def maxpool_ad_manual_schedule_no_overlap_all_max(shape,
                                                  kernel,
                                                  stride,
                                                  pad,
                                                  dtype,
                                                  attrs=None,
                                                  polyhedral=False):
    """Build the max-pool gradient kernel for non-overlapping windows.

    The gradient is supplied through a custom differentiation override:
    every input element equal to the maximum of its window receives the
    adjoint of that window, all other elements receive zero. A manual
    schedule places the intermediate tensors in "local.UB".

    Args:
        shape: input shape ``(batch, c1, h, w, c0)``.
        kernel: ``(kernel_h, kernel_w)``.
        stride: ``(stride_h, stride_w)``; must equal ``kernel``.
        pad: 4-tuple; only the first two entries are used, as the height and
            width padding.
        dtype: dtype of the input, forward-output and adjoint placeholders.
        attrs: forwarded to ``akg.build``.
        polyhedral: forwarded to ``akg.build``.

    Returns:
        The module produced by ``akg.build``.

    Raises:
        RuntimeError: from the differentiation override when the stride
            differs from the kernel size.
    """
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w, _, _ = pad
    batch_size, input_c1, input_h, input_w, input_c0 = shape
    pad_shape = (batch_size, input_c1, input_h + 2 * pad_h,
                 input_w + 2 * pad_w, input_c0)

    def custom_maxpool_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        in_data = inputs[0]

        if stride_w != kernel_w or stride_h != kernel_h:
            raise RuntimeError(
                "Only supports kernels with same dimensions as stride size!")

        # copy output to the shape of the padded input, copying the same value
        # for the entire kernel size (built once; the original built this
        # identical tensor twice and discarded the first)
        out_broadcast = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: out(b, c1, akg.tvm.floordiv(h, stride_h),
                                        akg.tvm.floordiv(w, stride_w), c0),
            name="out_broadcast")

        # copy head to the shape of the padded input, copying the same value
        # for the entire kernel size
        head_broadcast = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: head_(b, c1, akg.tvm.floordiv(h, stride_h),
                                          akg.tvm.floordiv(w, stride_w), c0),
            name="head_broadcast")

        # check if value was a maximum and assign head of that position if it
        # was; this is done for all the maximum values within one kernel
        result = akg.tvm.compute(
            in_data.shape,
            lambda b, c1, h, w, c0: akg.tvm.expr.Select(
                in_data(b, c1, h, w, c0) == out_broadcast(
                    b, c1, h + pad_h, w + pad_w, c0),
                head_broadcast(b, c1, h + pad_h, w + pad_w, c0),
                akg.tvm.const(0, dtype=in_data.dtype)),
            name="result")
        return [result]

    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1

    out_shape = (batch_size, input_c1, out_size_h, out_size_w, input_c0)

    # tensor for the input data
    data = akg.tvm.placeholder(shape, dtype, name="input_data")

    # maxpool output
    forward = akg.tvm.placeholder(out_shape, name="forward", dtype=dtype)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(out_shape, name="head", dtype=dtype)

    # override differentiation computation with custom function
    [dl_ddata
     ] = akg.differentiate(forward, [data],
                           head,
                           None,
                           None,
                           override={forward: ([data], custom_maxpool_fdiff)})

    # schedule for differentiation operation
    s = akg.tvm.create_schedule([dl_ddata.op])

    # get computations; positions 1 and 2 follow the operand order of the
    # Select expression in custom_maxpool_fdiff
    result = dl_ddata
    forward_broadcast = result.op.input_tensors[1]
    head_broadcast = result.op.input_tensors[2]

    # cache reads and writes
    result_ub = s.cache_write(result, "local.UB")
    data_ub = s.cache_read(data, "local.UB", [result_ub])
    head_ub = s.cache_read(head, "local.UB", [head_broadcast])
    forward_ub = s.cache_read(forward, "local.UB", [forward_broadcast])

    s[head_broadcast].set_scope("local.UB")
    s[forward_broadcast].set_scope("local.UB")

    s[head_ub].compute_at(s[head_broadcast], head_broadcast.op.axis[0])
    s[forward_ub].compute_at(s[forward_broadcast],
                             forward_broadcast.op.axis[0])
    s[data_ub].compute_at(s[result_ub], result_ub.op.axis[0])
    s[forward_broadcast].compute_at(s[result_ub], result_ub.op.axis[0])
    s[head_broadcast].compute_at(s[result_ub], result_ub.op.axis[0])

    _, c1, h, _, _ = result.op.axis

    # Larger padded inputs are split along h in chunks of 4; smaller ones are
    # attached at the c1 axis.
    if input_h + 2 * pad_h > 32 or input_w + 2 * pad_w > 32:
        h_outer, _ = s[result].split(h, 4)
        s[result_ub].compute_at(s[result], h_outer)
    else:
        s[result_ub].compute_at(s[result], c1)

    kernel_name = "maxpool_ad_manual_schedule_no_overlap_all_max"
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, forward, dl_ddata],
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        create_code(kernel_name, './', source_code)
    return mod
Пример #28
0
def maxpool_ad_manual_schedule_all_max(shape,
                                       kernel,
                                       stride,
                                       pad,
                                       dtype,
                                       polyhedral=True,
                                       attrs=None):
    """Build the max-pool gradient kernel where every maximum receives the adjoint.

    The gradient is supplied through a custom differentiation override and
    scheduled manually, with the intermediate tensors placed in "local.UB".

    Args:
        shape: input shape ``(batch, c1, h, w, c0)``.
        kernel: ``(kernel_h, kernel_w)``.
        stride: ``(stride_h, stride_w)``.
        pad: 4-tuple; only the first two entries are used, as the height and
            width padding.
        dtype: dtype of the placeholders and of the zero constants.
        polyhedral: forwarded to ``akg.build``.
        attrs: forwarded to ``akg.build``.

    Returns:
        The module produced by ``akg.build``.
    """
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w, _, _ = pad
    batch_size, input_c1, input_h, input_w, input_c0 = shape
    pad_shape = (batch_size, input_c1, input_h + 2 * pad_h,
                 input_w + 2 * pad_w, input_c0)
    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1
    out_shape = (batch_size, input_c1, out_size_h, out_size_w, input_c0)

    def custom_maxpool_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        in_data = inputs[0]

        # Layout with the in-window offset (wh, ww) as the two leading axes.
        data_separated_by_windows = (kernel_h, kernel_w, batch_size, input_c1,
                                     out_size_h, out_size_w, input_c0)

        # Input copied into the padded shape, with zeros in the padding area.
        pad_data = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: akg.tvm.expr.Select(
                akg.tvm.all(h >= pad_h, h < input_h + pad_h, w >= pad_w, w <
                            input_w + pad_w),
                in_data(b, c1, h - pad_h, w - pad_w, c0),
                akg.tvm.const(0.0, dtype=dtype)),
            name="pad_data")

        # For each output position (oh, ow), the padded input element at
        # window offset (wh, ww).
        data_reshaped = akg.tvm.compute(
            data_separated_by_windows,
            lambda wh, ww, b, c1, oh, ow, c0: pad_data(
                b, c1, oh * stride_h + wh, ow * stride_w + ww, c0),
            name="data_reshaped")

        # Forward (max) output repeated for every window offset.
        max_broadcast = akg.tvm.compute(
            data_separated_by_windows,
            lambda wh, ww, b, c1, oh, ow, c0: out(b, c1, oh, ow, c0),
            name="max_broadcast")

        # Adjoint where the element equals the window maximum, zero elsewhere.
        equal = akg.tvm.compute(
            data_separated_by_windows,
            lambda wh, ww, b, c1, oh, ow, c0: akg.tvm.expr.Select(
                max_broadcast(wh, ww, b, c1, oh, ow, c0) == data_reshaped(
                    wh, ww, b, c1, oh, ow, c0), head_(b, c1, oh, ow, c0),
                akg.tvm.const(0.0, dtype=dtype)),
            name="equal")

        # Map each window's contribution back to padded input coordinates:
        # zero outside the window of (oh, ow), the `equal` value inside it.
        data_reorg = akg.tvm.compute(
            (out_size_h, out_size_w, batch_size, input_c1, input_h + 2 * pad_h,
             input_w + 2 * pad_w, input_c0),
            lambda oh, ow, b, c1, h, w, c0: akg.tvm.expr.Select(
                akg.tvm.any(h < oh * stride_h, h > oh * stride_h + kernel_h -
                            1, w < ow * stride_w, w > ow * stride_w + kernel_w
                            - 1), akg.tvm.const(0, dtype=dtype),
                equal(h - oh * stride_h, w - ow * stride_w, b, c1, oh, ow, c0)
            ),
            name="data_reorg")

        # Sum the contributions of all windows (axes 0 and 1 are oh, ow).
        result_pad = akg.topi.sum(data_reorg, [0, 1])

        # Drop the padding border to get back to the input shape.
        result = akg.tvm.compute(shape,
                                 lambda b, c1, h, w, c0: result_pad(
                                     b, c1, h + pad_h, w + pad_w, c0),
                                 name="result")

        return [result]

    # tensor for the input data
    data = akg.tvm.placeholder(shape, dtype, name="input_data")

    # maxpool output
    forward = akg.tvm.placeholder(out_shape, name="forward", dtype=dtype)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(out_shape, name="head", dtype=dtype)

    # override differentiation computation with custom function
    [dl_ddata
     ] = akg.differentiate(forward, [data],
                           head,
                           None,
                           None,
                           override={forward: ([data], custom_maxpool_fdiff)})

    # schedule for differentiation operation
    s = akg.tvm.create_schedule([dl_ddata.op])

    # get computations
    # The intermediate tensors are recovered by walking input_tensors; the
    # indices follow the operand order used in custom_maxpool_fdiff above.
    result = dl_ddata
    result_pad = result.op.input_tensors[0]
    data_reorg = result_pad.op.input_tensors[0]
    equal = data_reorg.op.input_tensors[0]
    max_broadcast = equal.op.input_tensors[0]
    data_reshaped = equal.op.input_tensors[1]
    pad_data = data_reshaped.op.input_tensors[0]

    # cache reads and writes in "local.UB"
    data_ub = s.cache_read(data, "local.UB", [pad_data])
    head_ub = s.cache_read(head, "local.UB", [equal])
    forward_ub = s.cache_read(forward, "local.UB", [max_broadcast])
    result_ub = s.cache_write(result, "local.UB")

    s[max_broadcast].set_scope("local.UB")
    s[data_reshaped].set_scope("local.UB")
    s[pad_data].set_scope("local.UB")
    s[equal].set_scope("local.UB")
    s[data_reorg].set_scope("local.UB")
    s[result_pad].set_scope("local.UB")

    s[data_ub].compute_inline()
    s[result_ub].compute_inline()
    s[pad_data].compute_inline()

    # equal dependencies
    s[forward_ub].compute_at(s[equal], equal.op.axis[0])
    s[max_broadcast].compute_at(s[equal], equal.op.axis[0])
    s[data_reshaped].compute_at(s[equal], equal.op.axis[0])
    s[head_ub].compute_at(s[equal], equal.op.axis[0])

    s[equal].compute_at(s[result_pad], result_pad.op.axis[0])

    # result dependencies
    s[data_reorg].compute_inline()
    b, c1, h, w, c0 = result_pad.op.axis
    oh, ow = result_pad.op.reduce_axis
    # reduction axes (oh, ow) are moved outermost
    s[result_pad].reorder(oh, ow, b, c1, h, w, c0)

    b, c1, h, w, c0 = result.op.axis
    # split the result rows by stride_h and compute result_pad per outer chunk
    h_out, _ = s[result].split(h, stride_h)
    s[result_pad].compute_at(s[result], h_out)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, forward, dl_ddata],
                        "cce",
                        name="maxpool_ad_manual_schedule_all_max",
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "maxpool_ad_manual_schedule_all_max"
        create_code(kernel_name, './', source_code)
    return mod
Пример #29
0
def conv_02(fmap_shape,
            filter_shape,
            pad_,
            stride_,
            dilation_,
            tile_hh=0,
            tile_coco=0,
            tile_mm=0,
            tile_kk=0,
            tile_nn=0,
            bypass_l1=False,
            use_bias=False,
            block_size=16,
            conv_dtype='float16'):
    """Build the kernels used for the convolution gradients w.r.t. data and weight.

    Eight "cce" kernels are built in sequence: the flipped weight, the
    transposed/converted head, the strided head, the transposed data, the
    differentiated convolution w.r.t. the data ("conv_AD_data"), a plain
    forward convolution on the same inputs ("conv_data"), the differentiated
    convolution w.r.t. the weight ("conv_AD_weight") and a plain forward
    convolution on the same inputs ("conv_weight").

    Args:
        fmap_shape: feature-map shape ``(n, c, h, w)``.
        filter_shape: filter shape ``(n, c, h, w)``.
        pad_: padding, forwarded to ``conv_compute_forward``.
        stride_: ``(stride_h, stride_w)``.
        dilation_: dilation, forwarded to ``conv_compute_forward``.
        tile_hh, tile_coco, tile_mm, tile_kk, tile_nn: tiling sizes forwarded
            to ``conv_compute_forward`` and ``set_dims``.
        bypass_l1: forwarded to ``conv_compute_forward``.
        use_bias: when true, a bias placeholder is created for the forward
            convolution.
        block_size: channel block size used for the 5D and fractal layouts.
        conv_dtype: dtype of the placeholders.

    Returns:
        Tuple ``(mod_AD_data, mod_AD_weight, mod_transposed,
        mod_head_transposed_converted, mod_head_strided, mod_weight_flipped)``.
        The "conv_data" and "conv_weight" modules are built but not returned.
    """

    # input shape (NCHW -> NC1HWC0)
    in_n, in_c, in_h, in_w = fmap_shape
    # channel counts are rounded up to a multiple of block_size
    in_c = (in_c + block_size - 1) // block_size * block_size
    # kernel shape (NCHW -> NC1HWC0 -> Fractal)
    k_n, k_c, k_h, k_w = filter_shape
    k_c = (k_c + block_size - 1) // block_size * block_size
    k_n = (k_n + block_size - 1) // block_size * block_size

    input_shape_nc1hwc0 = (in_n, in_c // block_size, in_h, in_w, block_size)
    in_n, _, in_h, in_w, _ = input_shape_nc1hwc0

    kernel_shape_nc1hwc0 = (k_n, k_c // block_size, k_h, k_w, block_size)
    k_n, _, k_h, k_w, _ = kernel_shape_nc1hwc0
    kernel_shape_fractal = (k_c // block_size * k_h * k_w, k_n // block_size,
                            block_size, block_size)

    # A placeholder (NC1HWCO)
    A = akg.tvm.placeholder(input_shape_nc1hwc0,
                            dtype=conv_dtype,
                            name="input0")
    # B_placeholder (fractal)
    B = akg.tvm.placeholder(kernel_shape_fractal,
                            dtype=conv_dtype,
                            name="input1")

    if use_bias:
        bias_shape_nc1hwc0 = (1, k_n // block_size, 1, 1, block_size)
        bias_name = "input2"
        bias_value = akg.tvm.placeholder(bias_shape_nc1hwc0,
                                         dtype=conv_dtype,
                                         name=bias_name)
    else:
        bias_name = 'None'
        bias_value = None

    # Forward convolution; it is the expression that gets differentiated below.
    conv_forward = conv_compute_forward(fmap_shape, filter_shape, pad_,
                                        stride_, dilation_, A, B, bias_value,
                                        tile_hh, tile_coco, tile_mm, tile_kk,
                                        tile_nn, bypass_l1, use_bias,
                                        block_size, conv_dtype)

    k_hw = k_h * k_w
    # index of the last kernel position; used to reverse the kernel positions
    const_shift = k_hw - 1

    # B in Fractal format; result in Fractal format
    # Reverses the order of the k_hw kernel positions and swaps the two
    # leading block axes as well as the two inner block axes.
    def flip_weight(B, k_c, k_hw, const_shift):
        out_shape = (B.shape[1].value * k_hw, k_c // block_size, block_size,
                     block_size)
        B_flip = akg.tvm.compute(
            out_shape,
            lambda i0, i1, i2, i3: B[i1 * k_hw + const_shift - truncmod(
                i0, k_hw),
                                     floordiv(i0, k_hw), i3, i2],
            name=B.name + "_flipped")
        return B_flip

    # H in 5D format; result in 5D format
    # Spreads H over a grid with step (s_h, s_w), filling the gaps with zeros.
    def strided_head(H, s_h, s_w):
        n, c1, h, w, c0 = H.shape
        out_shape = (n, c1, (h - 1) * s_h + 1, (w - 1) * s_w + 1, c0)
        H_strided = akg.tvm.compute(
            out_shape,
            lambda i0, i1, i2, i3, i4: akg.tvm.expr.Select(
                akg.tvm.any(truncmod(i2, s_h) != 0,
                            truncmod(i3, s_w) != 0),
                akg.tvm.const(0.0, dtype="float16"), H[i0, i1,
                                                       floordiv(i2, s_h),
                                                       floordiv(i3, s_w), i4]),
            name=H.name + "_strided")

        return H_strided

    # A in 5D format; result in 5D format
    # Exchanges the batch axis with the (c1, c0) channel axes.
    def transpose_data(A):
        out_shape = (A.shape[1].value * block_size,
                     A.shape[0].value // block_size, A.shape[2].value,
                     A.shape[3].value, block_size)

        A_transpose = akg.tvm.compute(
            out_shape,
            lambda j0, j1, j2, j3, j4: A[j1 * block_size + j4,
                                         floordiv(j0, block_size), j2, j3,
                                         truncmod(j0, block_size)],
            name=A.name + "_transposed")
        return A_transpose

    # Head is in 5D format; result in Fractal format
    # Done as reshape to 6D -> transpose -> reshape to the 4D fractal shape.
    def transpose_convert_head(Head):
        out_shape = ((Head.shape[0].value // block_size) *
                     Head.shape[2].value * Head.shape[3].value,
                     Head.shape[1].value, block_size, block_size)
        tmp_6D_shape = (Head.shape[0].value // block_size, block_size,
                        Head.shape[1].value, Head.shape[2].value,
                        Head.shape[3].value, block_size)
        Head_6D = akg.topi.reshape(Head, tmp_6D_shape)
        Head_6D_transpose = akg.topi.transpose(Head_6D, (0, 3, 4, 2, 5, 1))
        Head_transpose_convert = akg.topi.reshape(Head_6D_transpose, out_shape)
        return Head_transpose_convert

    # Adjoint placeholder with the shape of the forward output.
    HEAD = akg.tvm.placeholder(conv_forward.shape,
                               name="Head",
                               dtype='float16')
    # NCHW-style shape tuples describing the transformed tensors; they are
    # passed to set_dims / conv_compute_forward below.
    Head_transposed_NCHW = (HEAD.shape[1].value * HEAD.shape[4].value,
                            HEAD.shape[0].value, HEAD.shape[2].value,
                            HEAD.shape[3].value)
    s_h, s_w = stride_
    Head_strided_NCHW = (HEAD.shape[0].value,
                         HEAD.shape[1].value * HEAD.shape[4].value,
                         (HEAD.shape[2].value - 1) * s_h + 1,
                         (HEAD.shape[3].value - 1) * s_w + 1)

    A_transposed_NCHW = (in_c, in_n, in_h, in_w)
    K_flip_rot_NCHW = (k_c, k_n, k_h, k_w)

    # Each transformed tensor also gets a placeholder of the same shape, so
    # the convolutions below take the transformed data as plain inputs.
    Head_transposed_converted = transpose_convert_head(HEAD)
    pld_Head_transposed_converted = akg.tvm.placeholder(
        Head_transposed_converted.shape,
        name="Head_trans_fractal",
        dtype=conv_dtype)
    A_transposed = transpose_data(A)
    pld_A_transposed = akg.tvm.placeholder(A_transposed.shape,
                                           name="A_trans",
                                           dtype=conv_dtype)

    # Tile size 1 on four axes; shared by the four layout-transform kernels.
    info = dim.Dim()
    info.setdim(index=0, axis=0, tilel1=1, tilel0=1)
    info.setdim(index=0, axis=1, tilel1=1, tilel0=1)
    info.setdim(index=0, axis=2, tilel1=1, tilel0=1)
    info.setdim(index=0, axis=3, tilel1=1, tilel0=1)

    B_flip = flip_weight(B, k_c, k_hw, const_shift)
    pld_B_flipped = akg.tvm.placeholder(B_flip.shape,
                                        name="B_flip",
                                        dtype=conv_dtype)

    # Kernel 1: flipped weight.
    s_flipped = akg.tvm.create_schedule(B_flip.op)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_weight_flipped = akg.build(s_flipped, [B, B_flip],
                                       "cce",
                                       name=B.name + "_flipped",
                                       attrs={"dim": str(info)},
                                       polyhedral=True)

    # Kernel 2: head transposed and converted to the fractal layout.
    s_transposed_converted = akg.tvm.create_schedule(
        Head_transposed_converted.op)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_head_transposed_converted = akg.build(
            s_transposed_converted, [HEAD, Head_transposed_converted],
            "cce",
            name="H_trans_converted",
            attrs={"dim": str(info)},
            polyhedral=True)

    Head_strided = strided_head(HEAD, s_h, s_w)
    pld_Head_strided = akg.tvm.placeholder(Head_strided.shape,
                                           name="Head_trans_5D",
                                           dtype=conv_dtype)

    # Kernel 3: strided head.
    s_strided = akg.tvm.create_schedule(Head_strided.op)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_head_strided = akg.build(s_strided, [HEAD, Head_strided],
                                     "cce",
                                     name="H_strided",
                                     attrs={"dim": str(info)},
                                     polyhedral=True)

    # Kernel 4: transposed data.
    s_transposed = akg.tvm.create_schedule(A_transposed.op)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_transposed = akg.build(s_transposed, [A, A_transposed],
                                   "cce",
                                   name="A_transposed",
                                   attrs={"dim": str(info)},
                                   polyhedral=True)

    # Kernel 5: gradient w.r.t. the data A, with the strided head and the
    # flipped weight supplied as placeholders.
    ad_attrs = {"ad_conv_enable": 1, "ad_conv_reuse_conv": 1}
    jacs = list(
        akg.differentiate(conv_forward, [A], HEAD, ad_attrs,
                          [pld_Head_strided, pld_B_flipped, None]))
    info = set_dims(Head_strided_NCHW, (k_c, k_n, k_h, k_w),
                    (k_h - 1, k_w - 1), (1, 1), (1, 1), tile_hh, tile_coco,
                    tile_mm, tile_kk, tile_nn, block_size)

    sjac = akg.tvm.create_schedule([jacs[0].op])
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_AD_data = akg.build(sjac,
                                [pld_Head_strided, pld_B_flipped, jacs[0]],
                                "cce",
                                name="conv_AD_data",
                                attrs={"dim": str(info)},
                                polyhedral=True)

    # Kernel 6: the same data-gradient written as a plain forward convolution
    # (padding k - 1, stride and dilation 1); its module is discarded.
    conv_data = conv_compute_forward(Head_strided_NCHW, K_flip_rot_NCHW,
                                     (k_h - 1, k_h - 1, k_w - 1, k_w - 1),
                                     (1, 1), (1, 1), pld_Head_strided,
                                     pld_B_flipped, None, tile_hh, tile_coco,
                                     tile_mm, tile_kk, tile_nn, bypass_l1,
                                     use_bias, block_size, conv_dtype)

    info = set_dims(Head_strided_NCHW, (k_c, k_n, k_h, k_w),
                    (k_h - 1, k_w - 1), (1, 1), (1, 1), tile_hh, tile_coco,
                    tile_mm, tile_kk, tile_nn, block_size)

    s_data = akg.tvm.create_schedule(conv_data.op)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        _ = akg.build(s_data, [pld_Head_strided, pld_B_flipped, conv_data],
                      "cce",
                      name="conv_data",
                      attrs={"dim": str(info)},
                      polyhedral=True)

    # Kernel 7: gradient w.r.t. the weight B, with the transposed data and
    # the transposed/converted head supplied as placeholders.
    ad_attrs = {"ad_conv_enable": 1, "ad_conv_reuse_conv": 1}
    jacs = list(
        akg.differentiate(
            conv_forward, [B], HEAD, ad_attrs,
            [pld_A_transposed, pld_Head_transposed_converted, None]))
    info = set_dims(A_transposed_NCHW, Head_transposed_NCHW, (0, 0), (1, 1),
                    (s_h, s_w), tile_hh, tile_coco, tile_mm, tile_kk, tile_nn,
                    block_size)

    sjac = akg.tvm.create_schedule([jacs[0].op])
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_AD_weight = akg.build(
            sjac, [pld_A_transposed, pld_Head_transposed_converted, jacs[0]],
            "cce",
            name="conv_AD_weight",
            attrs={"dim": str(info)},
            polyhedral=True)

    # Kernel 8: the same weight-gradient written as a plain forward
    # convolution (no padding, stride 1, (s_h, s_w) in the dilation slot);
    # its module is discarded.
    conv_weight = conv_compute_forward(
        A_transposed_NCHW, Head_transposed_NCHW, (0, 0, 0, 0), (1, 1),
        (s_h, s_w), pld_A_transposed, pld_Head_transposed_converted, None,
        tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, bypass_l1, use_bias,
        block_size, conv_dtype)

    info = set_dims(A_transposed_NCHW, Head_transposed_NCHW, (0, 0), (1, 1),
                    (s_h, s_w), tile_hh, tile_coco, tile_mm, tile_kk, tile_nn,
                    block_size)

    s_weight = akg.tvm.create_schedule(conv_weight.op)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        akg.build(
            s_weight,
            [pld_A_transposed, pld_Head_transposed_converted, conv_weight],
            "cce",
            name="conv_weight",
            attrs={"dim": str(info)},
            polyhedral=True)

    return mod_AD_data, mod_AD_weight, mod_transposed, mod_head_transposed_converted, mod_head_strided, mod_weight_flipped
Пример #30
0
def reduce_max_ad_optimized_manual_schedule(input_shape,
                                            dtype,
                                            axis,
                                            keepdims,
                                            polyhedral=True,
                                            attrs=None):
    """Build the reduce_max adjoint kernel with a hand-written schedule.

    The forward op is ``reduce_max(data, axis, keepdims)``. Its
    differentiation is overridden by ``custom_reduce_max_fdiff``, which
    forwards the head (adjoint) value to every position that equals the
    maximum along ``axis`` and writes zero elsewhere. The resulting compute
    is then scheduled manually: inputs are cache-read into "local.UB", the
    output is cache-written to "local.UB", the output axes are split by the
    tiling factors, and all intermediate stages are computed at the outer
    iterator of the first split.

    Args:
        input_shape: shape of the input placeholder.
        dtype (str): dtype of the input placeholder. When it is not
            'float16' the input and head are cast to float16 for the
            computation and the result is cast back to ``dtype``.
        axis: reduction axis (or axes) forwarded to ``reduce_max``.
        keepdims (bool): forwarded to ``reduce_max``; also controls how the
            head tensor is indexed when it is broadcast back.
        polyhedral (bool): forwarded to ``akg.build``.
        attrs (dict): build attributes, forwarded to ``akg.build``. Must
            contain a 'tile' entry holding one split factor per dimension
            of the output. Required despite the ``None`` default.

    Returns:
        The module returned by ``akg.build``. As a side effect the generated
        source is written out through ``create_code(kernel_name, './', ...)``.

    Raises:
        ValueError: if ``attrs`` is None.
        KeyError: if ``attrs`` has no 'tile' entry.
        AssertionError: if the number of tiling factors does not match the
            rank of the output tensor.
    """
    # Validate the tiling configuration before building anything, so a
    # missing/invalid `attrs` fails immediately instead of after the whole
    # compute graph and schedule have been constructed.
    if attrs is None:
        raise ValueError('attrs is None')
    tiling_factors = attrs['tile']

    kernel_name = "reduce_max_ad_manual_schedule"
    # The computation itself always runs in float16; any other dtype is cast
    # on the way in and on the way out.
    needs_cast = dtype != 'float16'

    def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        # Signature is the one expected by the `override` mechanism of
        # akg.differentiate; only `inputs` and `head_` are used here.
        data_ = inputs[0]
        shape = data_.shape
        # reduces maximum value for each column
        max_ = akg.lang.ascend.reduce_max(data_, axis=axis, keepdims=True)
        # copies reduced values to get the original shape
        max_broadcast = akg.lang.ascend.broadcast(max_, shape)
        # head broadcast is needed to generate correct cce code for the selection operation
        head_broadcast = akg.tvm.compute(
            shape, lambda *indices: head_(*get_reduced_indices(
                *indices, axis=axis, keepdims=keepdims)))
        # zero all the values that are not max values on the result, remaining is equal to the adjoint of the output
        max_values_and_zeros = akg.tvm.compute(
            shape,
            lambda *indices: akg.tvm.expr.Select(
                data_(*indices) == max_broadcast(*indices),
                head_broadcast(*indices), akg.tvm.const(0, dtype='float16')),
            name="reduce_max_ad2")
        # cast data back to the original dtype
        if needs_cast:
            return [Cast(max_values_and_zeros, dtype, target=utils.CCE)]
        return [max_values_and_zeros]

    # tensor for the input data
    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # computation of reduce max
    # not used on the schedule because this is the differentiation op
    forward = reduce_max(data, axis, keepdims, target=utils.CCE)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(forward.shape, name="head", dtype=forward.dtype)

    # cast input data
    if needs_cast:
        data_cast = Cast(data, "float16", target=utils.CCE)
        head_cast = Cast(head, "float16", target=utils.CCE)
    else:
        data_cast = data
        head_cast = head

    # override differentiation computation with custom function
    [dl_ddata] = akg.differentiate(
        forward, [data_cast],
        head_cast,
        None,
        None,
        override={forward: ([data_cast], custom_reduce_max_fdiff)})

    # get tensors from custom function
    # NOTE(review): the input_tensors indices below appear to rely on the
    # operand order of the Select in custom_reduce_max_fdiff
    # (data, max_broadcast, head_broadcast) - confirm before reordering it.
    if needs_cast:
        # dl_ddata is the final Cast; the Select stage is its only input
        max_values_and_zeros = dl_ddata.op.input_tensors[0]
        max_broadcast = max_values_and_zeros.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = max_values_and_zeros.op.input_tensors[2]
    else:
        # no cast: dl_ddata is the Select stage itself
        max_broadcast = dl_ddata.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = dl_ddata.op.input_tensors[2]

    # schedule for differentiation operation
    # inputs: data and head
    s = akg.tvm.create_schedule([dl_ddata.op])

    # cache reads of inputs
    if needs_cast:
        head_ub = s.cache_read(head, "local.UB", [head_cast])
        data_ub = s.cache_read(data, "local.UB", [data_cast])
    else:
        # no cast operation
        head_ub = s.cache_read(head_cast, "local.UB", [head_broadcast])
        data_ub = s.cache_read(data_cast, "local.UB", [max_, dl_ddata])

    # cache write for the output
    dl_ddata_ub = s.cache_write(dl_ddata, "local.UB")

    # Explicit check (not a bare `assert`) so it still runs under
    # `python -O`; AssertionError is kept so callers see the same type.
    if len(tiling_factors) != len(dl_ddata.shape):
        raise AssertionError(
            "attrs['tile'] must provide one factor per output dimension")

    # split the final compute and save the iterators
    split_iterators = []
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[dl_ddata].split(dl_ddata.op.axis[index],
                                                 factor))

    # outer iterator of the first split: every stage is attached here
    iterator1 = split_iterators[0][0]

    # move computation of the cast stages when there is a cast
    if needs_cast:
        s[data_cast].compute_at(s[dl_ddata], iterator1)
        s[data_cast].set_scope("local.UB")
        s[head_cast].compute_at(s[dl_ddata], iterator1)
        s[head_cast].set_scope("local.UB")
        s[max_values_and_zeros].compute_at(s[dl_ddata], iterator1)
        s[max_values_and_zeros].set_scope("local.UB")

    # move cache reads and writes
    s[data_ub].compute_at(s[dl_ddata], iterator1)
    s[head_ub].compute_at(s[dl_ddata], iterator1)
    s[dl_ddata_ub].compute_at(s[dl_ddata], iterator1)

    # move computation of the differentiation
    s[max_].compute_at(s[dl_ddata], iterator1)
    s[max_].set_scope("local.UB")
    s[max_broadcast].compute_at(s[dl_ddata], iterator1)
    s[max_broadcast].set_scope("local.UB")
    s[head_broadcast].compute_at(s[dl_ddata], iterator1)
    s[head_broadcast].set_scope("local.UB")

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, dl_ddata],
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=polyhedral)
        # dump the generated kernel source next to the working directory
        source_code = mod.imported_modules[0].get_source()
        create_code(kernel_name, './', source_code)
    return mod