Example #1
def logsoftmax_ad(shape, dtype, axis, kernel_name, attrs):
    """Compute the gradient of logsoftmax by autodiff."""
    check_list = ["float16"]
    if not dtype.lower() in check_list:
        raise RuntimeError("logsoftmax test only support %s while dtype is %s" % (",".join(check_list), dtype))
    # check_shape(shape)
    if axis < 0:
        axis = len(shape) + axis
    if axis >= len(shape):
        raise RuntimeError("axis should be less than dimension")
    if axis != len(shape) - 1:
        raise RuntimeError("Only support the last axis currently")

    shape_new = [shape[-2], shape[-1]]
    if len(shape) > 2:
        for i in range(len(shape) - 2):
            shape_new[0] = shape_new[0] * shape[i]
    shape = shape_new

    a_up = akg.tvm.placeholder(shape, dtype=dtype, name="input")
    b_up = logsoftmax.logsoftmax_op(a_up, shape, axis)

    head = akg.tvm.placeholder(b_up.shape, name="head", dtype=dtype)
    _jacs = list(akg.differentiate(b_up, [a_up], head))
    sjac = akg.tvm.create_schedule([_jacs[0].op])
    sjac[_jacs[0].op.input_tensors[1]].compute_inline()
    op_vars = [head, a_up, _jacs[0]]

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sjac, op_vars, "cce", name="test2", attrs=attrs, polyhedral=True)
        return mod
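
For reference, the vector-Jacobian product of log-softmax that this autodiff kernel is expected to reproduce is head - softmax(x) * sum(head) along the reduced axis. Below is a minimal NumPy sketch of that formula (the names are illustrative and not part of the kernel above):

import numpy as np

def logsoftmax_grad_reference(x, head, axis=-1):
    # analytic gradient of logsoftmax contracted with head: head - softmax(x) * sum(head)
    x_max = np.max(x, axis=axis, keepdims=True)
    softmax = np.exp(x - x_max) / np.sum(np.exp(x - x_max), axis=axis, keepdims=True)
    return head - softmax * np.sum(head, axis=axis, keepdims=True)

x = np.random.rand(16, 16).astype(np.float16)
head = np.random.rand(16, 16).astype(np.float16)
print(logsoftmax_grad_reference(x, head).shape)  # (16, 16)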
Example #2
def invert_permutation_run(shape, dtype, attrs):
    # check shapes
    vc_util.check_shape(shape)

    if dtype.lower() != "int32":
        raise RuntimeError(
            "indices_dtype only supports int32 while dtype is %s" % dtype)

    A = akg.tvm.placeholder(shape, dtype, name="A")
    op = invert_permutation.invert_permutation(A)
    s = akg.tvm.create_schedule(op.op)

    kernel_name = utils.gen_name_kernel("invert_permutation", dtype, shape)
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [A, op],
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)

    input_data = np.random.permutation(np.arange(shape[0])).astype(np.int32)
    expect = np.full([shape[0]], 0, np.int32)
    for i, e in enumerate(input_data):
        expect[e] = i

    output = np.full([shape[0]], 0, np.int32)
    output = utils.mod_launch(mod, (input_data, output), expect=expect)

    return (input_data, ), output, expect, compare_tensor(output,
                                                          expect,
                                                          rtol=5e-03,
                                                          equal_nan=True)
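
The expect loop above simply inverts the permutation: expect[input_data[i]] = i. For a permutation, np.argsort produces exactly the same inverse, which makes a handy sanity check (a sketch, not part of the test itself):

import numpy as np

perm = np.random.permutation(np.arange(8)).astype(np.int32)
inverse = np.full(perm.shape, 0, np.int32)
for i, e in enumerate(perm):
    inverse[e] = i
# the argsort of a permutation is its inverse
assert np.array_equal(inverse, np.argsort(perm))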
Example #3
def gather(params_shape,
           indices_shape,
           params_dtype,
           indices_dtype,
           axis,
           kernel_name,
           cce_path="./"):
    """Gather data by indices"""
    vc_util.check_shape(params_shape, length=2)
    vc_util.check_shape(indices_shape, length=1)
    vc_util.ops_dtype_check(params_dtype, vc_util.DtypeForDavinci.ALL_TYPES)
    vc_util.ops_dtype_check(indices_dtype, vc_util.DtypeForDavinci.INT32)
    vc_util.check_equal("axis", "zero", axis, 0)

    # construct compute
    o_shape = (indices_shape[0], params_shape[1])
    xx = akg.tvm.placeholder(params_shape, dtype=params_dtype, name="X")
    yy = akg.tvm.placeholder(indices_shape, dtype=indices_dtype, name="Y")
    res = akg.tvm.extern(o_shape, [xx, yy],
                         lambda ins, outs: kernel_ir(outs[0], ins[0], ins[1]),
                         name="res",
                         dtype=params_dtype)
    s = akg.tvm.create_schedule(res.op)

    # create cce
    attrs = {"enable_multicore": False}
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [xx, yy, res], "cce", name=kernel_name, attrs=attrs)

    source_code = mod.imported_modules[0].get_source()
    utils.create_code(kernel_name, cce_path, source_code)

    return mod
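
With axis fixed to 0 and 2-D params, the kernel_ir extern above is expected to behave like plain row gathering. In NumPy terms (a sketch under that assumption):

import numpy as np

params = np.arange(12, dtype=np.float16).reshape(4, 3)     # params_shape = (4, 3)
indices = np.array([2, 0, 3], dtype=np.int32)              # indices_shape = (3,)
expect = params[indices, :]                                # o_shape = (3, 3)
print(expect)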
Example #4
def focalloss_ad_run2(shape, dtype, attrs):
    logits_pld = akg.tvm.placeholder(shape, dtype=dtype, name='logits')
    labels_pld = akg.tvm.placeholder(shape, dtype='int32', name='labels')
    d_labels, d_logits, head = focalloss_ad.focalloss_ad(
        labels_pld, logits_pld)
    print("autodiff d_logits:\n", akg.tvm.PrintTensorRecursively(d_logits))
    print("autodiff d_labels:\n", akg.tvm.PrintTensorRecursively(d_labels))

    # build autodiff kernels
    io = [labels_pld, logits_pld, head, d_labels, d_logits]
    s = akg.tvm.create_schedule([e.op for e in io])
    kernel_name = utils.gen_name_kernel("focalloss_ad", dtype, (
        shape[0],
        shape[1],
    ))
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s,
                        io,
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)

    batchsize = shape[0]  # assumed: the first dimension is the batch size
    labels_np = RANGEFILL((batchsize, ))
    logits_np = RANGEFILL((batchsize, ), 2)
    head_np = RANGEFILL((batchsize, ), 2)
    output = np.full(shape, np.nan, dtype)
    output = utils.mod_launch(mod, (labels_np, logits_np, head_np, output),
                              expect=output)
    expect = output  # hack: no independent reference result, compare the output against itself

    return (labels_np, logits_np, head_np), output, expect, compare_tensor(output,
                                                                           expect,
                                                                           atol=0.1)
Example #5
File: topk.py Project: zhuyawen/akg
def topk(shape, k, dtype, kernel_name, attrs):
    check_list = ["float16", "int32"]
    if not (dtype.lower() in check_list):
        raise RuntimeError("tile_cce only support %s while dtype is %s" %
                           (",".join(check_list), dtype))
    if k > shape[-1]:
        raise RuntimeError("k should not be greater than shape[-1]")

    shape = (16, 16)
    out_shape = (16, 16)
    temp_shape = (16, 16 * 18)
    inputs = akg.tvm.placeholder(shape, name="input", dtype="float16")
    output = akg.tvm.placeholder(out_shape, name="output", dtype="float16")
    temp = akg.tvm.placeholder(temp_shape, name="temp", dtype="float16")

    values = compute_topk(output, inputs, temp)
    values1 = compute_get_last(values, temp)

    s = akg.tvm.create_schedule([values1.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [inputs, values1],
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)
        return mod
Example #6
def matmul_ad(data_shape, weight_shape, dtype, attrs=None):
    check_list = ["float16"]
    if not (dtype.lower() in check_list):
        raise RuntimeError("matmul test only support %s while dtype is %s" %
                           (",".join(check_list), dtype))
    # check_shape(shape)
    assert (len(data_shape) == 2)
    assert (len(weight_shape) == 2)
    assert (data_shape[1] == weight_shape[0])

    m, k = data_shape
    _, n = weight_shape

    a = akg.tvm.placeholder((m, k), name='a', dtype=dtype)
    b = akg.tvm.placeholder((k, n), name='b', dtype=dtype)
    kk = akg.tvm.reduce_axis((0, k), name='kk')
    c = akg.tvm.compute(
        (m, n),
        lambda i, j: akg.lang.cce.mmad(a[i, kk] * b[kk, j], axis=kk),
        name="c")

    head = akg.tvm.placeholder(c.shape, name="Head", dtype='float16')
    _jacs = list(akg.differentiate(c, [a], head))
    sjac = akg.tvm.create_schedule([_jacs[0].op])
    op_vars = [head, b, _jacs[0]]

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sjac,
                        op_vars,
                        "cce",
                        name="test2",
                        attrs=attrs,
                        polyhedral=True)
        return mod
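
Because c = a @ b, the gradient of c with respect to a, contracted with head, is head @ b^T; that is why op_vars only needs head and the weight b. A NumPy sketch of the expected result:

import numpy as np

m, k, n = 16, 32, 16
b = np.random.rand(k, n).astype(np.float16)
head = np.random.rand(m, n).astype(np.float16)
# d_a[i, kk] = sum_j head[i, j] * b[kk, j]
d_a = head.astype(np.float32) @ b.astype(np.float32).T
print(d_a.shape)  # (m, k)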
Example #7
def globalavgpool(n, c, h, w, pool_type, attrs, kernel_name="global_pool"):
    """
    Performs the global average pooling on the input. For each feature map we can define the formula as:
    \f[
     res = \frac{1}{W * H} \\sum X_{i,j}
    \f]
    Note:
        The real input is create by akg.tvm.placeholder
    Args:
        n (int): input batchsize.
        c (int): input channel.
        h (int): input height.
        w (int): input weight.
        pool_type (str): pooling mode, default average.
        attrs (str): Default None.
        kernel_name (str): a str about kernel_name

    Returns:
            tvm.tensor.Tensor of shape n * c * 1 * 1
    """

    input = akg.tvm.placeholder((n, c, h, w), name='input', dtype="float16")
    output = akg.topi.nn.global_pool(input, pool_type=pool_type)
    s = akg.tvm.create_schedule(output.op)
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [input, output],
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)
    return mod
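
A NumPy reference for the formula in the docstring, averaging each feature map over H and W (a sketch only; the kernel itself comes from akg.topi.nn.global_pool):

import numpy as np

n, c, h, w = 1, 16, 7, 7
x = np.random.rand(n, c, h, w).astype(np.float16)
res = x.mean(axis=(2, 3), keepdims=True)   # res = 1/(H*W) * sum_{i,j} X[i, j]
print(res.shape)                           # (1, 16, 1, 1)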
Example #8
def op_build_to_func(opnames, computes, args, custom_schedule, device, kernel_name, attrs):
    """op_build_to_func"""
    if device not in ("aicore", "aicpu"):
        logging.error("Device %s is not in [aicore, aicpu].", device)
        return None

    polyhedral = True
    dump_ir = os.getenv(MS_AKG_DUMP_IR) == "on"

    try:
        tmp_outputs = [x.op for x in computes]
        s = akg.tvm.create_schedule(tmp_outputs)
        if custom_schedule:
            polyhedral = False
            custom_schedule(s)

        with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=dump_ir):
            if attrs:
                binds = attrs.pop(BINDS, None)
                rst = akg.build_to_func(s, args, name=kernel_name, attrs=attrs, polyhedral=polyhedral,
                                        binds=binds, target=device)
            else:
                rst = akg.build_to_func(s, args, name=kernel_name, polyhedral=polyhedral, target=device)

    except Exception:
        logging.error(traceback.format_exc())
        return None
    return rst
Example #9
def case_1(data_shape, dtype, kernel_name, attrs):
    """elemwise chain case 1"""
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape_length_equal("data", data_shape, 2)

    m, k = data_shape

    A = akg.tvm.placeholder((m, k), name='A', dtype=dtype)
    B = akg.tvm.placeholder((k, ), name='B', dtype=dtype)
    C = akg.tvm.placeholder((m, k), name='C', dtype=dtype)

    E = akg.tvm.compute((m, k),
                        lambda i, j: A[i, j] * (B[j] + C[i, j]),
                        name="E")

    forward_s = akg.tvm.create_schedule(E.op)
    op_vars = [A, B, C, E]
    forward_low = akg.lower(forward_s,
                            op_vars,
                            simple_mode=True,
                            polyhedral=True)

    kernel_name = utils.gen_name_kernel(kernel_name, dtype, data_shape)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(forward_s,
                        op_vars,
                        "cce",
                        name="test",
                        attrs=attrs,
                        polyhedral=True)
        source_code = mod.imported_modules[0].get_source()
        return mod
Example #10
def test_CCE_Conv(FMap_shape, Filter_shape, Pad, Stride,
                  Tile_h=0, Tile_co=0, Tile_m=0, Tile_k=0, Tile_n=0,
                  use_bias=False, fp32_mad = True, kernel_name="conv"):

    # adjust to TilingApi
    # feature map (NCHW -> NC1HWC0)
    fmap_n, fmap_c, fmap_h, fmap_w = FMap_shape
    fmap_shape_NC1HWCO = (fmap_n, fmap_c // block_size, fmap_h, fmap_w, block_size)

    # filter (NCHW -> C1HWNC0)
    filter_n, filter_c, filter_h, filter_w = Filter_shape
    filter_shape_C1HWNC0 = (filter_c // block_size, filter_h, filter_w, filter_n, block_size)
    # filter (C1HWNC0 -> filter_fractal)
    filter_shape_fractal = (
        filter_c * filter_h * filter_w // block_size, filter_n // block_size, block_size, block_size)

    # stride (stride_h, stride_w)
    stride = Stride

    # fmap_placeholder (NC1HWCO)
    fmap_placeholder = akg.tvm.placeholder(fmap_shape_NC1HWCO, dtype=conv_dtype, name='fmap')
    # filter_placeholder (fractal)
    filter_placeholder = akg.tvm.placeholder(filter_shape_fractal, dtype=conv_dtype, name='filter')

    if use_bias:
        bias_shape = (1, filter_n // block_size, 1, 1, block_size)
        bias_placeholder = akg.tvm.placeholder(bias_shape, dtype= conv_dtype, name='bias')
        conv_dsl_input = (fmap_placeholder, filter_placeholder, bias_placeholder)
    else:
        conv_dsl_input = (fmap_placeholder, filter_placeholder)

    conv_dsl_outputs = conv_dsl(conv_dsl_input, fmap_shape_NC1HWCO, filter_shape_C1HWNC0, Pad, stride, use_bias, fp32_mad)

    # calculate the tiling factor.
    Wo = (fmap_w + Pad[2] + Pad[3] - filter_w) // (stride[1]) + 1
    H_tiling = (Tile_h - filter_h) // (stride[0]) + 1

    # For adjusting to TilingApi, the tiling factors need some conversion.
    # tiling_factor_h applies in L1, and Tile_h refers to the h in 'NCHW', so we translate it to H_tiling,
    # which is used as Ho in A_im2col_row_major_shape.
    # The others are similar: each factor is converted to the form in which it is used.
    tiling_factor_h = H_tiling * Wo // block_size * block_size
    tiling_factor_co = Tile_co // block_size
    tiling_factor_m = Tile_m // block_size * block_size
    tiling_factor_n = Tile_n // block_size
    tiling_factor_k = Tile_k // block_size

    # schedule
    # pick the last one as the final result
    s = akg.tvm.create_schedule(conv_dsl_outputs[-1].op)


    conv_sch(s, (conv_dsl_input, conv_dsl_outputs), tiling_factor_h=tiling_factor_h,
             tiling_factor_m=tiling_factor_m, tiling_factor_k=tiling_factor_k, tiling_factor_n=tiling_factor_n)

    args = list(conv_dsl_input) + [conv_dsl_outputs[-1]]
    with akg.build_config(add_lower_pass = cce.debug_mode(0), dump_pass_ir = True):
        mod = akg.build(s, args, "cce", name=kernel_name, attrs= {"loop_partition_unroll": True})
        return mod
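
The tiling-factor arithmetic above reduces to the standard convolution output-size formula. A small sketch of the same computation with illustrative numbers (not tied to any particular test case):

fmap_w, filter_h, filter_w = 28, 3, 3
pad = (1, 1, 1, 1)            # top, bottom, left, right
stride = (1, 1)               # (stride_h, stride_w)
tile_h, block_size = 5, 16

wo = (fmap_w + pad[2] + pad[3] - filter_w) // stride[1] + 1
h_tiling = (tile_h - filter_h) // stride[0] + 1
tiling_factor_h = h_tiling * wo // block_size * block_size
print(wo, h_tiling, tiling_factor_h)       # 28 3 80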
Example #11
def roipool(shape,
            roibox,
            pooled_shape,
            dtype,
            kernel_name="roipool_forward_output",
            attrs=None):
    check_list = ["float16"]
    if not (dtype.lower() in check_list):
        raise RuntimeError("tile_cce only support %s while dtype is %s" %
                           (",".join(check_list), dtype))
    vc_util.check_shape(shape)
    assert (len(shape) == 4)
    assert (len(roibox) == 4)
    assert (len(pooled_shape) == 2)

    a_n, a_c, a_h, a_w = shape
    roi_t, roi_b, roi_l, roi_r = roibox
    assert (roi_t >= 0 and roi_t < roi_b and roi_b < a_h)
    assert (roi_l >= 0 and roi_l < roi_r and roi_r < a_w)

    a = akg.tvm.placeholder(shape, name="a", dtype=dtype)
    Crop = akg.tvm.compute([a_n, a_c, roi_b - roi_t, roi_r - roi_l],
                           lambda n, c, h, w: a[n, c, roi_t + h, roi_l + w])

    p_h, p_w = pooled_shape
    win_h = (roi_b - roi_t) // p_h + (1 if (roi_b - roi_t) % p_h > 0 else 0)
    win_w = (roi_r - roi_l) // p_w + (1 if (roi_r - roi_l) % p_w > 0 else 0)

    assert p_h <= (roi_b - roi_t) and p_w <= (roi_r - roi_l)

    Unpooled = akg.tvm.compute(
        [a_n, a_c, p_h, p_w, win_h, win_w],
        lambda n, c, h, w, wh, ww: akg.tvm.expr.Select(
            akg.tvm.all(h * win_h + wh < roi_b - roi_t, w * win_w + ww < roi_r
                        - roi_l), Crop[n, c, h * win_h + wh, w * win_w + ww],
            akg.tvm.const(0, a.dtype)))

    rh = akg.tvm.reduce_axis((0, win_h))
    rw = akg.tvm.reduce_axis((0, win_w))
    output_shape = [a_n, a_c, p_h, p_w]
    res = akg.tvm.compute(
        output_shape,
        lambda n, c, h, w: akg.tvm.max(Unpooled[n, c, h, w, rh, rw],
                                       axis=[rh, rw]))
    s = akg.tvm.create_schedule(res.op)
    s[Crop].compute_inline()
    s[Unpooled].compute_inline()
    kernel_name = utils.gen_name_kernel(kernel_name, dtype, shape)
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [a, res],
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)
        return mod, output_shape
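
A NumPy reference for the crop-then-max behaviour of the kernel; the sizes are chosen so that every pooling window is full, so the zero fill from the Select above never comes into play (a sketch with illustrative values):

import numpy as np

a_n, a_c, a_h, a_w = 1, 1, 8, 8
roi_t, roi_b, roi_l, roi_r = 1, 7, 1, 7
p_h, p_w = 2, 2
a = np.random.rand(a_n, a_c, a_h, a_w).astype(np.float16)

crop = a[:, :, roi_t:roi_b, roi_l:roi_r]
win_h = -(-(roi_b - roi_t) // p_h)   # ceiling division, as in the kernel
win_w = -(-(roi_r - roi_l) // p_w)
out = np.zeros((a_n, a_c, p_h, p_w), a.dtype)
for h in range(p_h):
    for w in range(p_w):
        window = crop[:, :, h * win_h:(h + 1) * win_h, w * win_w:(w + 1) * win_w]
        out[:, :, h, w] = window.max(axis=(2, 3))
print(out.shape)   # (1, 1, 2, 2)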
Example #12
def test_select():
    N = 128

    actual = akg.tvm.placeholder((N, ), name='actual', dtype='int32')
    predict = akg.tvm.placeholder((N, ), name='predict', dtype='int32')
    k = akg.tvm.reduce_axis((0, N), name='k')
    output = akg.tvm.compute(
        (N, N), lambda i, j: akg.tvm.sum(akg.tvm.expr.Select(
            akg.tvm.all(i == actual[k], j == predict[k]), 1.0, 0.0),
                                         axis=k))

    s = akg.tvm.create_schedule(output.op)

    # build the cce kernel
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [actual, predict, output], "cce", polyhedral=True)
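
The compute above builds an N x N confusion matrix: output[i, j] counts how many positions k satisfy actual[k] == i and predict[k] == j. A NumPy sketch of the same reduction:

import numpy as np

N = 128
actual = np.random.randint(0, N, size=N).astype(np.int32)
predict = np.random.randint(0, N, size=N).astype(np.int32)

confusion = np.zeros((N, N), dtype=np.float32)
for k in range(N):
    confusion[actual[k], predict[k]] += 1.0
assert confusion.sum() == N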
Example #13
def concat_ad_run(shapes, dtype, axis, attrs):
    # prepare inputs placeholder
    inp_dtype = dtype.lower()
    data = []
    for i in range(len(shapes)):
        shape = shapes[i]
        data.append(
            akg.tvm.placeholder(shape, name="data_%d" % i, dtype=inp_dtype))

    kernel_name = utils.genKernelName("concat", inp_dtype, shapes)
    res, head = concat_ad.concat_ad(data, axis)

    opvars = [head] + data + [res]
    s = akg.tvm.create_schedule(res.op)
    op_attrs = [axis]

    if 'tuning' in attrs.keys():
        t = attrs.get("tuning", False)
        kernel_name = attrs.get("kernel_name", False)
        mod = utils.op_build_test(concat_ad.concat_ad, [shapes],
                                  [dtype.lower()],
                                  op_attrs,
                                  kernel_name=kernel_name,
                                  attrs=attrs,
                                  tuning=t)
        if t:
            args, expect, head_data, inputs = gen_data(dtype, head, shapes)
            return mod, expect, tuple(args)
        else:
            return mod
    else:
        # build the cce kernel
        with akg.build_config(add_lower_pass=cce.debug_mode(0),
                              dump_pass_ir=True):
            mod = akg.build(s,
                            opvars,
                            "cce",
                            name=kernel_name,
                            attrs=attrs,
                            polyhedral=True)
        print(mod.imported_modules[0].get_source())

        args, expect, head_data, inputs = gen_data(dtype, head, shapes)
        output = utils.mod_launch(mod, tuple(args), expect=expect)
        return tuple(inputs) + (head_data, ), output, expect, compare_tensor(
            output, expect, rtol=5e-03, equal_nan=True)
Example #14
def floormod(shape, dtype, kernel_name, attrs):
    """
    Compute element-wise remainder of division.
    \f$res=a - floor(a/b) * b\f$

    Args:
         shape (list): a list has any nums.
         dtype (str): parameters' type.
         kernel_name (str): a str about kernel_name.
         attrs (str): Default None.
    Returns:
            tvm.tensor.Tensor, shape and dtype are input params.
    """

    vc_util.ops_dtype_check(
        dtype,
        [vc_util.DtypeForDavinci.ALL_FLOAT, vc_util.DtypeForDavinci.INT32])
    vc_util.check_shape(shape)

    a = akg.tvm.placeholder(shape=shape, name="a", dtype=dtype)
    b = akg.tvm.placeholder(shape=shape, name="b", dtype=dtype)

    # res = a - floor(a/b) * b
    # Refine the rough reciprocal from vrec with Newton's method: x_{n+1} = x_n * (2 - b * x_n)
    para = akg.lang.cce.vrec(b)
    for _ in range(3):
        tmp1 = akg.lang.cce.vmul(b, para)
        tmp2 = akg.lang.cce.vmuls(tmp1, -1)
        tmp3 = akg.lang.cce.vadds(tmp2, 2)
        para = akg.lang.cce.vmul(tmp3, para)

    c = akg.lang.cce.vmul(a, para)
    d = akg.lang.cce.floor(c)
    e = akg.lang.cce.vmul(d, b)
    res = akg.lang.cce.vsub(a, e)

    s = akg.tvm.create_schedule(res.op)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [a, b, res],
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)
        return mod
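
The vrec/vmul/vmuls/vadds sequence above performs three steps of Newton's iteration x_{n+1} = x_n * (2 - b * x_n) to refine the reciprocal of b before evaluating a - floor(a/b) * b. A NumPy sketch of the same refinement (illustrative only, not the CCE intrinsics):

import numpy as np

a = np.array([7.0, -7.0, 9.5], dtype=np.float32)
b = np.array([3.0, 3.0, 2.0], dtype=np.float32)

para = 1.0 / b + 1e-2                    # pretend vrec only gives a rough reciprocal
for _ in range(3):
    para = para * (2.0 - b * para)       # Newton's iteration for 1/b

res = a - np.floor(a * para) * b         # floormod
print(res)                               # approximately [1.0, 2.0, 1.5]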
Example #15
def test_vmadd():
    shape = (10, 256)
    dtype = 'float16'

    x = akg.tvm.placeholder(shape, name="x", dtype=dtype)

    def compute_func(*indices):
        y = x(*indices) + akg.tvm.const(2.0, dtype)
        return y * x(*indices) + x(*indices) + akg.tvm.const(1.0, dtype)

    res = akg.tvm.compute(shape, compute_func)

    s = akg.tvm.create_schedule(res.op)

    # build the cce kernel
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [x, res], "cce", polyhedral=True)

    assert "vmadd" in mod.imported_modules[0].get_source()
Example #16
def test_quant(fmap_shape):
    # input shape(NCHW -> NC1HWC0)
    in_n, in_c, in_h, in_w = fmap_shape
    assert in_c % 32 == 0
    input_shape_nc1hwc0 = (in_n, in_c // 16, in_h, in_w, 16)
    in_n, in_c1, in_h, in_w, in_c0 = input_shape_nc1hwc0

    # placeholder (NC1HWC0)
    FMap = akg.tvm.placeholder(input_shape_nc1hwc0,
                               dtype='float16',
                               name='FMap')

    ScaleQ = akg.tvm.placeholder((16, ), dtype='float16', name='ScaleQ')
    OffsetQ = akg.tvm.placeholder((16, ), dtype='float16', name='OffsetQ')

    out_shape_nc1hwc0 = (in_n, in_c // 32, in_h, in_w, 32)
    print(out_shape_nc1hwc0)
    out_n, out_c1, out_h, out_w, out_c0 = out_shape_nc1hwc0

    # quantize
    Quant = akg.tvm.compute(out_shape_nc1hwc0,
                            lambda n, c1, h, w, c0:
                            (FMap[n, c1 + c0 // 16, h, w, c0 % 16] * ScaleQ[0]
                             + OffsetQ[0]).astype('int8'),
                            name='output')

    info = dim.Dim()
    info.setdim(index=0, axis=0, tilel1=2, tilel0=0)
    info.setdim(index=0, axis=0, tilel1=32, tilel0=0)
    info.setdim(index=0, axis=0, tilel1=32, tilel0=0)
    info.setdim(index=0, axis=0, tilel1=16, tilel0=0)

    # schedule
    s = akg.tvm.create_schedule(Quant.op)
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [FMap, ScaleQ, OffsetQ, Quant],
                        'cce',
                        name='cce_quant',
                        attrs={'dim': str(info)},
                        polyhedral=True)

    source_code = mod.imported_modules[0].get_source()
    print(source_code)
Example #17
def reduce_min_ad_optimized_manual_schedule(input_shape,
                                            dtype,
                                            axis,
                                            keepdims,
                                            polyhedral=True,
                                            attrs=None):
    def get_shape(pld):
        return [d.value for d in pld.shape]

    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # Only works for the last axis and 2D inputs; needs to be extended to multiple dimensions and axes.
    def custom_reduce_min_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
        data = inputs[0]
        shape = get_shape(data)
        if len(get_shape(data)) == 2:
            # add an extra stage to avoid alignment problem
            min_input = akg.tvm.compute(data.shape,
                                        lambda *i: data(*i),
                                        name="min_input")
            min_ = akg.lang.cce.reduce_min(min_input, axis=-1, keepdims=True)
            min_broadcast = akg.lang.cce.broadcast(min_, shape)
            if dtype != "float16":
                data = cast(data, "float16")
            return [
                akg.tvm.compute(shape,
                                lambda i, j: akg.tvm.expr.Select(
                                    data[i, j] == min_broadcast[i, j], grad[i],
                                    akg.tvm.const(0, dtype="float16")),
                                name="reduce_min_ad2")
            ]

    L = reduce_min.reduce_min(data, axis)
    head = akg.tvm.placeholder(L.shape, name="head", dtype=L.dtype)
    head_cast = cast(head, "float16")

    [dL_ddata
     ] = akg.differentiate(L, [data],
                           head_cast,
                           None,
                           None,
                           override={L: ([data], custom_reduce_min_fdiff)})

    s = akg.tvm.create_schedule([dL_ddata.op])

    head_ub = s.cache_read(head, "local.UB", [head_cast])
    if dtype == "float16":
        data_ub = s.cache_read(data, "local.UB", [dL_ddata])
    else:
        data_ub = s.cache_read(data, "local.UB",
                               [dL_ddata.op.input_tensors[0]])
        min_input_ub = s.cache_read(
            dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
            input_tensors[0].op.input_tensors[0].op.input_tensors[0],
            "local.UB", [
                dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
                input_tensors[0].op.input_tensors[0]
            ])
        s[dL_ddata.op.input_tensors[1].op.input_tensors[0].op.input_tensors[0].
          op.input_tensors[0]].set_scope("local.UB")

    dL_ddata_ub = s.cache_write(dL_ddata, "local.UB")

    # tiling
    split_axis = {}
    for i in range(len(attrs['tile'])):
        split_axis["axis" + str(i)] = s[dL_ddata].split(
            dL_ddata.op.axis[i], attrs["tile"][i])

    split_axis_sorted = sorted(split_axis.items())

    if dtype == "float16":
        s[data_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    else:
        s[data_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
        s[dL_ddata.op.input_tensors[0]].compute_at(s[dL_ddata],
                                                   split_axis_sorted[-1][1][0])
        s[dL_ddata.op.input_tensors[0]].set_scope("local.UB")
        s[min_input_ub].compute_at(s[dL_ddata], split_axis_sorted[0][1][1])

    s[head_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    s[head_cast].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    s[head_cast].set_scope("local.UB")
    s[dL_ddata.op.input_tensors[1]].compute_at(s[dL_ddata],
                                               split_axis_sorted[-1][1][0])
    s[dL_ddata.op.input_tensors[1]].set_scope("local.UB")
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0]].compute_at(
        s[dL_ddata], split_axis_sorted[0][1][1])
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0]].set_scope("local.UB")
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
      input_tensors[0]].compute_at(s[dL_ddata], split_axis_sorted[0][1][1])
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
      input_tensors[0]].set_scope("local.UB")

    # L is not being used for computation
    # s[L].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    # s[L].set_scope("local.UB")

    s[dL_ddata_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, head, dL_ddata],
                        "cce",
                        name="reduce_min_ad_manual_schedule",
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_min_ad_manual_schedule"
        utils.create_code(kernel_name, './', source_code)
    return mod
Example #18
def im2col_manual_schedule(shape, kernel, stride, pad, dtype, polyhedral=True, attrs=None):
    '''
    Compute im2col via cce im2col intrin function call directly

    Args:
        shape: shape of the data
        kernel: kernel sizes for im2col
        stride: stride sizes for im2col
        pad: padding sizes for im2col, including padding top, bottom, left, and right
        dtype: type of the data

    Return:
        cce intrin function call for im2col
    '''

    load3d = intrin_load3d(dtype)

    b, c1, h, w, c0 = shape
    stride_h, stride_w = stride
    kernel_h, kernel_w = kernel
    pad_t, pad_b, pad_l, pad_r = pad
    dilation_w, dilation_h = 1, 1
    jump_offset = 1
    repeat_mode = 0
    repeat_time = 1
    csize = 0
    block_size = 16

    # output size <=> number of windows
    ho = (h + pad_b + pad_t - kernel_h) // stride_h + 1
    wo = (w + pad_r + pad_l - kernel_w) // stride_w + 1

    im2col_shape = (b,
                    (ho * wo + block_size - 1) // block_size,
                    c1 * kernel_h * kernel_w,
                    block_size,
                    c0)

    def _im2col_compute(i, j, k, data):

        j_h = (((j*block_size) // wo)*stride_h)-pad_t
        j_w = (((j*block_size) %  wo)*stride_w)-pad_l

        # num rows in l1 for fmatrix is discounted by the amount of bottom padding
        h_3d         = kernel_h - tvm.max(((j_h+kernel_h) - h), 0)
        pad_t_3d     = tvm.max(-j_h, 0)
        pad_b_3d     = tvm.max(((j_h+kernel_h) - h), 0)
        w_idx_kernel = (k % kernel_w)
        h_idx_kernel = ((k // kernel_w) % kernel_h)
        w_idx        = j_w
        # when this is < 0, the slice will start from row 0 so there is no redundancy between base address and this param
        h_idx        = tvm.min(j_h, 0)
        c1_idx = (k // kernel_w) // kernel_h

        load3d_input = data[i,
                            c1_idx,
                            # assume padding < kernel size
                            tvm.max(0, j_h):tvm.min(h, j_h+kernel_h),
                            0:w,
                            0:c0]

        return load3d(load3d_input,
                      w, h_3d, pad_l, pad_r, pad_t_3d, pad_b_3d,
                      w_idx_kernel, h_idx_kernel, w_idx, h_idx, 0,
                      stride_w, stride_h, kernel_w, kernel_h, dilation_w, dilation_h, jump_offset, repeat_mode, repeat_time,
                      csize)

    # tensor for the input data
    data = tvm.placeholder(shape, dtype, name="input_data")

    # assume we need the whole width of a
    # choose a section of the rows of a that encompasses all of the windows in the current window-batch
    res = tvm.compute(im2col_shape,
                      lambda i, j, k:
                          _im2col_compute(i, j, k, data),
                      name='im2col_fractal')

    # schedule for differentiation operation
    s = tvm.create_schedule([res.op])

    data_ub = s.cache_read(data, "local.L1", [res])
    res_ub = s.cache_write(res, "local.UB")

    s[data_ub].compute_at(s[res], res.op.axis[0])
    s[res_ub].compute_at(s[res], res.op.axis[2])

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, res], "cce", name="im2col_manual_schedule",
                       attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "im2col_manual_schedule"
        utils.create_code(kernel_name, './', source_code)
    return mod
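
The fractal output shape above follows from the usual sliding-window count. A small sketch of that arithmetic with illustrative sizes:

b, c1, h, w, c0 = 1, 1, 28, 28, 16
kernel_h, kernel_w = 3, 3
stride_h, stride_w = 1, 1
pad_t, pad_b, pad_l, pad_r = 1, 1, 1, 1
block_size = 16

ho = (h + pad_b + pad_t - kernel_h) // stride_h + 1
wo = (w + pad_r + pad_l - kernel_w) // stride_w + 1
im2col_shape = (b, (ho * wo + block_size - 1) // block_size,
                c1 * kernel_h * kernel_w, block_size, c0)
print(ho, wo, im2col_shape)   # 28 28 (1, 49, 9, 16, 16)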
Example #19
def maxpool_manual_schedule(shape,
                            kernel,
                            stride,
                            padding,
                            dtype,
                            attrs=None,
                            polyhedral=False):
    """maxpool with manual schedule"""
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)

    maxpool_param_check(kernel, stride, padding)

    data = akg.tvm.placeholder(shape, dtype, name="input_data")
    batch_size, in_c1, input_h, input_w, in_c0 = data.shape

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    if len(padding) == 2:
        pad_h, pad_w = padding
    elif len(padding) == 4:
        pad_h, pad_w = padding[0], padding[2]

    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1

    # padding operation
    if pad_h != 0 or pad_w != 0:
        pad_shape = (batch_size, in_c1, input_h + 2 * pad_h,
                     input_w + 2 * pad_w, in_c0)

        padded_input = akg.tvm.compute(
            pad_shape,
            lambda n, c1, h, w, c0: akg.tvm.if_then_else(
                akg.tvm.any(
                    h > input_h + pad_h - 1,
                    h < pad_h,
                    w > input_w + pad_w - 1,
                    w < pad_w,
                ),
                akg.tvm.const(0.0, dtype=dtype),
                data[n, c1, h - pad_h, w - pad_w, c0],
            ),
            name="padded_input")
    else:
        padded_input = data

    # reduce iterators
    it_kernel_h = akg.tvm.reduce_axis((0, kernel_h),
                                      name="iterator_reduction_height")
    it_kernel_w = akg.tvm.reduce_axis((0, kernel_w),
                                      name="iterator_reduction_width")

    out_shape = (batch_size, in_c1, out_size_h, out_size_w, in_c0)

    res = akg.tvm.compute(out_shape,
                          lambda n, c1, h, w, c0: akg.tvm.max(
                              padded_input[n, c1, (h * stride_h + it_kernel_h),
                                           (w * stride_w + it_kernel_w), c0],
                              axis=[it_kernel_h, it_kernel_w]),
                          name="maxpool_not_hybrid")

    s = akg.tvm.create_schedule([res.op])

    if pad_w != 0 or pad_h != 0:
        padded_input = res.op.input_tensors[0]
    else:
        padded_input = res

    # cache reads and writes
    # after this cache write: reference to res_ub to change the reduction axis
    res_ub = s.cache_write(res, "local.UB")
    if pad_w != 0 or pad_h != 0:
        data_ub = s.cache_read(data, "local.UB", [padded_input])
    else:
        data_ub = s.cache_read(data, "local.UB", [res_ub])

    # get tiling attributes
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    split_iterators = []
    if len(tiling_factors) != len(res.shape):
        raise RuntimeError("tiling factors mismatch out shape")
    # split the final compute and save the iterators
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[res_ub].split(res_ub.op.axis[index], factor))

    # get iterators
    iterator_b_outer = split_iterators[0][0]
    iterator_b_inner = split_iterators[0][1]
    iterator_c1_outer = split_iterators[1][0]
    iterator_c1_inner = split_iterators[1][1]
    iterator_h_outer = split_iterators[2][0]
    iterator_h_inner = split_iterators[2][1]
    iterator_w_outer = split_iterators[3][0]
    iterator_w_inner = split_iterators[3][1]
    iterator_c0_outer = split_iterators[4][0]
    iterator_c0_inner = split_iterators[4][1]
    # reduction axis
    iterator_reduce_h = res_ub.op.reduce_axis[0]
    iterator_reduce_w = res_ub.op.reduce_axis[1]

    # move caches
    s[res_ub].compute_at(s[res], res.op.axis[0])
    s[data_ub].compute_at(s[res_ub], iterator_c1_outer)

    if pad_w != 0 or pad_h != 0:
        s[padded_input].compute_at(s[res_ub], iterator_c1_outer)
        s[padded_input].set_scope("local.UB")

    # reorder computation
    s[res_ub].reorder(iterator_b_outer, iterator_b_inner, iterator_c1_outer,
                      iterator_c1_inner, iterator_h_outer, iterator_h_inner,
                      iterator_w_outer, iterator_w_inner, iterator_reduce_h,
                      iterator_reduce_w, iterator_c0_outer, iterator_c0_inner)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, res],
                        "cce",
                        name="maxpool_manual_schedule",
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "maxpool_ad_manual_schedule"
        utils.create_cce(kernel_name, './', source_code)
    return mod
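
A NumPy reference for the padded max pooling that the schedule above implements, using the same output-size formula. The sketch works on NCHW for readability rather than NC1HWC0, and pads with -inf instead of the 0.0 used in the padded_input compute, which only matters when inputs can be negative:

import numpy as np

def maxpool_reference(x, kernel, stride, pad):
    # x is NCHW; pad is the symmetric (pad_h, pad_w) case handled above
    n, c, h, w = x.shape
    kh, kw = kernel
    sh, sw = stride
    ph, pw = pad
    padded = np.full((n, c, h + 2 * ph, w + 2 * pw), -np.inf, x.dtype)
    padded[:, :, ph:ph + h, pw:pw + w] = x
    oh = (h + 2 * ph - kh) // sh + 1
    ow = (w + 2 * pw - kw) // sw + 1
    out = np.empty((n, c, oh, ow), x.dtype)
    for i in range(oh):
        for j in range(ow):
            out[:, :, i, j] = padded[:, :, i * sh:i * sh + kh,
                                     j * sw:j * sw + kw].max(axis=(2, 3))
    return out

print(maxpool_reference(np.random.rand(1, 1, 8, 8).astype(np.float32),
                        (2, 2), (2, 2), (0, 0)).shape)  # (1, 1, 4, 4)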
Example #20
def col2im_manual_schedule(shape,
                           kernel,
                           stride,
                           pad,
                           dtype,
                           output_H_W,
                           polyhedral=True,
                           attrs=None):
    """
    Col2im operation with manual schedule.

    Args:
        shape (Union[list, tuple]): seven int numbers for the input's image size.
        kernel (Union[list, tuple]): two int numbers for the sliding window's size.
        stride (Union[list, tuple]): two int numbers for the sliding window's stride.
        pad (Union[list, tuple]): four int numbers for padding sizes: top, bottom, left, and right.
        dtype (str): parameters' type.
        output_H_W (Union[list, tuple]): two int numbers for the output's height and width.
        polyhedral (bool): If True, use auto-schedule, else use manual-schedule, default value is True.
        attrs (dict): Specifies parameters used in manual-schedule.

    Returns:
        tvm.tensor.Tensor as result for col2im operation.
    """

    N, C1, KH, KW, OH, OW, C0 = shape
    H, W = output_H_W
    output_shape = (N, C1, H, W, C0)
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_t, pad_b, pad_l, pad_r = pad

    assert H == (OH - 1) * stride_h + kernel_h - (
        pad_t + pad_b), "Height of input and output do not match"
    assert W == (OW - 1) * stride_w + kernel_w - (
        pad_l + pad_r), "Width of input and output do not match"

    col2im = intrin_col2im(shape, output_shape, kernel, stride, pad, dtype)

    # tensor for the input data
    data = tvm.placeholder(shape, dtype, name="input_data")

    # assume we need the whole width of A
    # choose a section of the rows of A that encompasses all of the windows in the current window-batch
    res = tvm.compute(output_shape,
                      lambda b, c1, h, w, c0: data(b, c1, h % KH, w % KW, h %
                                                   OH, w % OW, c0),
                      name="col2im_intrinsic")

    # schedule for the differentiation operation
    s = tvm.create_schedule([res.op])

    res_ub = s.cache_write(res, "local.UB")
    data_ub = s.cache_read(data, "local.UB", [res_ub])

    b, c1, h, w, c0 = res.op.axis

    s[data_ub].compute_at(s[res], c1)
    s[res_ub].compute_at(s[res], c1)

    s[res_ub].tensorize(res_ub.op.axis[0], col2im)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, res],
                        "cce",
                        name="col2im_manual_schedule",
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "col2im_manual_schedule"
        utils.create_code(kernel_name, "./", source_code)
    return mod
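
The two assertions above are the inverse of the im2col output-size formula. A tiny sketch of the same consistency check:

def col2im_shapes_match(output_hw, ohow, kernel, stride, pad):
    # True when (H, W) is consistent with (OH, OW), kernel, stride and padding
    h, w = output_hw
    oh, ow = ohow
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_t, pad_b, pad_l, pad_r = pad
    return (h == (oh - 1) * stride_h + kernel_h - (pad_t + pad_b)
            and w == (ow - 1) * stride_w + kernel_w - (pad_l + pad_r))

print(col2im_shapes_match((28, 28), (28, 28), (3, 3), (1, 1), (1, 1, 1, 1)))  # True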
Example #21
def vector_matmul(data_m, data_n, data_k, trans_a, trans_b, dtype, kernel_name,
                  attrs):
    check_list = ["float16", "float32"]
    if not dtype in check_list:
        raise TypeError("softmax test only support %s while dtype is %s" %
                        (",".join(check_list), dtype))

    m = data_m
    n = data_n
    k = data_k
    data_shape, weight_shape = get_shape(m, n, k, trans_a, trans_b)
    output_shape = (m, n)

    A = akg.tvm.placeholder(data_shape, name='A', dtype=dtype)
    B = akg.tvm.placeholder(weight_shape, name='B', dtype=dtype)

    ZERO = akg.tvm.const(0.0, dtype=dtype)

    @script
    def matmul_hybrid_f_f(a, b, zero):
        t_1 = allocate((m, k, n), a.dtype, 'local')
        t_2 = output_tensor((m, n), a.dtype)
        for i_m in range(0, m):
            for i_k in range(0, k):
                for i_n in range(0, n):
                    t_1[i_m, i_k, i_n] = a[i_m, i_k] * b[i_k, i_n]
            for i1_n in range(0, n):
                t_2[i_m, i1_n] = zero
            for i1_k in range(0, k):
                for i1_n in range(0, n):
                    t_2[i_m, i1_n] = t_2[i_m, i1_n] + t_1[i_m, i1_k, i1_n]
        return t_2

    @script
    def matmul_hybrid_f_t(a, b, zero):
        t_1 = allocate((m, n, k), a.dtype, 'local')
        t_2 = output_tensor((m, n), a.dtype)
        for i_m in range(0, m):
            for i_n in range(0, n):
                t_2[i_m, i_n] = zero
                for i_k in range(0, k):
                    t_1[i_m, i_n, i_k] = a[i_m, i_k] * b[i_n, i_k]
                    t_2[i_m, i_n] = t_1[i_m, i_n, i_k] + t_2[i_m, i_n]
        return t_2

    @script
    def matmul_hybrid_t_f(a, b, zero):
        t_1 = allocate((m, k, n), a.dtype, 'local')
        t_2 = output_tensor((m, n), a.dtype)
        for i_m in range(0, m):
            for i_k in range(0, k):
                for i_n in range(0, n):
                    t_1[i_m, i_k, i_n] = a[i_k, i_m] * b[i_k, i_n]
            for i1_n in range(0, n):
                t_2[i_m, i1_n] = zero
            for i1_k in range(0, k):
                for i1_n in range(0, n):
                    t_2[i_m, i1_n] = t_2[i_m, i1_n] + t_1[i_m, i1_k, i1_n]
        return t_2

    if not trans_a and not trans_b:
        C = matmul_hybrid_f_f(A, B, ZERO)
    elif not trans_a and trans_b:
        C = matmul_hybrid_f_t(A, B, ZERO)
    elif trans_a and not trans_b:
        C = matmul_hybrid_t_f(A, B, ZERO)
    else:
        raise ValueError('Transposing both inputs is not supported yet')

    forward_s = akg.tvm.create_schedule(C.op)
    op_vars = [A, B, C]

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(forward_s,
                        op_vars,
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)
        source_code = mod.imported_modules[0].get_source()
        utils.create_code(kernel_name, "./", source_code)
        return mod, output_shape
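
The three hybrid scripts correspond to the usual transpose variants of matmul. In NumPy terms (a sketch for reference only):

import numpy as np

m, n, k = 4, 5, 6
a_ff, b_ff = np.random.rand(m, k), np.random.rand(k, n)   # trans_a=False, trans_b=False
a_ft, b_ft = np.random.rand(m, k), np.random.rand(n, k)   # trans_a=False, trans_b=True
a_tf, b_tf = np.random.rand(k, m), np.random.rand(k, n)   # trans_a=True,  trans_b=False

c_ff = a_ff @ b_ff          # matmul_hybrid_f_f
c_ft = a_ft @ b_ft.T        # matmul_hybrid_f_t
c_tf = a_tf.T @ b_tf        # matmul_hybrid_t_f
print(c_ff.shape, c_ft.shape, c_tf.shape)   # all (m, n)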
Example #22
def maxpool_ad_manual_schedule_no_overlap_all_max(shape,
                                                  kernel,
                                                  stride,
                                                  pad,
                                                  dtype,
                                                  attrs=None,
                                                  polyhedral=False):
    """automatic differentiate of maxpool with manual schedule for no overlap case."""
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w, _, _ = pad
    batch_size, input_c1, input_h, input_w, input_c0 = shape
    pad_shape = (batch_size, input_c1, input_h + 2 * pad_h,
                 input_w + 2 * pad_w, input_c0)

    def custom_maxpool_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        in_data = inputs[0]

        if stride_w != kernel_w:
            raise RuntimeError(
                "Only supports kernels with same dimensions as stride size!")
        if stride_h != kernel_h:
            raise RuntimeError(
                "Only supports kernels with same dimensions as stride size!")

        # copy output to the shape of the padded input, copying the same value for the entire kernel size
        out_broadcast = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: out(b, c1, akg.tvm.floordiv(h, stride_h),
                                        akg.tvm.floordiv(w, stride_w), c0),
            name="out_broadcast")

        # copy head to the shape of the padded input, copying the same value for the entire kernel size
        head_broadcast = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: head_(b, c1, akg.tvm.floordiv(h, stride_h),
                                          akg.tvm.floordiv(w, stride_w), c0),
            name="head_broadcast")

        # check if value was a maximum and assign head of that position if it was
        # this is done for all the maximum values within one kernel
        result = akg.tvm.compute(
            in_data.shape,
            lambda b, c1, h, w, c0: akg.tvm.expr.Select(
                in_data(b, c1, h, w, c0) == out_broadcast(
                    b, c1, h + pad_h, w + pad_w, c0),
                head_broadcast(b, c1, h + pad_h, w + pad_w, c0),
                akg.tvm.const(0, dtype=in_data.dtype)),
            name="result")
        return [result]

    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1

    out_shape = (batch_size, input_c1, out_size_h, out_size_w, input_c0)

    # tensor for the input data
    data = akg.tvm.placeholder(shape, dtype, name="input_data")

    # maxpool output
    forward = akg.tvm.placeholder(out_shape, name="forward", dtype=dtype)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(out_shape, name="head", dtype=dtype)

    # override differentiation computation with custom function
    [dl_ddata
     ] = akg.differentiate(forward, [data],
                           head,
                           None,
                           None,
                           override={forward: ([data], custom_maxpool_fdiff)})

    # schedule for the differentiation operation
    s = akg.tvm.create_schedule([dl_ddata.op])

    # get computations
    result = dl_ddata
    forward_broadcast = result.op.input_tensors[1]
    head_broadcast = result.op.input_tensors[2]

    # cache reads and writes
    result_ub = s.cache_write(result, "local.UB")
    data_ub = s.cache_read(data, "local.UB", [result_ub])
    head_ub = s.cache_read(head, "local.UB", [head_broadcast])
    forward_ub = s.cache_read(forward, "local.UB", [forward_broadcast])

    s[head_broadcast].set_scope("local.UB")
    s[forward_broadcast].set_scope("local.UB")

    s[head_ub].compute_at(s[head_broadcast], head_broadcast.op.axis[0])
    s[forward_ub].compute_at(s[forward_broadcast],
                             forward_broadcast.op.axis[0])
    s[data_ub].compute_at(s[result_ub], result_ub.op.axis[0])
    s[forward_broadcast].compute_at(s[result_ub], result_ub.op.axis[0])
    s[head_broadcast].compute_at(s[result_ub], result_ub.op.axis[0])

    _, c1, h, _, _ = result.op.axis

    if input_h + 2 * pad_h > 32 or input_w + 2 * pad_w > 32:
        h_outer, _ = s[result].split(h, 4)
        s[result_ub].compute_at(s[result], h_outer)
    else:
        s[result_ub].compute_at(s[result], c1)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, forward, dl_ddata],
                        "cce",
                        name="maxpool_ad_manual_schedule_no_overlap_all_max",
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "maxpool_ad_manual_schedule_no_overlap_all_max"
        utils.create_cce(kernel_name, './', source_code)
    return mod
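
For the no-overlap case (stride equal to the kernel size), the custom gradient above sends the head value to every position that equals its window maximum and zero everywhere else. A NumPy sketch of that rule on NCHW without padding, for readability:

import numpy as np

kh = kw = sh = sw = 2
x = np.random.rand(1, 1, 4, 4).astype(np.float32)
head = np.random.rand(1, 1, 2, 2).astype(np.float32)

grad = np.zeros_like(x)
for i in range(2):
    for j in range(2):
        window = x[:, :, i * sh:i * sh + kh, j * sw:j * sw + kw]
        maximum = window.max(axis=(2, 3), keepdims=True)
        # every element equal to the window maximum receives the head value ("all max")
        grad[:, :, i * sh:i * sh + kh, j * sw:j * sw + kw] = \
            np.where(window == maximum, head[:, :, i:i + 1, j:j + 1], 0.0)
print(grad.shape)  # (1, 1, 4, 4)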
Example #23
def maxpool_ad_manual_schedule_all_max(shape,
                                       kernel,
                                       stride,
                                       pad,
                                       dtype,
                                       polyhedral=True,
                                       attrs=None):
    """automatic differentiate of maxpool with manual schedule for all maximum."""
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w, _, _ = pad
    batch_size, input_c1, input_h, input_w, input_c0 = shape
    pad_shape = (batch_size, input_c1, input_h + 2 * pad_h,
                 input_w + 2 * pad_w, input_c0)
    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1
    out_shape = (batch_size, input_c1, out_size_h, out_size_w, input_c0)

    def custom_maxpool_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        in_data = inputs[0]

        data_separated_by_windows = (kernel_h, kernel_w, batch_size, input_c1,
                                     out_size_h, out_size_w, input_c0)

        pad_data = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: akg.tvm.expr.Select(
                akg.tvm.all(h >= pad_h, h < input_h + pad_h, w >= pad_w, w <
                            input_w + pad_w),
                in_data(b, c1, h - pad_h, w - pad_w, c0),
                akg.tvm.const(0.0, dtype=dtype)),
            name="pad_data")

        data_reshaped = akg.tvm.compute(
            data_separated_by_windows,
            lambda wh, ww, b, c1, oh, ow, c0: pad_data(
                b, c1, oh * stride_h + wh, ow * stride_w + ww, c0),
            name="data_reshaped")

        max_broadcast = akg.tvm.compute(
            data_separated_by_windows,
            lambda wh, ww, b, c1, oh, ow, c0: out(b, c1, oh, ow, c0),
            name="max_broadcast")

        equal = akg.tvm.compute(
            data_separated_by_windows,
            lambda wh, ww, b, c1, oh, ow, c0: akg.tvm.expr.Select(
                max_broadcast(wh, ww, b, c1, oh, ow, c0) == data_reshaped(
                    wh, ww, b, c1, oh, ow, c0), head_(b, c1, oh, ow, c0),
                akg.tvm.const(0.0, dtype=dtype)),
            name="equal")

        data_reorg = akg.tvm.compute(
            (out_size_h, out_size_w, batch_size, input_c1, input_h + 2 * pad_h,
             input_w + 2 * pad_w, input_c0),
            lambda oh, ow, b, c1, h, w, c0: akg.tvm.expr.Select(
                akg.tvm.any(h < oh * stride_h, h > oh * stride_h + kernel_h -
                            1, w < ow * stride_w, w > ow * stride_w + kernel_w
                            - 1), akg.tvm.const(0, dtype=dtype),
                equal(h - oh * stride_h, w - ow * stride_w, b, c1, oh, ow, c0)
            ),
            name="data_reorg")

        result_pad = akg.topi.sum(data_reorg, [0, 1])

        result = akg.tvm.compute(shape,
                                 lambda b, c1, h, w, c0: result_pad(
                                     b, c1, h + pad_h, w + pad_w, c0),
                                 name="result")

        return [result]

    # tensor for the input data
    data = akg.tvm.placeholder(shape, dtype, name="input_data")

    # maxpool output
    forward = akg.tvm.placeholder(out_shape, name="forward", dtype=dtype)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(out_shape, name="head", dtype=dtype)

    # override differentiation computation with custom function
    [dl_ddata
     ] = akg.differentiate(forward, [data],
                           head,
                           None,
                           None,
                           override={forward: ([data], custom_maxpool_fdiff)})

    # schedule for the differentiation operation
    s = akg.tvm.create_schedule([dl_ddata.op])

    # get computations
    result = dl_ddata
    result_pad = result.op.input_tensors[0]
    data_reorg = result_pad.op.input_tensors[0]
    equal = data_reorg.op.input_tensors[0]
    max_broadcast = equal.op.input_tensors[0]
    data_reshaped = equal.op.input_tensors[1]
    pad_data = data_reshaped.op.input_tensors[0]

    data_ub = s.cache_read(data, "local.UB", [pad_data])
    head_ub = s.cache_read(head, "local.UB", [equal])
    forward_ub = s.cache_read(forward, "local.UB", [max_broadcast])
    result_ub = s.cache_write(result, "local.UB")

    s[max_broadcast].set_scope("local.UB")
    s[data_reshaped].set_scope("local.UB")
    s[pad_data].set_scope("local.UB")
    s[equal].set_scope("local.UB")
    s[data_reorg].set_scope("local.UB")
    s[result_pad].set_scope("local.UB")

    s[data_ub].compute_inline()
    s[result_ub].compute_inline()
    s[pad_data].compute_inline()

    # equal dependencies
    s[forward_ub].compute_at(s[equal], equal.op.axis[0])
    s[max_broadcast].compute_at(s[equal], equal.op.axis[0])
    s[data_reshaped].compute_at(s[equal], equal.op.axis[0])
    s[head_ub].compute_at(s[equal], equal.op.axis[0])

    s[equal].compute_at(s[result_pad], result_pad.op.axis[0])

    # result dependencies
    s[data_reorg].compute_inline()
    b, c1, h, w, c0 = result_pad.op.axis
    oh, ow = result_pad.op.reduce_axis
    s[result_pad].reorder(oh, ow, b, c1, h, w, c0)
    # s[result_pad].compute_at(s[result], result.op.axis[1])

    b, c1, h, w, c0 = result.op.axis
    h_out, _ = s[result].split(h, stride_h)
    s[result_pad].compute_at(s[result], h_out)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, forward, dl_ddata],
                        "cce",
                        name="maxpool_ad_manual_schedule_all_max",
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "maxpool_ad_manual_schedule_all_max"
        utils.create_cce(kernel_name, './', source_code)
    return mod
Example #24
def reduce_max_ad_optimized_manual_schedule(input_shape,
                                            dtype,
                                            axis,
                                            keepdims,
                                            polyhedral=True,
                                            attrs=None):
    def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        data_ = inputs[0]
        shape = data_.shape
        # reduces maximum value for each column
        max_ = akg.lang.cce.reduce_max(data_, axis=axis, keepdims=True)
        # copies reduced values to get the original shape
        max_broadcast = akg.lang.cce.broadcast(max_, shape)
        # head broadcast is needed to generate correct cce code for the selection operation
        head_broadcast = akg.tvm.compute(
            shape, lambda *indices: head_(*get_reduced_indices(
                *indices, axis=axis, keepdims=keepdims)))
        # zero all values that are not maxima; the remaining positions receive the adjoint of the output
        max_values_and_zeros = akg.tvm.compute(
            shape,
            lambda *indices: akg.tvm.expr.Select(
                data_(*indices) == max_broadcast(*indices),
                head_broadcast(*indices), akg.tvm.const(0, dtype='float16')),
            name="reduce_max_ad2")
        # cast data back to the original dtype
        if dtype != 'float16':
            return [cast(max_values_and_zeros, dtype)]
        else:
            return [max_values_and_zeros]

    # tensor for the input data
    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # computation of reduce max
    # not used in the schedule because this is the differentiation op
    l = reduce_max.reduce_max(data, axis, keepdims)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(l.shape, name="head", dtype=l.dtype)

    # cast input data
    if dtype != 'float16':
        data_cast = cast(data, "float16")
        head_cast = cast(head, "float16")
    else:
        data_cast = data
        head_cast = head

    # override differentiation computation with custom function
    [dl_ddata] = akg.differentiate(
        l, [data_cast],
        head_cast,
        None,
        None,
        override={l: ([data_cast], custom_reduce_max_fdiff)})

    # get tensors from custom function
    if dtype != 'float16':
        max_values_and_zeros = dl_ddata.op.input_tensors[0]
        max_broadcast = max_values_and_zeros.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = max_values_and_zeros.op.input_tensors[2]
    else:
        max_broadcast = dl_ddata.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = dl_ddata.op.input_tensors[2]

    # schedule for the differentiation operation
    # inputs: data and head
    s = akg.tvm.create_schedule([dl_ddata.op])

    # cache reads of inputs
    if dtype != 'float16':
        head_ub = s.cache_read(head, "local.UB", [head_cast])
        data_ub = s.cache_read(data, "local.UB", [data_cast])
    else:
        # no cast operation
        head_ub = s.cache_read(head_cast, "local.UB", [head_broadcast])
        data_ub = s.cache_read(data_cast, "local.UB", [max_, dl_ddata])

    # cache write for the output
    dl_ddata_ub = s.cache_write(dl_ddata, "local.UB")

    # get tiling attributes
    if attrs is None or 'tile' not in attrs:
        raise RuntimeError("attrs must provide a 'tile' list for the manual schedule")
    tiling_factors = attrs['tile']
    split_iterators = []
    assert len(tiling_factors) == len(dl_ddata.shape)
    # split the final compute and save the iterators
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[dl_ddata].split(dl_ddata.op.axis[index],
                                                 factor))

    # get iterators
    iterator1 = split_iterators[0][0]

    # when a cast is involved, move its computation into the tiled loop
    if dtype != "float16":
        s[data_cast].compute_at(s[dl_ddata], iterator1)
        s[data_cast].set_scope("local.UB")
        s[head_cast].compute_at(s[dl_ddata], iterator1)
        s[head_cast].set_scope("local.UB")
        s[max_values_and_zeros].compute_at(s[dl_ddata], iterator1)
        s[max_values_and_zeros].set_scope("local.UB")

    # move cache reads and writes
    s[data_ub].compute_at(s[dl_ddata], iterator1)
    s[head_ub].compute_at(s[dl_ddata], iterator1)
    s[dl_ddata_ub].compute_at(s[dl_ddata], iterator1)

    # move computation of the differentiation
    s[max_].compute_at(s[dl_ddata], iterator1)
    s[max_].set_scope("local.UB")
    s[max_broadcast].compute_at(s[dl_ddata], iterator1)
    s[max_broadcast].set_scope("local.UB")
    s[head_broadcast].compute_at(s[dl_ddata], iterator1)
    s[head_broadcast].set_scope("local.UB")

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, dl_ddata],
                        "cce",
                        name="reduce_max_ad_manual_schedule",
                        attrs=attrs,
                        polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_max_ad_manual_schedule"
        utils.create_cce(kernel_name, './', source_code)
    return mod
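The custom gradient above propagates the adjoint of the output only to the positions that hold the maximum (and to every tied maximum, because of the equality Select). A minimal NumPy sketch of the same semantics; the shapes, axis and dtype below are illustrative assumptions, not values used by the kernel:

import numpy as np

def reduce_max_grad_reference(data, head, axis, keepdims):
    # forward reduce_max, keeping the reduced axis so it broadcasts back
    max_ = np.max(data, axis=axis, keepdims=True)
    # broadcast the adjoint of the output back to the input shape
    head_b = head if keepdims else np.expand_dims(head, axis)
    head_b = np.broadcast_to(head_b, data.shape)
    # keep the adjoint only where the input equals the maximum, zero elsewhere
    return np.where(data == max_, head_b, 0).astype(data.dtype)

data = np.random.rand(4, 16).astype(np.float16)
head = np.ones((4,), np.float16)
grad = reduce_max_grad_reference(data, head, axis=1, keepdims=False)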
Example No. 25
def fc(fMapBatch,
       weight,
       fc_dtype,
       block_size,
       attrs,
       kernel_name="Fully_Connected"):
    """
    Computes a fully connected (dense) layer.

    Args:
        fMapBatch(akg.tvm.Tensor): Should be a 4D tensor.
        weight(akg.tvm.Tensor): Should be a 4D tensor of same type as fMapBatch.
        fc_dtype(str): Specifies data type of input tensors.
        block_size(int): Block size.
        attrs(dict): Attributes.
        kernel_name(str): Kernel name.

    Returns:
        the built module for the fully connected kernel.
    """
    # NCHW
    f_n, f_c, f_h, f_w = fMapBatch.shape
    w_n, w_c, w_h, w_w = weight.shape

    if f_c != w_c or f_h != w_h or f_w != w_w or w_n < 32:
        raise RuntimeError("invalid input shape: fmap and weight must match on C/H/W and weight output channels must be >= 32")
    f_shape_nc1hwc0 = (f_n, f_c // block_size, f_h, f_w, block_size)

    w_shape_fractal = (w_c // block_size * w_h * w_w, w_n // block_size,
                       block_size, block_size)

    A = akg.tvm.placeholder(f_shape_nc1hwc0, dtype=fc_dtype, name='fmap')
    B = akg.tvm.placeholder(w_shape_fractal, dtype=fc_dtype, name='weight')

    out_shape_nc1hwc0 = (f_n, w_n // block_size, 1, 1, block_size)

    weight_shape_nc1hwc0 = (w_n, w_c // block_size, w_h, w_w, block_size)

    _, k_c1, k_h, k_w, k_c0 = weight_shape_nc1hwc0

    kc1 = akg.tvm.reduce_axis((0, k_c1), name='kc1')
    kh = akg.tvm.reduce_axis((0, k_h), name='kh')
    kw = akg.tvm.reduce_axis((0, k_w), name='kw')
    kc0 = akg.tvm.reduce_axis((0, k_c0), name='kc0')

    res = akg.tvm.compute(out_shape_nc1hwc0,
                          lambda n, c1, h, w, c0: akg.lang.cce.mmad(
                              A[n, kc1, (h + kh), (w + kw), kc0] * B[
                                  (kc1 * k_h + kh) * k_w + kw, c1, c0, kc0],
                              axis=[kc1, kh, kw, kc0]),
                          name="res")

    s = akg.tvm.create_schedule(res.op)
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [A, B, res],
                        "cce",
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=True)

    return mod
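Stripped of the NC1HWC0 and fractal packing, the mmad above reduces over the whole C*H*W window, so the kernel is a plain dense layer. A rough NumPy equivalent with illustrative sizes (the block_size packing and the w_n >= 32 constraint are specific to the kernel and are not modelled here):

import numpy as np

n, c, h, w = 2, 16, 7, 7          # feature map in NCHW
out_ch = 32                       # weight output channels (w_n)
fmap = np.random.rand(n, c, h, w).astype(np.float16)
weight = np.random.rand(out_ch, c, h, w).astype(np.float16)

# out[b, oc] = sum over (c, h, w) of fmap[b, c, h, w] * weight[oc, c, h, w]
out = fmap.reshape(n, -1).astype(np.float32) @ weight.reshape(out_ch, -1).T.astype(np.float32)
print(out.shape)  # (2, 32)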
Example No. 26
def op_build(op_func,
             input_shapes,
             input_types,
             op_attrs=None,
             kernel_name="",
             attrs=None,
             log_cce=False,
             dump_ir=True,
             dump_cce=True,
             polyhedral=True,
             tuning=False):
    """
    Return module built from op_func with given inputs.

    Args:
        op_func (function returning an op or (op, [op_vars])): The op build function.
        input_shapes (iterable of iterable of int): the dimension sizes of each input of the op.
        input_types (iterable of str): the dtype of each input.
        op_attrs (list or tuple): extra attributes for the op.
        kernel_name (str): name of op.
        attrs (dict): tiling parameter.
        log_cce (bool): False by default.
        dump_ir (bool): True by default.
        dump_cce (bool): True by default.
        polyhedral (bool): True by default.
        tuning (bool): False by default.

    Returns:
        the built module.
    """
    inputs = []
    set_dim_key = ""
    shape_params = []
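    # build a placeholder for every input; an entry may be a list of shapes, a shape with
    # symbolic vars (dynamic shape), an existing tensor, or a plain static shape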
    for i, (shape, dtype) in enumerate(zip(input_shapes, input_types)):
        if isinstance(shape, (list, tuple)) and shape and isinstance(
                shape[0], (list, tuple)):
            tmp_input = []
            for j, tmp_shape in enumerate(shape):
                tmp_input.append(
                    akg.tvm.placeholder(tmp_shape, dtype,
                                        "input_%d_%d" % (i + 1, j + 1)))
                for tmp in tmp_shape:
                    if isinstance(tmp, akg.tvm.expr.Var):
                        shape_params.append(tmp)
            inputs.append(tmp_input)
        elif isinstance(shape, (list, tuple)) and shape and isinstance(
                shape[0], akg.tvm.expr.Var):
            inputs.append(
                akg.tvm.placeholder(shape, dtype, "input_%d" % (i + 1)))
            for tmp_shape in shape:
                if isinstance(tmp_shape, akg.tvm.expr.Var):
                    shape_params.append(tmp_shape)
        elif isinstance(shape, akg.tvm.tensor.Tensor):
            inputs.append(shape)
            for tmp_shape in shape.shape:
                shape_params.append(tmp_shape)
        else:
            inputs.append(
                akg.tvm.placeholder(shape, dtype, "input_%d" % (i + 1)))
    attrs_params = []
    if op_attrs is not None:
        args = inputs + op_attrs
        for tmp_attr in op_attrs:
            if isinstance(tmp_attr, (list, tuple)) and tmp_attr and isinstance(
                    tmp_attr[0], akg.tvm.expr.Var):
                for attr_param in tmp_attr:
                    if isinstance(attr_param, akg.tvm.expr.Var):
                        attrs_params.append(attr_param)
            elif isinstance(tmp_attr, akg.tvm.expr.Var):
                attrs_params.append(tmp_attr)
    else:
        args = inputs

    # backup inputs because the tensor names may be updated inside op_func
    inputs_backup = recursive_copy(inputs)

    output = op_func(*args)

    # restore inputs to make sure that tensor names are not changed by op_func
    inputs = inputs_backup

    if attrs is None or 'dim' not in attrs or not attrs['dim']:
        dim_info = ""
        if attrs is None:
            attrs = dict()

        if op_func.__name__ in ct_util.set_dim_func_map.keys():
            value = ct_util.set_dim_func_map[op_func.__name__]
            if inspect.isfunction(value):
                dim_info = value(*args)
            elif isinstance(value, dict):
                key = []
                key.append(ft_util.convert_to_list(input_shapes))
                key.append(ft_util.convert_to_list(input_types))
                if op_attrs is not None:
                    key.append(op_attrs)
                key = str(tuple(key))

                if key in value.keys():
                    dim_info = ct_util.set_dims(value[key])
            else:
                raise RuntimeError(
                    "Registered set_dim_map is invalid. Must be a function or a dict!"
                )
        if isinstance(dim_info, (list, tuple)):
            dim_info = dim_info[0]

        attrs['dim'] = dim_info

    compute_func = None  # optional function returned by the DSL to tweak the schedule (e.g. compute_inline)
    sch_tmpl = None
    if isinstance(output, (list, tuple)):
        from inspect import isfunction
        new_outputs = []
        for elem in output:
            if isfunction(elem):
                compute_func = elem
            elif isinstance(elem, dict):
                for key, value in elem.items():
                    if key not in attrs or not attrs[key]:
                        attrs[key] = value
            elif isinstance(elem, (list, tuple)):
                new_outputs += elem
            else:
                new_outputs.append(elem)

        output = new_outputs
    elif isinstance(output, dict):
        sch_tmpl = output
        output = sch_tmpl['output']
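    # pop user-specified tensor binds (if any) out of attrs so they can be passed to akg.build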
    binds = None if not attrs else attrs.pop(BINDS, None)

    op_var = []
    for xx in inputs:
        if isinstance(xx, list):
            for x in xx:
                op_var.append(x)
        else:
            op_var.append(xx)
    shape_var = []
    # collect dynamic shape/attr variables (deduplicated, attr variables first)
    for var in attrs_params:
        if var not in shape_var:
            shape_var.append(var)
    for var in shape_params:
        if var not in shape_var:
            shape_var.append(var)
    if isinstance(output, (list, tuple)):
        op_var = op_var + [i for i in output if TensorUtils.is_output_value(i)]
    else:
        if TensorUtils.is_output_value(output):
            op_var = op_var + [output]

    if sch_tmpl is not None:
        assert sch_tmpl['target'] == 'cuda'
        kernel_name = kernel_name if kernel_name != "" else sch_tmpl['op_name']
        with akg.tvm.target.cuda() as target:
            s = sch_tmpl['schedule'](sch_tmpl['output'])
            with akg.tvm.build_config(dump_pass_ir=True):
                mod = akg.tvm.build(s,
                                    op_var,
                                    target,
                                    target_host='stackvm',
                                    name=kernel_name)
                dump_cuda_meta.dump(mod, kernel_name, s, op_var)
                return mod

    if isinstance(output, (list, tuple)):
        tmp = []
        for x in list(output):
            if isinstance(x, tuple):
                tmp.append(x[0].op)
            else:
                tmp.append(x.op)
        s = akg.tvm.create_schedule(tmp)
    else:
        s = akg.tvm.create_schedule(output.op)
    if compute_func is not None:
        compute_func(s)
        polyhedral = False
    kernel_name = kernel_name if kernel_name != "" else op_func.__name__
    mode = get_runtime_mode()
    level = attrs.get("help_tiling")
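    # tuning / help_tiling mode: only lower the op to collect its tiling space, do not build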
    if tuning or (level is not None and level > help_tiling_level['None']):
        if op_func.__name__ in ct_util.set_dim_func_map.keys():
            func_ = ct_util.set_dim_func_map[op_func.__name__]
            if inspect.isfunction(func_):
                set_dim_key = func_(*args)[1]
        elif op_func.__name__ in ct_util.gen_key_func_map.keys():
            func_ = ct_util.gen_key_func_map[op_func.__name__]
            if inspect.isfunction(func_):
                set_dim_key = func_(*args)
        with akg.build_config(add_lower_pass=cce.debug_mode(0),
                              dump_pass_ir=True):
            spaces = akg.lower(s,
                               op_var,
                               name=kernel_name,
                               attrs=attrs,
                               polyhedral=polyhedral,
                               tuning=tuning)
            if set_dim_key == "":
                set_dim_key = str(args)
            return spaces, set_dim_key

    if mode == "cpu":
        mod = akg.tvm.build(s, op_var, "llvm")
        if not os.path.isdir("./cpu/ir/"):
            os.makedirs("./cpu/ir/")
        with os.fdopen(
                os.open("./cpu/ir/" + kernel_name + ".cc",
                        os.O_WRONLY | os.O_CREAT, 0o400), 'w') as irf:
            irf.write(akg.tvm.lower(s, op_var, shape_var, simple_mode=True))
        return mod
    with akg.build_config(add_lower_pass=cce.debug_mode(0),
                          dump_pass_ir=dump_ir):
        mod = akg.build(s,
                        op_var,
                        "cce",
                        shape_var,
                        name=kernel_name,
                        attrs=attrs,
                        polyhedral=polyhedral,
                        binds=binds)
        if mod is None:
            return None
        source_code = mod.imported_modules[0].get_source()
    if log_cce:
        logging.debug("#################cce code####################")
        logging.debug(source_code)
    if dump_cce:
        cce_path = "./"
        create_cce(kernel_name, cce_path, source_code)

    return mod
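A minimal sketch of how op_build is typically driven. The element-wise op, shapes and attrs below are placeholders rather than repository code, and a successful build additionally assumes a configured runtime (get_runtime_mode) and the repository's output-tensor conventions:

def my_add(x, y):
    # hypothetical element-wise add, used only to illustrate the op_build call
    return akg.tvm.compute(x.shape, lambda *i: x(*i) + y(*i), name="my_add")

mod = op_build(my_add,
               input_shapes=[(16, 16), (16, 16)],
               input_types=["float16", "float16"],
               kernel_name="my_add_fp16",
               attrs={"dim": ""})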