示例#1
0
def pool_dsp_schedule(outs, layout):
    """Schedule function for v7e-m DSP instructions of pooling.

    Walks the output ops and, for each pooling op whose input dtype has a
    matching micro-kernel, applies the layout-specific schedule.  Pools with
    an unsupported dtype only emit a warning and are left unscheduled.
    """
    sched = te.create_schedule([out.op for out in outs])

    def _callback(op):
        dtype = op.input_tensors[0].dtype
        if "pool_max" in op.tag:
            # Max-pool micro-kernels exist only for int8 inputs.
            if dtype != "int8":
                logger.warning("Does not have micro-kernel for %s maxpool.", dtype)
            elif layout == "NWC":
                schedule_maxpool_1d_nwc(sched, op)
            elif layout == "NHWC":
                schedule_maxpool_2d_nhwc(sched, op)
        elif "pool_sum" in op.tag:
            # Average-pool (sum) micro-kernels exist only for int16 inputs.
            if dtype != "int16":
                logger.warning("Does not have micro-kernel for %s avgpool.", dtype)
            elif layout == "NCW":
                schedule_avgpool_1d_ncw(sched, op)
            elif layout == "NCHW":
                schedule_avgpool_2d_nchw(sched, op)

    traverse_inline(sched, outs[-1].op, _callback)
    return sched
示例#2
0
def schedule_conv2d_winograd_impl(cfg, outs, tag, pre_computed=False):
    """Create a schedule for winograd conv2d ops matching *tag*.

    Accepts either a single output tensor or a list of them; each op whose
    tag equals *tag* is scheduled via ``schedule_conv2d_winograd``.
    """
    # Normalize a bare tensor into a one-element list.
    if isinstance(outs, te.tensor.Tensor):
        outs = [outs]
    sched = te.create_schedule([t.op for t in outs])

    def _callback(op):
        if op.tag == tag:
            schedule_conv2d_winograd(cfg, sched, op.output(0), pre_computed=pre_computed)

    traverse_inline(sched, outs[0].op, _callback)
    return sched
示例#3
0
def conv2d_nhwc_dsp_schedule(cfg, outs):
    """Schedule function for v7e-m DSP instructions of conv2d."""
    s = te.create_schedule([t.op for t in outs])

    def _callback(op):
        if "conv2d_nhwc" not in op.tag:
            return

        # Tensors participating in the computation.
        out = op.output(0)
        conv = op
        data_vec = conv.input_tensors[0]
        kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
        last = outs[0]  # pylint: disable=unused-variable

        # Recover the width stride by inspecting the conv body's width index
        # expression: a Mul node carries the stride multiplier, otherwise the
        # stride is 1.
        idx_w = out.op.body[0].source[0].a.value.indices[2].a
        stride_w = idx_w.b.value if isinstance(idx_w, Mul) else 1

        # Spatial and reduction axes of the conv stage.
        n, oh, ow, co = s[conv].op.axis
        kh, kw, ci = s[conv].op.reduce_axis

        # Micro-kernel GEMM dimensions come from the innermost tile sizes.
        dim_m = cfg["tile_ow"].size[-1]
        dim_k = cfg["tile_ci"].size[-1]
        dim_n = cfg["tile_co"].size[-1]

        owo, owi = cfg["tile_ow"].apply(s, conv, ow)
        cio, cii = cfg["tile_ci"].apply(s, conv, ci)
        coo, coi = cfg["tile_co"].apply(s, conv, co)

        cfg["reorder_0_simd"].apply(
            s, conv, [n, oh, owo, owi, coo, coi, kh, kw, cio, cii])

        # Tensorize the inner width tile with the DSP GEMM intrinsic and pull
        # in its C implementation via the import_c pragma.
        gemm, uniq_id = intrin_gemm_MxKxN(dim_m, dim_k, dim_n, data_vec.dtype,
                                          out.dtype, stride_w)
        s[out].tensorize(owi, gemm)
        s[out].pragma(n, "import_c", gemm_MxKxN_impl(dim_m, dim_k, dim_n, uniq_id))

        # Attach the global unrolling knobs at the outermost axis.
        kernel_scope = n
        s[out].pragma(kernel_scope, "auto_unroll_max_step",
                      cfg["auto_unroll_max_step"].val)
        s[out].pragma(kernel_scope, "unroll_explicit",
                      cfg["unroll_explicit"].val)

    traverse_inline(s, outs[-1].op, _callback)
    return s
示例#4
0
def dense_dsp_schedule(outs):
    """Schedule function for v7e-m DSP instructions of dense."""
    s = te.create_schedule([t.op for t in outs])

    def _callback(op):
        if "dense" not in op.tag:
            return

        # Tensors participating in the computation.
        out = op.output(0)
        dense_op = op
        data = dense_op.input_tensors[0]
        # GEMM dimensions: data is (M, K), weight is (N, K).
        dim_m, dim_k = data.shape
        dim_n, _ = dense_op.input_tensors[1].shape

        # Split the first axis with a single outer part so the whole body can
        # be tensorized over the inner axis.
        first_axis, _ = s[dense_op].op.axis
        outer, inner = s[dense_op].split(first_axis, nparts=1)

        gemm, uniq_id = intrin_gemm_MxKxN(dim_m, dim_k, dim_n, data.dtype, out.dtype)
        s[out].tensorize(inner, gemm)
        s[out].pragma(outer, "import_c", gemm_MxKxN_impl(dim_m, dim_k, dim_n, uniq_id))

    traverse_inline(s, outs[-1].op, _callback)
    return s
示例#5
0
def conv2d_direct_nhwc_schedule(cfg, outs):
    """Schedule function for directly-scheduled conv2d on NHWC layout.

    Tiles the output channel and spatial axes per the autotvm config,
    annotates (vectorize/unroll) the reduction and spatial tiles, and
    attaches the global unroll knobs at the outermost axis.
    """
    # FIX: use te.create_schedule, consistent with every other schedule
    # function in this file; the legacy top-level tvm.create_schedule alias
    # is not available in current TVM.
    sched = te.create_schedule([x.op for x in outs])

    def _callback(op):
        if "conv2d_nhwc" not in op.tag:
            return

        ### extract tensors ###
        output = op.output(0)
        conv = op
        data_vec = conv.input_tensors[0]
        kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
        last = outs[0]  # pylint: disable=unused-variable

        # tile reduction axes
        n, oh, ow, co = sched[conv].op.axis
        kh, kw, ci = sched[conv].op.reduce_axis
        # NOTE we can't inline data padding in the SIMD path, because it
        # introduces conditionals in the inner loop.
        data_pad = data_vec.op
        sched[data_pad].compute_inline()

        # Tile output channel and spatial axes according to the tuned config.
        co, vc = cfg["tile_co"].apply(sched, conv, co)
        oh, vh = cfg["tile_oh"].apply(sched, conv, oh)
        ow, vw = cfg["tile_ow"].apply(sched, conv, ow)
        cfg["reorder_0"].apply(sched, conv,
                               [n, co, oh, ow, ci, kh, kw, vh, vw, vc])
        # Annotate (e.g. unroll) the reduction axes over the kernel window.
        cfg["ann_reduce"].apply(
            sched,
            conv,
            [kh, kw],
            axis_lens=[
                get_const_int(kh.dom.extent),
                get_const_int(kw.dom.extent)
            ],
            max_unroll=8,
            cfg=cfg,
        )
        # Annotate (e.g. vectorize/unroll) the inner spatial/channel tiles.
        cfg["ann_spatial"].apply(
            sched,
            conv,
            [vh, vw, vc],
            axis_lens=[
                cfg["tile_oh"].size[-1], cfg["tile_ow"].size[-1],
                cfg["tile_co"].size[-1]
            ],
            max_unroll=8,
            cfg=cfg,
        )

        kernel_scope = n  # this is the scope to attach global config inside this kernel

        # tune unroll
        sched[output].pragma(kernel_scope, "auto_unroll_max_step",
                             cfg["auto_unroll_max_step"].val)
        sched[output].pragma(kernel_scope, "unroll_explicit",
                             cfg["unroll_explicit"].val)

    traverse_inline(sched, outs[-1].op, _callback)
    return sched