예제 #1
0
def _conv_schedule_asm(outs):
    """_conv_schedule_asm"""
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if "asm_conv2d_output" in op.tag:
            # schedule conv2d
            output = op.output(0)
            conv = op.input_tensors[0]

            sidx = 0
            if conv.op.input_tensors[0].name == "attr":
                sidx = 1
            data_vec = conv.op.input_tensors[sidx]
            data_pad = data_vec.op.input_tensors[0]
            s[data_pad].compute_inline()

            kernel_vec = conv.op.input_tensors[sidx + 1]
            if kernel_vec.op.name == "kernel_vec":
                kernel = kernel_vec.op.input_tensors[0]
            else:
                kernel = kernel_vec
            if (isinstance(kernel.op, tvm.tensor.ComputeOp)
                    and "dilate" in kernel.op.tag):
                s[kernel].compute_inline()

            if conv.op.input_tensors[0].name == "attr":
                _schedule_asm(s, data_vec, kernel_vec, conv, output, outs[0])
            else:
                _schedule_asm(s, data_vec, kernel_vec, conv, output, outs[0])

    traverse_inline(s, outs[0].op, _callback)
    return s
def schedule_sparse_dense_cuda_allreduce_autotune(cfg, outs):
    """Create schedule for sparse dense"""
    s = te.create_schedule([x.op for x in outs])

    def _callback(op):
        if op.tag == "sparse_dense_bsrmm":
            y_bsrmm = op.input_tensors[0]
            w_indptr = y_bsrmm.op.input_tensors[0]
            assert y_bsrmm.op.tag == "sparse_dense_bsrmm_block"
            y_reshape = op
            (m, num_blocks, b_r) = s[y_bsrmm].op.axis
            bs_r = get_const_int(b_r.dom.extent)
            (elem_idx, c) = s[y_bsrmm].op.reduce_axis
            
            (m_o, n_o) = s[y_reshape].op.axis
            s[y_reshape].bind(m_o, te.thread_axis("blockIdx.x"))
            s[y_reshape].bind(n_o, te.thread_axis("blockIdx.y"))
            s[y_bsrmm].compute_at(s[y_reshape], n_o)

            thread_x = te.thread_axis("threadIdx.x")

            cfg.define_split("tile_c", c, num_outputs=2)
            co, ci = cfg['tile_c'].apply(s, y_bsrmm, c)

            y_bsrmm_factored = s.rfactor(y_bsrmm, ci)
            tx = s[y_bsrmm].op.reduce_axis[0]
            s[y_bsrmm].bind(tx, thread_x)
            s[y_bsrmm_factored].compute_at(s[y_bsrmm], tx)
            s[y_bsrmm].set_store_predicate(thread_x.var.equal(0))
            s[y_reshape].set_store_predicate(thread_x.var.equal(0))

    traverse_inline(s, outs[0].op, _callback)
    return s
예제 #3
0
def _conv_schedule_deconv(cfg, outs):
    """schedule_conv2d_nchw_arm_cpu_deconv inner implementation"""
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if "deconv_conv2d_output" in op.tag:
            # schedule conv2d
            output = op.output(0)
            conv = op.input_tensors[0]

            sidx = 0
            if conv.op.input_tensors[0].name == "attr":
                sidx = 1
            data_vec = conv.op.input_tensors[sidx]

            kernel_vec = conv.op.input_tensors[sidx + 1]
            if kernel_vec.op.name == "kernel_vec":
                kernel = kernel_vec.op.input_tensors[0]
            else:
                kernel = kernel_vec
            if (isinstance(kernel.op, tvm.tensor.ComputeOp)
                    and "dilate" in kernel.op.tag):
                s[kernel].compute_inline()

            _schedule_deconv(cfg, s, data_vec, kernel_vec, conv, output,
                             outs[0])

    traverse_inline(s, outs[0].op, _callback)
    return s
예제 #4
0
파일: gpu.py 프로젝트: were/UNIT
def conv2d_NCHW16c_OHWI16o_schedule(attrs, outs, target):

    from topi.util import traverse_inline
    sch = te.create_schedule([i.op for i in outs])
    output = outs[0].op

    def callback(op):
        nonlocal sch
        if len(list(op.reduce_axis)):
            a, b = op.input_tensors
            tune.ashape = get_const_tuple(a.shape)
            tune.bshape = get_const_tuple(b.shape)

            conv = op.output(0)
            n, c, h, w, _ = get_const_tuple(conv.shape)
            stride_h, stride_w = attrs.get_int_tuple('strides')
            tune.strides = (stride_h, stride_w)
            ky = tune.ashape, tune.bshape, (stride_h, stride_w)
            if tune.enable and ky in tune.cuda_kernel.keys():
                tune.splitk = int(tune.cuda_kernel[ky])
            if w % 32 == 0:
                _conv2d_schedule_wdim(sch, conv, output, stride_h, stride_w)
            else:
                assert h * w % 32 == 0 and 32 % w == 0
                _conv2d_schedule_fused(sch, conv, output, stride_h, stride_w)

    traverse_inline(sch, output, callback)

    tune.splitk = None

    return sch
예제 #5
0
def _schedule_custom_dense_pack(cfg, outs):
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if "dense_pack" in op.tag:
            _schedule_custom_dense_pack_template(cfg, s, op.output(0))

    traverse_inline(s, outs[0].op, _callback)
    return s
예제 #6
0
def conv2d_direct_nhwc_schedule(cfg, outs):
    """Schedule function for directly-scheduled conv2d on NHWC layout."""
    sched = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if 'conv2d_nhwc' not in op.tag:
            return

        ### extract tensors ###
        output = op.output(0)
        conv = op
        data_vec = conv.input_tensors[0]
        kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
        last = outs[0]  # pylint: disable=unused-variable

        # tile reduction axes
        n, oh, ow, co = sched[conv].op.axis
        kh, kw, ci = sched[conv].op.reduce_axis
        # NOTE we can't inline data padding in the SIMD path, because it
        # introduces conditionals in the inner loop.
        data_pad = data_vec.op
        sched[data_pad].compute_inline()

        co, vc = cfg['tile_co'].apply(sched, conv, co)
        oh, vh = cfg['tile_oh'].apply(sched, conv, oh)
        ow, vw = cfg['tile_ow'].apply(sched, conv, ow)
        cfg['reorder_0'].apply(sched, conv,
                               [n, co, oh, ow, ci, kh, kw, vh, vw, vc])
        cfg['ann_reduce'].apply(sched,
                                conv, [kh, kw],
                                axis_lens=[
                                    get_const_int(kh.dom.extent),
                                    get_const_int(kw.dom.extent)
                                ],
                                max_unroll=8,
                                cfg=cfg)
        cfg['ann_spatial'].apply(sched,
                                 conv, [vh, vw, vc],
                                 axis_lens=[
                                     cfg['tile_oh'].size[-1],
                                     cfg['tile_ow'].size[-1],
                                     cfg['tile_co'].size[-1]
                                 ],
                                 max_unroll=8,
                                 cfg=cfg)

        kernel_scope = n  # this is the scope to attach global config inside this kernel

        # tune unroll
        sched[output].pragma(kernel_scope, 'auto_unroll_max_step',
                             cfg['auto_unroll_max_step'].val)
        sched[output].pragma(kernel_scope, 'unroll_explicit',
                             cfg['unroll_explicit'].val)

    traverse_inline(sched, outs[-1].op, _callback)
    return sched
def schedule_sparse_dense_cuda_baseline(outs):
    """Create schedule for sparse dense"""
    s = te.create_schedule([x.op for x in outs])

    def _callback(op):
        if op.tag == "sparse_dense_bsrmm":
            y_bsrmm = op.input_tensors[0]
            assert y_bsrmm.op.tag == "sparse_dense_bsrmm_block"
            y_reshape = op
            (m, num_blocks, b_r) = s[y_bsrmm].op.axis
            bs_r = get_const_int(b_r.dom.extent)
            (elem_idx, c) = s[y_bsrmm].op.reduce_axis
            
            s[y_reshape].bind(s[y_reshape].op.axis[0], te.thread_axis("blockIdx.x"))
            s[y_reshape].bind(s[y_reshape].op.axis[1], te.thread_axis("threadIdx.x"))
            s[y_bsrmm].compute_at(s[y_reshape], s[y_reshape].op.axis[1])

    traverse_inline(s, outs[0].op, _callback)
    return s
예제 #8
0
def conv2d_direct_simd_nhwc_schedule(cfg, outs):
    """Schedule function for Cortex-M7 SIMD implementation of conv2d."""
    sched = te.create_schedule([x.op for x in outs])

    def _callback(op):
        if 'conv2d_nhwc' not in op.tag:
            return

        # extract tensors
        output = op.output(0)
        conv = op
        data_vec = conv.input_tensors[0]
        kernel = conv.input_tensors[1]  # pylint: disable=unused-variable
        last = outs[0]  # pylint: disable=unused-variable

        # tile reduction axes
        n, oh, ow, co = sched[conv].op.axis
        kh, kw, ci = sched[conv].op.reduce_axis

        M = cfg['tile_ow'].size[-1]
        K = cfg['tile_ci'].size[-1]
        N = cfg['tile_co'].size[-1]

        owo, owi = cfg['tile_ow'].apply(sched, conv, ow)
        cio, cii = cfg['tile_ci'].apply(sched, conv, ci)
        coo, coi = cfg['tile_co'].apply(sched, conv, co)

        cfg['reorder_0_simd'].apply(sched, conv, [n, oh, owo, owi, coo, coi, kh, kw, cio, cii])

        gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype)
        sched[output].tensorize(owi, gemm)
        sched[output].pragma(n, 'import_c', gemm_MxKxN_impl(M, K, N, uniq_id))

        # this is the scope to attach global config inside this kernel
        kernel_scope = n

        # tune unroll
        sched[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
        sched[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)

    traverse_inline(sched, outs[-1].op, _callback)
    return sched
예제 #9
0
def conv2d_NCHW16c_OHWI16o_schedule(attrs, outs, target):

    from topi.util import traverse_inline
    sch = te.create_schedule([i.op for i in outs])
    output = outs[0].op

    def callback(op):
        nonlocal sch
        if len(list(op.reduce_axis)):
            a, b = op.input_tensors

            conv = op.output(0)
            n, c, h, w, _ = get_const_tuple(conv.shape)
            stride_h, stride_w = attrs.get_int_tuple('strides')
            if w % 32 == 0:
                _conv2d_schedule_wdim(sch, conv, output, stride_h, stride_w)
            else:
                assert h * w % 32 == 0 and 32 % w == 0
                _conv2d_schedule_fused(sch, conv, output, stride_h, stride_w)

    traverse_inline(sch, output, callback)

    return sch
예제 #10
0
def _matmul_schedule_asm(cfg, outs):
    """schedule_conv2d_nchw schedule implementation"""
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if "asm_matmul_output" in op.tag:
            # schedule conv2d
            output = op.output(0)
            mat = op.input_tensors[0]

            sidx = 0
            if mat.op.input_tensors[0].name == "attr":
                sidx = 1
            a_vec = mat.op.input_tensors[sidx]
            b_vec = mat.op.input_tensors[sidx + 1]

            def recurs_inline(a_):
                if a_.op.input_tensors:
                    a1 = a_.op.input_tensors[0]
                    if a1.shape == a_.shape:
                        s[a1].compute_inline()
                    recurs_inline(a1)

            def recurs_inline_(a_):
                if isinstance(a_, tvm.tensor.ComputeOp):
                    if a_.op.input_tensors:
                        a1 = a_.op.input_tensors[0]
                        s[a1].compute_inline()
                        recurs_inline_(a1)

            recurs_inline_(a_vec)
            recurs_inline_(b_vec)

            _schedule_asm(cfg, s, a_vec, b_vec, mat, output, outs[0])

    traverse_inline(s, outs[0].op, _callback)
    return s
예제 #11
0
def _depthwise_schedule_spatial_pack(cfg, outs):
    """schedule_depthwise_conv2d_nchw_arm's inner implement"""
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
        if op.tag == "spatial_depthwise_conv_nchw_output":
            output = op.output(0)
            conv = op.input_tensors[0]
            data_vec = conv.op.input_tensors[0]
            kernel_vec = conv.op.input_tensors[1]
            if kernel_vec.op.name == "kernel_vec":
                kernel = kernel_vec.op.input_tensors[0]
            else:
                kernel = kernel_vec
            if isinstance(kernel.op,
                          tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
                s[kernel].compute_inline()

            _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output,
                                   outs[0])

    traverse_inline(s, outs[0].op, _callback)
    return s
예제 #12
0
파일: cpu.py 프로젝트: were/UNIT
def schedule(outs, strides, pattern, pragma, max_threads):

    from topi.util import traverse_inline
    sch = tvm.te.create_schedule([i.op for i in outs])
    output = outs[0].op

    def callback(op):
        if len(list(op.reduce_axis)):
            from .looptiler import analyze_tiling
            points = list(analyze_tiling(op, pattern,
                                         max_parallel=max_threads))
            fobj = lambda x: (2**-x[0]) * (2**-x[1]) * x[2] * (x[3] * x[
                3] if 2 <= x[3] <= 8 else 1.0 / x[3])
            points.sort(key=fobj)
            points = points[::-1]
            #for x in points[::-1]:
            #    print((2 ** -x[0]), (2 ** -x[1]), x[2], (x[3] * x[3] if 2 <= x[3] <= 8 else 1.0 / x[3]))
            #    print(x[-1])

            a, b = op.input_tensors
            tune.ashape = get_const_tuple(a.shape)
            tune.bshape = get_const_tuple(b.shape)
            try:
                tune.strides = strides
            except:
                tune.strides = 'dense'

            if tune.cpu_idx is None:
                to_apply = points[0][-1]
                import os
                HOME = os.getenv("HOME")
                try:
                    f = open(HOME + '/Tensorization-PoC/cpu-shapes.log', 'a')
                except:
                    f = open(HOME + '/UNIT/cpu-shapes.log', 'a')
                f.write(f'{tune.ashape} {tune.bshape} {tune.strides}\n')
                if (tune.ashape, tune.bshape, tune.strides) in tune.x86.keys():
                    to_apply = points[tune.x86[(tune.ashape, tune.bshape,
                                                tune.strides)]][-1]
            else:
                tune.total_idx = len(points)
                to_apply = points[tune.cpu_idx][-1]

            to_schedule = output
            loops = []
            parallel_level = None
            for i in range(len(output.axis)):

                if isinstance(to_apply[i][0],
                              tuple) and to_apply[i][0][1] == 'parallel':
                    to_schedule = op
                    if str(op) != str(output):
                        outer, inner = sch[output].split(
                            output.axis[i], nparts=to_apply[i][0][0])
                        parallel_level = outer
                        sch[op].compute_at(sch[output], outer)
                        if i == len(output.axis) - 1:
                            sch[output].vectorize(inner)
                        else:
                            sch[output].vectorize(output.axis[-1])

                to_append = []
                to_split = to_schedule.axis[i]

                for j in to_apply[i][1:][::-1]:
                    if isinstance(j, int):
                        outer, inner = sch[to_schedule].split(to_split, j)
                        to_split = outer
                    else:
                        outer, inner = sch[to_schedule].split(to_split, j[0])
                        to_split = outer

                    to_append = [inner] + to_append
                to_append = [to_split] + to_append
                loops += to_append

            for i in range(len(op.reduce_axis)):
                to_split = op.reduce_axis[i]
                to_append = []
                for j in to_apply[i + len(op.axis)][1:][::-1]:
                    if isinstance(j, int):
                        outer, inner = sch[op].split(to_split, j)
                        to_split = outer
                    else:
                        outer, inner = sch[op].split(to_split, j[0])
                        to_split = outer
                    to_append = [inner] + to_append
                to_append = [to_split] + to_append
                loops += to_append

            annot = []
            for i, elem in enumerate(to_apply):
                for j in elem:
                    if isinstance(j, int):
                        annot.append(None if i < len(op.axis) else 'reduce')
                    else:
                        annot.append(j[1])
            assert len(annot) == len(loops), '%d != %d' % (len(annot),
                                                           len(loops))

            unroll, stencil, simple, reduction = [], [], [], []
            for i, elem in enumerate(zip(annot, loops)):
                # print(elem)
                hint, axis = elem
                if unroll and hint is None:
                    unroll.append(axis)
                elif hint == 'parallel':
                    fusion = sch[output].fuse(*(simple + [
                        parallel_level if parallel_level is not None else axis
                    ]))
                    sch[output].parallel(fusion)
                    if str(op) != str(output):
                        sch[op].compute_at(sch[output], fusion)
                    simple = []
                elif hint == 'unroll':
                    unroll.append(axis)
                elif hint == 'offload':
                    stencil.append(axis)
                elif hint == 'reduction':
                    reduction.append(axis)
                else:
                    simple.append(axis)
            sch[op].pragma(stencil[0], 'tensorize', pragma)

            if tune.parallel_only:
                if str(op) != str(output):
                    sch[op].reorder(*(simple + unroll + reduction + stencil))
                else:
                    sch[op].reorder(*([fusion] + unroll + simple + reduction +
                                      stencil))
                return

            for i in unroll:
                sch[op].unroll(i)
            #if simple:
            #    unroll = [simple[-1]] + unroll
            #    simple = simple[:-1]
            if str(op) != str(output):
                sch[op].reorder(*(simple + reduction + unroll + stencil))
            else:
                sch[op].reorder(*([fusion] + simple + reduction + unroll +
                                  stencil))

    traverse_inline(sch, output, callback)

    return sch