def traverse(op): """Traverse operators from computation graph""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) if 'conv2d_nchw' in op.tag: # print('Run in x86-rasp schedule') output = op.output(0) conv_out = op.input_tensors[0] kernel_vec = conv_out.op.input_tensors[1] kernel = kernel_vec.op.input_tensors[0] data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] data_pad = None if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] padding = infer_pad(data, data_pad) if data_pad is None: stride = infer_stride(data, kernel, output) else: stride = infer_stride(data_pad, kernel, output) wkl = _get_workload(data, kernel, stride, padding, output.dtype) sch = _get_schedule(wkl) return _SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, output, outs[0])
def traverse(op): """Traverse operators from computation graph""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) if 'conv2d_nChwc' in op.tag: print('Got conv2d_nChwc tag: ' + str(op.tag)) output = op.output(0) # conv_out = op.input_tensors[0] conv_out = output kernel = conv_out.op.input_tensors[1] # kernel = kernel_vec.op.input_tensors[0] data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] \ if isinstance(data_vec.op, tvm.tensor.ComputeOp) and len(data_vec.op.input_tensors) > 0 and "pad" not in data_vec.op.tag \ else data_vec data_pad = None if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] n, ic_chunk, h, w, ic_block = [x.value for x in data.shape] ic = ic_chunk * ic_block original_data = tvm.placeholder((n, ic, h, w), dtype=output.dtype) if data_pad is not None: n, _, pad_h, pad_w, _ = [x.value for x in data_pad.shape] original_data_pad = tvm.placeholder((n, ic, pad_h, pad_w), dtype=output.dtype) padding = infer_pad(original_data, original_data_pad) else: padding = (0, 0) oc, kh, kw = kernel_size original_kernel = tvm.placeholder((oc, ic, kh, kw), dtype=output.dtype) n, oc_chunk, oh, ow, oc_block = [x.value for x in output.shape] original_output = tvm.placeholder((n, oc_chunk * oc_block, oh, ow), dtype=output.dtype) if data_pad is None: stride = infer_stride(original_data, original_kernel, original_output) else: stride = infer_stride(original_data_pad, original_kernel, original_output) wkl = _get_workload(original_data, original_kernel, stride, padding, output.dtype) sch = _get_schedule(wkl) _SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec, kernel, conv_out, output, outs[0])
def traverse(s, op): """Traverse operators from computation graph""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: if tensor.op.input_tensors: traverse(tensor.op)
def traverse(OP):
    # inline all one-to-one-mapping operators except the last stage (output)
    if tag.is_broadcast(OP.tag):
        if OP not in s.outputs:
            s[OP].compute_inline()
        for tensor in OP.input_tensors:
            if tensor.op.input_tensors:
                traverse(tensor.op)
    # schedule depthwise_conv2d
    if OP.tag == 'depthwise_conv2d_nhwc':
        PaddedInput = OP.input_tensors[0]
        Filter = OP.input_tensors[1]
        DepthwiseConv2d = OP.output(0)
        _schedule(PaddedInput, Filter, DepthwiseConv2d)

def traverse(operator): """Traverse operators from computation graph""" if tag.is_broadcast(operator.tag): if operator not in sch.outputs: sch[operator].compute_inline() for tensor in operator.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) elif operator.tag == 'conv2d_nhwc': Apad = operator.input_tensors[0] W = operator.input_tensors[1] B = operator.output(0) schedule(Apad, W, B) else: raise RuntimeError("Unsupported operator: %s" % operator.tag)
def traverse(op): """Traverse operators from computation graph""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) if 'spatial_conv_output' in op.tag: # print('Run in x86-rasp schedule') output = op.output(0) conv_out = op.input_tensors[0] kernel_vec = conv_out.op.input_tensors[1] kernel = kernel_vec.op.input_tensors[0] data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] data_pad = None if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] _schedule_spatial_conv2d(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, output, outs[0]) if 'im2col_conv_output' in op.tag: output = op.output(0) conv_out = op.input_tensors[0] kernel_vec = conv_out.op.input_tensors[1] kernel = kernel_vec.op.input_tensors[0] data_vec = conv_out.op.input_tensors[0] data_col = data_vec.op.input_tensors[0] data = data_col.op.input_tensors[0] data_pad = None if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] _schedule_im2col_conv2d(s, data, data_pad, data_col, data_vec, kernel, kernel_vec, conv_out, output, outs[0])
def traverse(op): """Traverse operators from computation graph""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) if 'conv2d_nChwc' in op.tag: output = op.output(0) # conv_out = op.input_tensors[0] conv_out = op.input_tensors[0] if 'conv2d_nChwc_unpack' in op.tag else output kernel = conv_out.op.input_tensors[1] # kernel = kernel_vec.op.input_tensors[0] data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] \ if isinstance(data_vec.op, tvm.tensor.ComputeOp) and len(data_vec.op.input_tensors) > 0 and "pad" not in data_vec.op.tag \ else data_vec data_pad = None if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] ndim_input = len(data.shape) if ndim_input == 5: n, ic_chunk, h, w, ic_block = [x.value for x in data.shape] ic = ic_chunk * ic_block else: n, ic, h, w = [x.value for x in data.shape] original_data = tvm.placeholder((n, ic, h, w), dtype=output.dtype) oc = num_filter kh, kw = kernel_size original_kernel = tvm.placeholder((oc, ic, kh, kw), dtype=output.dtype) wkl = _get_workload(original_data, original_kernel, stride, padding, output.dtype) sch = _get_schedule(wkl) _SCH_TO_SCH_FUNC[type(sch)](s, wkl, data, data_pad, data_vec, kernel, conv_out, output, outs[0])
def traverse(op): """Traverse operators from computation graph""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) if 'conv2d_nchw' in op.tag: output = op.output(0) conv_out = op.input_tensors[0] kernel = conv_out.op.input_tensors[1] # kernel = kernel_vec.op.input_tensors[0] data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] data_pad = None if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] padding = infer_pad(data, data_pad) _, ic, _, _ = [x.value for x in data.shape] oc = num_filter kh, kw = kernel_size original_kernel = tvm.placeholder((oc, ic, kh, kw)) if data_pad is None: stride = infer_stride(data, original_kernel, output) else: stride = infer_stride(data_pad, original_kernel, output) wkl = _get_workload(data, original_kernel, stride, padding, output.dtype) sch = _get_schedule(wkl) _SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec, kernel, conv_out, output, outs[0])
def traverse(op): """Traverse operators from computation graph""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) if 'conv2d_nchw' in op.tag: conv = op.output(0) kernel = op.input_tensors[1] data = op.input_tensors[0] data_pad = None if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] C = conv print(C.op.axis) print(C.op.reduce_axis) print(data_pad.op.axis) n, c, h, w = C.op.axis rc, ry, rx = C.op.reduce_axis s[C].reorder(n, c, rc, h, w, ry, rx) r = s[C].fuse(ry, rx) s[C].unroll(r) w = s[C].fuse(h, w) xo, xi = s[C].split(w, factor=8) xoo, xoi = s[C].split(xo, factor=7) s[C].parallel(c) s[C].vectorize(xi) s[C].pragma(n, "parallel_launch_point")
def traverse(OP): print("***********") print(OP.tag) # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(OP.tag): # print("is broadcast") if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) elif tag.is_injective(OP.tag): # print("is injective") for tensor in OP.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) # schedule depthwise_conv2d elif OP.tag == 'depthwise_1by1_fused_nchw': PaddedInput = OP.input_tensors[0] Filter_d = OP.input_tensors[1] Filter_1 = OP.input_tensors[2] Depthwise1by1Fused = OP.output(0) _schedule(PaddedInput, Filter_d, Filter_1, Depthwise1by1Fused)
def traverse(op): """Traverse operators from computation graph""" # inline all one-to-one-mapping operators except the last stage (output) if tag.is_broadcast(op.tag): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: if tensor.op.input_tensors: traverse(tensor.op) if 'conv2d_nchw' in op.tag: conv = op.output(0) kernel = op.input_tensors[1] data = op.input_tensors[0] data_pad = None if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] C = conv print(C.op.axis) print(C.op.reduce_axis) print(data_pad.op.axis) # WW = s.cache_read(data_pad, 'global', [C]) n, c, h, w = C.op.axis rc, ry, rx = C.op.reduce_axis c_chunk, c_block = s[C].split(c, factor=simd_w) s[C].reorder(n, c_chunk, h, c_block, w, rc, ry, rx) work_amount = s[C].fuse(c_chunk, h) ur_w, wi = s[C].split(w, nparts=n_reg) s[C].reorder(n, work_amount, rx, c_block, wi, ry, ur_w, rc) # s[C].reorder(n, work_amount, c_block, wi, rx, ry, ur_w, rc) # s[C].fuse(rx, ry) nthread, nthread_inner = s[C].split(work_amount, nparts=4) # s[WW].compute_at(s[C], ur_w) # print(WW.op.axis) # step = s[C].fuse(rx, c_block) # r = s[C].fuse(ur_w, rc) # s[C].reorder(n, c, rc, h, w, ry, rx) # r = s[C].fuse(ry, rx) # s[C].unroll(rc) # s[C].unroll(ry) # s[C].unroll(rx) # xo, xi = s[C].split(w, factor=8) # s[C].unroll(ry) # s[C].unroll(rc) # s[C].parallel(work_amount) s[C].parallel(nthread) s[C].vectorize(c_block) s[C].pragma(n, "parallel_launch_point") s[C].pragma(nthread, "parallel_stride_pattern") s[C].pragma(n, "parallel_barrier_when_finish")