def pad(data, pad_before, pad_after=None, pad_value=0.0, name="pad"):
    """Pad every dimension of `data` with `pad_value`.

    Parameters
    ----------
    data : Tensor
        The input tensor to pad.
    pad_before : list of int
        Per-dimension padding before the data (length must match data rank).
    pad_after : list of int, optional
        Per-dimension padding after the data; defaults to `pad_before`.
    pad_value : float or Expr
        The value written into the padded region.
    name : str
        Name of the resulting compute stage.

    Returns
    -------
    Tensor
        A tensor of shape `data.shape[i] + pad_before[i] + pad_after[i]`.

    Raises
    ------
    ValueError
        If `pad_before` or `pad_after` length does not match the data rank.
    """
    ndim = len(data.shape)
    pad_after = pad_after if pad_after else pad_before
    if len(pad_before) != ndim:
        raise ValueError("Input dimension and pad_before dismatch : %d vs %d"
                         % (ndim, len(pad_before)))
    if len(pad_after) != ndim:
        raise ValueError("Input dimension and pad_after dismatch : %d vs %d"
                         % (ndim, len(pad_after)))
    # Output extent on each axis = input extent + total padding on that axis.
    out_shape = tuple(
        tvm.ir_pass.Simplify(
            data.shape[axis] + tvm.const(pad_before[axis] + pad_after[axis]))
        for axis in range(ndim))
    # Promote a plain Python number to a typed constant matching the input.
    if not isinstance(pad_value, tvm.expr.Expr):
        pad_value = tvm.const(pad_value, data.dtype)

    def _pad(*indices):
        # For each output index, translate back into the input index space and
        # collect the bounds guards for axes that actually carry padding.
        guards = []
        src_index = []
        for axis in range(ndim):
            if pad_before[axis] == 0 and pad_after[axis] == 0:
                # No padding on this axis: index passes through untouched.
                src_index.append(indices[axis])
                continue
            src_index.append(indices[axis] - pad_before[axis])
            guards.append(indices[axis] >= pad_before[axis])
            guards.append(indices[axis] < data.shape[axis] + pad_before[axis])
        if guards:
            # Inside the original extent -> read the input, else the pad value.
            return tvm.select(tvm.all(*guards),
                              data[tuple(src_index)], pad_value)
        return data[tuple(src_index)]

    return hcl.compute(out_shape, _pad, name=name)
def avg_pool2d_nhwc(data, pooling, stride=None, padding=None, name='avg_pool2d'):
    """Average pooling over a 4-D tensor in NHWC layout.

    Parameters
    ----------
    data : Tensor
        4-D input in (batch, height, width, channel) layout.
    pooling : list/tuple of two ints
        Pooling window as (pooling_h, pooling_w).
    stride : list of two ints, optional
        Window stride as (stride_h, stride_w); defaults to [1, 1].
    padding : list of ints, optional
        Padding spec passed to `get_pad_tuple`; defaults to [0, 0].
    name : str
        Name of the resulting compute stage.

    Returns
    -------
    Tensor
        4-D output of shape (batch, out_height, out_width, channel).
    """
    # Avoid mutable-list default arguments; behavior is unchanged since the
    # original defaults were never mutated.
    if stride is None:
        stride = [1, 1]
    if padding is None:
        padding = [0, 0]
    assert len(data.shape) == 4, "only support 4-dim pooling"
    assert len(stride) == 2, "only support 2-dim stride"
    pooling_h, pooling_w = pooling
    stride_h, stride_w = stride
    batch, height, width, channel = data.shape
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (pooling_h, pooling_w))
    # Pad only the spatial axes (NHWC -> axes 1 and 2); padded cells contribute
    # zeros to the running sum.
    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_bottom, pad_right, 0]
    data = pad(data, pad_before, pad_after,
               pad_value=tvm.const(0.0, data.dtype))
    out_height = simplify(
        (height - pooling_h + pad_top + pad_bottom) // stride_h + 1)
    out_width = simplify(
        (width - pooling_w + pad_left + pad_right) // stride_w + 1)
    dheight = hcl.reduce_axis(0, pooling_h)
    dwidth = hcl.reduce_axis(0, pooling_w)
    # NOTE: the divisor counts every window cell including zero padding, i.e.
    # "count_include_pad" averaging — preserved from the original.
    return hcl.compute(
        (batch, out_height, out_width, channel),
        lambda i, h, w, c: sum(
            data[i, h * stride_h + dheight, w * stride_w + dwidth, c],
            axis=[dheight, dwidth]) / (pooling_w * pooling_h),
        name=name,
        attrs=OrderedDict([
            ('out_img_w', out_width),
            ('out_img_h', out_height),
            ('in_num', channel),
            # BUG FIX: the attrs previously recorded swapped components
            # (kernel_h was pooling[1] == pooling_w, stride_h was stride[1]
            # == stride_w, and vice versa), contradicting the unpacking
            # `pooling_h, pooling_w = pooling` above. Record each attribute
            # from its matching component.
            ('kernel_h', pooling_h),
            ('kernel_w', pooling_w),
            ('stride_h', stride_h),
            ('stride_w', stride_w),
            ('app_name', tvm.make.StringImm('avg_pool'))]))
def pad(data, pad_before, pad_after=None, pad_value=0.0, name='pad'):
    """Pad `data` on every dimension with `pad_value`.

    Parameters
    ----------
    data : Tensor
        The input tensor to pad.
    pad_before : list
        Per-dimension padding before the data.
    pad_after : list, optional
        Per-dimension padding after the data; defaults to `pad_before`.
    pad_value : float or Expr
        The value written into the padded region.
    name : str
        Name of the resulting compute stage.

    Returns
    -------
    Tensor
        A tensor whose extent on axis i is
        `data.shape[i] + pad_before[i] + pad_after[i]`.
    """
    ndim = len(data.shape)
    if not pad_after:
        pad_after = pad_before
    # Output extent per axis = input extent plus both paddings on that axis.
    out_shape = tuple(
        tvm.ir_pass.Simplify(
            data.shape[ax] + tvm.const(pad_before[ax]) + tvm.const(pad_after[ax]))
        for ax in range(ndim))

    def _pad(*indices):
        # Map each output index back to the input index space, accumulating
        # in-bounds guards only for axes that carry (possibly symbolic,
        # non-provably-zero) padding.
        conds = []
        src = []
        for ax in range(ndim):
            if equal_const_int(pad_before[ax], 0) and equal_const_int(
                    pad_after[ax], 0):
                src.append(indices[ax])
                continue
            src.append(indices[ax] - pad_before[ax])
            conds.append(indices[ax] >= pad_before[ax])
            conds.append(indices[ax] < data.shape[ax] + pad_before[ax])
        if conds:
            # Inside the original extent -> input element, else the pad value.
            return tvm.select(tvm.all(*conds), data[tuple(src)], pad_value)
        return data[tuple(src)]

    return hcl.compute(out_shape, _pad, name=name)
def test_build_from_stmt():
    """Build and run a HeteroCL function from a hand-constructed IR statement.

    Demonstrates bypassing the algorithm DSL: the loops are assembled
    directly with `tvm.make.*` nodes. The program computes
    B[i] = (A[i] + 1) * X for i in [0, 10), checked at the end with X = 5.
    """
    hcl.init(hcl.Int())
    # First, we still need to create HeteroCL inputs
    A = hcl.placeholder((10,), "A")
    B = hcl.placeholder((10,), "B")
    X = hcl.placeholder((), "X")  # a scalar input
    # Second, we create variables for loop var
    # The first field is the name
    # The second field is the data type
    i = tvm._api_internal._Var("i", "int32")
    # Similarly, we can create a variable for intermediate tensor
    C = tvm._api_internal._Var("C", "int32")
    # Third, we can create Load
    # If we are accessing the HeteroCL inputs, we need to use ".buf.data"
    load = tvm.make.Load("int32", A.buf.data, i)
    # Fourth, for arithmatic operation, we can add "False" to the end
    # This avoids automatic casting
    add = tvm.make.Add(load, 1, False)
    # Fifth, we can create Store
    # In this case, we just write to the intermediate tensor
    # Thus, we don't need to use ".buf.data"
    store = tvm.make.Store(C, add, i)
    # Sixth, we can create the loop with our loop var
    # For the details of each field, please refer to IR.h under HalideIR/src/ir
    loop = tvm.make.For(i, 0, 10, 0, 0, store)
    # Finally, we need to allocate memory for our intermediate tensor
    alloc = tvm.make.Allocate(C, "int32", [10], tvm.const(1, "uint1"), loop, [])
    # Similarly, we can do another loop that write stuffs to B
    # Note that this i is a newly allocated variable though the name is the same
    # We cannot reuse the same i for different loops
    i = tvm._api_internal._Var("i", "int32")
    load = tvm.make.Load("int32", C, i)
    mul = tvm.make.Mul(load, X, False)
    store = tvm.make.Store(B.buf.data, mul, i)
    loop = tvm.make.For(i, 0, 10, 0, 0, store)
    # Run the C-producing loop first, then the B-producing loop.
    stmt = tvm.make.Block(alloc, loop)
    # Finally, we just need to use HeteroCL APIs to build the function
    # Note that with this approach, we cannot apply any optimizations with primitives
    s = hcl.create_schedule([A, B, X])
    # Just specify the stmt to be the statement we built
    f = hcl.build(s, stmt=stmt)
    # A simple test
    np_A = np.random.randint(10, size=10)
    np_B = np.random.randint(10, size=10)
    hcl_A = hcl.asarray(np_A)
    hcl_B = hcl.asarray(np_B)
    f(hcl_A, hcl_B, 5)  # X = 5
    np_golden = 5 * (np_A + 1)  # expected B: (A + 1) * X
    np_B = hcl_B.asnumpy()
    assert(np.array_equal(np_B, np_golden))