def l2norm_instance(data, eps, axis=None):
    """Perform L2 normalization on the input data.

    For axis=None, y(i, j) = x(i, j) / sqrt(max(sum(x^2), eps))

    Parameters
    ----------
    data : tvm.Tensor
        4-D with NCHW or NHWC layout

    eps : float
        epsilon value

    axis : list of int
        axis over which the normalization is applied

    Returns
    -------
    output : tvm.Tensor
        4-D output with same shape
    """
    assert len(data.shape) == 4, "only support 4-dim l2norm"
    dot_value = topi.cpp.pow(data, 2.0)
    sum_value = topi.sum(dot_value, axis=axis, keepdims=True)
    expand_sum = topi.broadcast_to(sum_value, data.shape)
    # Clamp the squared sum with eps before the square root to avoid
    # dividing by zero, then normalize element-wise.
    return topi.broadcast_div(
        data,
        topi.sqrt(tvm.compute(
            expand_sum.shape,
            lambda i, j, k, l: tvm.max(expand_sum[i, j, k, l], eps),
            tag='l2norm')))

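
# Example usage (a minimal sketch, not part of the original module): lowering
# l2norm_instance on the llvm target with the same older tvm.placeholder /
# tvm.create_schedule API used above, and checking against a direct numpy
# version of the docstring formula. The helper name, shape, and eps value are
# illustrative assumptions; tvm and numpy are imported locally so the sketch
# stays self-contained.
def _example_l2norm_instance():
    import numpy as np
    import tvm
    data = tvm.placeholder((1, 3, 8, 8), name="data")
    out = l2norm_instance(data, eps=1e-5, axis=[1])
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, out], "llvm")
    a_np = np.random.uniform(size=(1, 3, 8, 8)).astype(data.dtype)
    a = tvm.nd.array(a_np)
    b = tvm.nd.array(np.zeros((1, 3, 8, 8), dtype=out.dtype))
    f(a, b)
    # numpy reference: x / sqrt(max(sum(x^2, axis=1), eps))
    ref = a_np / np.sqrt(
        np.maximum(np.sum(a_np * a_np, axis=1, keepdims=True), 1e-5))
    np.testing.assert_allclose(b.asnumpy(), ref, rtol=1e-4, atol=1e-4)
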
def verify_broadcast_binary_ele(lhs_shape, rhs_shape, typ="add"):
    # Build the logic and compile the function
    A = tvm.placeholder(shape=lhs_shape, name="A")
    B = tvm.placeholder(shape=rhs_shape, name="B")
    if typ == "add":
        C = topi.broadcast_add(A, B)
    elif typ == "sub":
        C = topi.broadcast_sub(A, B)
    elif typ == "div":
        C = topi.broadcast_div(A, B)
    elif typ == "mul":
        C = topi.broadcast_mul(A, B)
    elif typ == "maximum":
        C = topi.broadcast_maximum(A, B)
    elif typ == "minimum":
        C = topi.broadcast_minimum(A, B)
    else:
        raise NotImplementedError

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        with tvm.target.create(device):
            s = topi.generic.schedule_broadcast(C)
        ctx = tvm.context(device, 0)
        foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ)
        lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
        rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
        if typ == "add":
            out_npy = lhs_npy + rhs_npy
        elif typ == "sub":
            out_npy = lhs_npy - rhs_npy
        elif typ == "div":
            rhs_npy = np.abs(rhs_npy) + 0.001
            out_npy = lhs_npy / rhs_npy
        elif typ == "mul":
            out_npy = lhs_npy * rhs_npy
        elif typ == "maximum":
            out_npy = np.maximum(lhs_npy, rhs_npy)
        elif typ == "minimum":
            out_npy = np.minimum(lhs_npy, rhs_npy)
        else:
            raise NotImplementedError
        lhs_nd = tvm.nd.array(lhs_npy, ctx)
        rhs_nd = tvm.nd.array(rhs_npy, ctx)
        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
        foo(lhs_nd, rhs_nd, out_nd)
        np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)

    check_device("opencl")
    check_device("cuda")
    check_device("metal")
    check_device("rocm")

def verify_broadcast_binary_ele(lhs_shape, rhs_shape, typ="add"):
    # Build the logic and compile the function
    A = tvm.placeholder(shape=lhs_shape, name="A")
    B = tvm.placeholder(shape=rhs_shape, name="B")
    if typ == "add":
        C = topi.broadcast_add(A, B)
    elif typ == "sub":
        C = topi.broadcast_sub(A, B)
    elif typ == "div":
        C = topi.broadcast_div(A, B)
    elif typ == "mul":
        C = topi.broadcast_mul(A, B)
    elif typ == "maximum":
        C = topi.broadcast_maximum(A, B)
    elif typ == "minimum":
        C = topi.broadcast_minimum(A, B)
    else:
        raise NotImplementedError

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_broadcast(C)
        foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ)
        lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
        rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
        if typ == "add":
            out_npy = lhs_npy + rhs_npy
        elif typ == "sub":
            out_npy = lhs_npy - rhs_npy
        elif typ == "div":
            rhs_npy = np.abs(rhs_npy) + 0.001
            out_npy = lhs_npy / rhs_npy
        elif typ == "mul":
            out_npy = lhs_npy * rhs_npy
        elif typ == "maximum":
            out_npy = np.maximum(lhs_npy, rhs_npy)
        elif typ == "minimum":
            out_npy = np.minimum(lhs_npy, rhs_npy)
        else:
            raise NotImplementedError
        lhs_nd = tvm.nd.array(lhs_npy, ctx)
        rhs_nd = tvm.nd.array(rhs_npy, ctx)
        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
        foo(lhs_nd, rhs_nd, out_nd)
        np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)

    check_device("vulkan")
    check_device("opencl")
    check_device("cuda")
    check_device("metal")
    check_device("rocm")

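
# Example invocations (an illustrative sketch, not the original test list):
# shape pairs that exercise trailing-dimension and full 4-D broadcasting for
# every supported operator. Targets that are not enabled on the local build
# are skipped inside check_device, so this is safe on CPU-only machines.
def test_broadcast_binary():
    verify_broadcast_binary_ele((5, 2, 3), (2, 1), typ="add")
    verify_broadcast_binary_ele((1, 32), (64, 32), typ="sub")
    verify_broadcast_binary_ele((5, 64, 128), (2, 5, 64, 1), typ="mul")
    verify_broadcast_binary_ele((2, 3, 1, 32), (64, 32), typ="div")
    verify_broadcast_binary_ele((1, 32), (64, 32), typ="maximum")
    verify_broadcast_binary_ele((1, 32), (64, 32), typ="minimum")
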
def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"):
    global TASK
    TASK = ("bcast_binary_" + typ + "_lhs" +
            "_".join([str(ele) for ele in lhs_shape]) +
            "rhs" + "_".join([str(ele) for ele in rhs_shape]))
    A = tvm.te.placeholder(shape=lhs_shape, name="A")
    B = tvm.te.placeholder(shape=rhs_shape, name="B")
    if typ == "add":
        C = topi.broadcast_add(A, B)
    elif typ == "sub":
        C = topi.broadcast_sub(A, B)
    elif typ == "div":
        C = topi.broadcast_div(A, B)
    elif typ == "mul":
        C = topi.broadcast_mul(A, B)
    elif typ == "maximum":
        C = topi.broadcast_maximum(A, B)
    elif typ == "minimum":
        C = topi.broadcast_minimum(A, B)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_broadcast(C)
    fcuda = tvm.build(s, [A, B, C], "cuda", name="broadcast_binary" + "_" + typ)

    lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
    rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
    if typ == "add":
        out_npy = lhs_npy + rhs_npy
    elif typ == "sub":
        out_npy = lhs_npy - rhs_npy
    elif typ == "div":
        rhs_npy = np.abs(rhs_npy) + 0.001
        out_npy = lhs_npy / rhs_npy
    elif typ == "mul":
        out_npy = lhs_npy * rhs_npy
    elif typ == "maximum":
        out_npy = np.maximum(lhs_npy, rhs_npy)
    elif typ == "minimum":
        out_npy = np.minimum(lhs_npy, rhs_npy)

    lhs_nd = tvm.nd.array(lhs_npy, tvm.gpu())
    rhs_nd = tvm.nd.array(rhs_npy, tvm.gpu())
    out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), tvm.gpu())
    for _ in range(2):
        fcuda(lhs_nd, rhs_nd, out_nd)
    tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"):
    global TASK
    TASK = ("bcast_binary_" + typ + "_lhs" +
            "_".join([str(ele) for ele in lhs_shape]) +
            "rhs" + "_".join([str(ele) for ele in rhs_shape]))
    A = tvm.placeholder(shape=lhs_shape, name="A")
    B = tvm.placeholder(shape=rhs_shape, name="B")
    if typ == "add":
        C = topi.broadcast_add(A, B)
    elif typ == "sub":
        C = topi.broadcast_sub(A, B)
    elif typ == "div":
        C = topi.broadcast_div(A, B)
    elif typ == "mul":
        C = topi.broadcast_mul(A, B)
    elif typ == "maximum":
        C = topi.broadcast_maximum(A, B)
    elif typ == "minimum":
        C = topi.broadcast_minimum(A, B)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_broadcast(C)
    fcuda = tvm.build(s, [A, B, C], "cuda", name="broadcast_binary" + "_" + typ)

    lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype)
    rhs_npy = np.random.uniform(size=rhs_shape).astype(A.dtype)
    if typ == "add":
        out_npy = lhs_npy + rhs_npy
    elif typ == "sub":
        out_npy = lhs_npy - rhs_npy
    elif typ == "div":
        rhs_npy = np.abs(rhs_npy) + 0.001
        out_npy = lhs_npy / rhs_npy
    elif typ == "mul":
        out_npy = lhs_npy * rhs_npy
    elif typ == "maximum":
        out_npy = np.maximum(lhs_npy, rhs_npy)
    elif typ == "minimum":
        out_npy = np.minimum(lhs_npy, rhs_npy)

    lhs_nd = tvm.nd.array(lhs_npy, tvm.gpu())
    rhs_nd = tvm.nd.array(rhs_npy, tvm.gpu())
    out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), tvm.gpu())
    for _ in range(2):
        fcuda(lhs_nd, rhs_nd, out_nd)
    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)

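
# Example driver (an illustrative sketch, assuming a CUDA-enabled build, since
# the routine above compiles for "cuda" unconditionally): each call also sets
# the module-level TASK string that the surrounding benchmark script presumably
# uses for naming. Shapes here are illustrative only.
if __name__ == "__main__":
    test_broadcast_binary_op((5, 2, 3), (2, 1), typ="add")
    test_broadcast_binary_op((2, 3, 1, 32), (64, 32), typ="div")
    test_broadcast_binary_op((1, 32), (64, 32), typ="maximum")
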
def lrn(data, size, axis=1, alpha=0.0001, beta=0.75, bias=2):
    """Perform across-channel local response normalisation on the input data.

    sum_sqr_up^i{x, y} = (bias + (alpha / size) *
                          sum_{j=max(0, i-size/2)}^{min(N-1, i+size/2)}
                          (data^j{x, y})^2)^beta

    output^i{x, y} = data^i{x, y} / sum_sqr_up^i{x, y}

    N is the number of input channels.

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, channel, height, width]

    size : int
        normalisation window size

    axis : int
        input data layout channel axis;
        default value is 1 for NCHW format

    bias : float
        offset to avoid dividing by 0

    alpha : float
        scaling factor (divided by size in the formula above)

    beta : float
        exponent

    Returns
    -------
    output : tvm.Tensor
        4-D output with same shape
    """
    assert len(data.shape) == 4, "only support 4-dim lrn"
    assert (size % 2) == 1, "size should be odd number"
    assert (axis == 1) or (axis == 3), "axis should be 1 or 3 for NCHW and NHWC"
    # Pad the normalization axis with size // 2 (the window radius) on both sides.
    pad_after = pad_before = [0, 0, 0, 0]
    pad_after[axis] = pad_before[axis] = (size // 2)
    pad_data = pad(data, pad_before, pad_after, name="pad_data")
    rxs = tvm.reduce_axis((0, size), name='rxs')
    if axis == 1:
        # NCHW layout
        sqr_sum = tvm.compute(
            data.shape,
            lambda i, j, k, l: tvm.sum(
                pad_data[i, j + rxs, k, l] * pad_data[i, j + rxs, k, l],
                axis=rxs))
    elif axis == 3:
        # NHWC layout
        sqr_sum = tvm.compute(
            data.shape,
            lambda i, j, k, l: tvm.sum(
                pad_data[i, j, k, l + rxs] * pad_data[i, j, k, l + rxs],
                axis=rxs))
    sqr_sum_up = tvm.compute(
        data.shape,
        lambda i, j, k, l: tvm.power(
            (bias + (alpha * sqr_sum[i, j, k, l] / size)), beta))
    return topi.broadcast_div(data, sqr_sum_up)
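
# Example usage (a minimal sketch, not part of the original module): lowering
# lrn for an NCHW tensor on the llvm target and comparing against a direct
# numpy re-implementation of the docstring formula. The helper name and shape
# are illustrative assumptions; this relies on tvm, topi, and pad being
# available exactly as the lrn definition above already assumes.
def _example_lrn_nchw():
    import numpy as np
    import tvm
    data = tvm.placeholder((1, 8, 16, 16), name="data")
    out = lrn(data, size=5, axis=1, alpha=0.0001, beta=0.75, bias=2)
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, out], "llvm")
    a_np = np.random.uniform(size=(1, 8, 16, 16)).astype(data.dtype)
    a = tvm.nd.array(a_np)
    b = tvm.nd.array(np.zeros((1, 8, 16, 16), dtype=out.dtype))
    f(a, b)
    # numpy reference: squared sum over a channel window of radius size // 2 == 2
    sqr_sum = np.zeros_like(a_np)
    for c in range(a_np.shape[1]):
        lo, hi = max(0, c - 2), min(a_np.shape[1], c + 3)
        sqr_sum[:, c, :, :] = np.sum(a_np[:, lo:hi, :, :] ** 2, axis=1)
    ref = a_np / np.power(2 + (0.0001 / 5) * sqr_sum, 0.75)
    np.testing.assert_allclose(b.asnumpy(), ref, rtol=1e-4, atol=1e-4)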