Example #1 — build an element-wise te.fmod kernel with TVM, bind the split loop to GPU thread axes, and check the result against NumPy.
import numpy as np

import tvm
import tvm.testing
from tvm import te


# The enclosing test function below is an assumed wrapper; the original
# snippet starts at the nested run() helper.
def test_fmod():
    def run(dtype):
        size_var_n = te.size_var("n")
        placeholder_a = te.placeholder((size_var_n, ), name="A", dtype=dtype)
        placeholder_b = te.placeholder((size_var_n, ), name="B", dtype=dtype)
        result_c = te.compute(
            placeholder_a.shape,
            lambda *i: te.fmod(placeholder_a(*i), placeholder_b(*i)),
            name="C")
        schedule = te.create_schedule(result_c.op)
        # Split the output axis into block/thread iteration variables; they are bound to GPU axes below.
        num_thread = 8
        axis0, axis1 = schedule[result_c].split(result_c.op.axis[0],
                                                factor=num_thread)

        def check_device(device):
            dev = tvm.device(device, 0)
            if not tvm.testing.device_enabled(device):
                print("skip because %s is not enabled.." % device)
                return
            target = tvm.target.Target(device)
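            # Bind the split loops to GPU block/thread axes; CPU targets keep the plain loop.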
            if "cpu" not in target.keys:
                schedule[result_c].bind(axis0, te.thread_axis("blockIdx.x"))
                schedule[result_c].bind(axis1, te.thread_axis("threadIdx.x"))
            fmod = tvm.build(schedule,
                             [placeholder_a, placeholder_b, result_c],
                             device,
                             name="myfmod")

            # launch the kernel.
            value_n = 1024
            a_np = (np.random.uniform(size=value_n) * 256).astype(
                placeholder_a.dtype)
            b_np = (np.random.uniform(size=value_n) * 256).astype(
                placeholder_b.dtype)

            # Nudge a and b so fmod results are not too close to zero; this keeps the rtol check meaningful.
            b_np += (b_np < 2.0) * 2
            a_np[np.abs(np.fmod(a_np, b_np)) < 1] += 1

            buff_a = tvm.nd.array(a_np, dev)
            buff_b = tvm.nd.array(b_np, dev)
            buff_c = tvm.nd.array(np.zeros(value_n, dtype=result_c.dtype), dev)
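            # Time a single kernel launch, then verify the output against NumPy's mod.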
            ftimer = fmod.time_evaluator(fmod.entry_name, dev, number=1)
            _ = ftimer(buff_a, buff_b, buff_c).mean
            np.testing.assert_allclose(buff_c.numpy(),
                                       np.mod(buff_a.numpy(), buff_b.numpy()),
                                       rtol=1e-5)

        check_device("cuda")
        check_device("opencl -device=intel_graphics")
        check_device("metal")
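
    # Assumed driver call (not in the original snippet): exercise the test in float32.
    run("float32")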
Example #2 — the same fmod test written against an older TVM API (tvm.context and NDArray.asnumpy instead of tvm.device and .numpy()).
# Enclosing test function assumed, as in Example #1; the same imports apply.
def test_fmod():
    def run(dtype):
        n = te.size_var("n")
        A = te.placeholder((n, ), name="A", dtype=dtype)
        B = te.placeholder((n, ), name="B", dtype=dtype)
        C = te.compute(A.shape, lambda *i: te.fmod(A(*i), B(*i)), name="C")
        s = te.create_schedule(C.op)
        # Split the output axis into block/thread iteration variables; they are bound to GPU axes below.
        num_thread = 8
        bx, tx = s[C].split(C.op.axis[0], factor=num_thread)

        def check_device(device):
            ctx = tvm.context(device, 0)
            if not tvm.testing.device_enabled(device):
                print("skip because %s is not enabled.." % device)
                return
            target = tvm.target.Target(device)
            if "cpu" not in target.keys:
                s[C].bind(bx, te.thread_axis("blockIdx.x"))
                s[C].bind(tx, te.thread_axis("threadIdx.x"))
            fmod = tvm.build(s, [A, B, C], device, name="myfmod")

            # launch the kernel.
            n = 1024
            a_np = (np.random.uniform(size=n) * 256).astype(A.dtype)
            b_np = (np.random.uniform(size=n) * 256).astype(B.dtype)

            # Nudge a and b so fmod results are not too close to zero; this keeps the rtol check meaningful.
            b_np += (b_np < 2.0) * 2
            a_np[np.abs(np.fmod(a_np, b_np)) < 1] += 1

            a = tvm.nd.array(a_np, ctx)
            b = tvm.nd.array(b_np, ctx)
            c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
            ftimer = fmod.time_evaluator(fmod.entry_name, ctx, number=1)
            tcost = ftimer(a, b, c).mean  # runs the kernel once and reports the mean time
            np.testing.assert_allclose(c.asnumpy(),
                                       np.mod(a.asnumpy(), b.asnumpy()),
                                       rtol=1e-5)

        check_device("cuda")
        check_device("opencl -device=intel_graphics")
        check_device("metal")
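
    run("float32")  # assumed driver call, as in Example #1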