Example #1
    def tune(
        self,
        config: Optional[TuneConfig] = None,
        target: Optional[Union[str, Target]] = None,
    ):
        """
        Tune the TVMScript code.

        Parameters
        ----------
        config: Optional[TuneConfig]
            The tuning configuration.

        target : Optional[Union[str, Target]]
            The target to tune for.
        """
        if config is None:
            config = TuneConfig(
                # Default setting
                strategy="replay_trace",
                num_trials_per_iter=32,
                max_trials_per_task=32,
                max_trials_global=32,
            )
        if target is None:
            target = Target("llvm --num-cores=16")
        with tempfile.TemporaryDirectory() as work_dir:
            sch: Schedule = tune_tir(
                mod=self.ir_module,
                target=target,
                config=config,
                work_dir=work_dir,
            )
            self.ir_module = sch.mod
            self.build(target)
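A minimal usage sketch for the method above, assuming a hypothetical instance `wrapped` of whatever class defines `tune` (that class is not shown in the snippet); the `TuneConfig` fields mirror the defaults in the method body:

from tvm.meta_schedule import TuneConfig
from tvm.target import Target

# `wrapped` is a hypothetical instance of the class that defines tune().
config = TuneConfig(
    strategy="evolutionary",   # replace the default replay_trace search
    num_trials_per_iter=64,
    max_trials_per_task=256,
    max_trials_global=256,
)
wrapped.tune(config=config, target=Target("nvidia/geforce-rtx-3070"))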
Example #2
def test_meta_schedule_tune_relay(
    model_name: str,
    input_shape: List[int],
    target: str,
):
    dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
    if model_name.startswith("bert"):
        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape),
                            dev)  # embedding size
    else:
        data = tvm.nd.array(
            np.random.randn(*input_shape).astype("float32"), dev)

    mod, params, (input_name, _, _) = get_network(name=model_name,
                                                  input_shape=input_shape)
    target = Target(target)
    with tempfile.TemporaryDirectory() as work_dir:
        database = DummyDatabase()
        rt_mod: tvm.runtime.Module = tune_relay(
            mod=mod,
            params=params,
            target=target,
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                num_trials_total=32,
            ),
            work_dir=work_dir,
            database=database,
        )
        # Compile without meta-scheduler for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=Target("llvm"), params=params)

        def get_output(data, lib, device):
            module = graph_executor.GraphModule(lib["default"](device))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness; the reference module was built for LLVM, so run
        # it on the CPU regardless of the tuning target.
        actual_output = get_output(data, rt_mod, dev)
        expected_output = get_output(
            tvm.nd.array(data.numpy(), device=tvm.cpu()), rt_mod2, tvm.cpu())
        assert np.allclose(actual_output,
                           expected_output,
                           rtol=1e-4,
                           atol=2e-4)
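`DummyDatabase` is not defined in this snippet. A minimal in-memory stand-in, trimmed from the TestDummyDatabase class shown later on this page (inside test_meta_schedule_te2primfunc_argument_order); the import paths are assumptions for the TVM version these examples target:

from typing import List

import tvm
from tvm.ir import IRModule
from tvm.meta_schedule.database import PyDatabase, TuningRecord, Workload
from tvm.meta_schedule.utils import derived_object


@derived_object
class DummyDatabase(PyDatabase):
    """In-memory database sketch; records are never persisted."""

    def __init__(self):
        super().__init__()
        self.records = []
        self.workload_reg = []

    def has_workload(self, mod: IRModule) -> bool:
        return any(tvm.ir.structural_equal(w.mod, mod) for w in self.workload_reg)

    def commit_tuning_record(self, record: TuningRecord) -> None:
        self.records.append(record)

    def commit_workload(self, mod: IRModule) -> Workload:
        for workload in self.workload_reg:
            if tvm.ir.structural_equal(workload.mod, mod):
                return workload
        workload = Workload(mod)
        self.workload_reg.append(workload)
        return workload

    def get_top_k(self, workload: Workload, top_k: int) -> List[TuningRecord]:
        return sorted(
            (r for r in self.records if r.workload == workload),
            key=lambda r: sum(r.run_secs) / len(r.run_secs),
        )[:int(top_k)]

    def __len__(self) -> int:
        return len(self.records)

    def print_results(self) -> None:
        print("\n".join(str(r) for r in self.records))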
Example #3
def test_tvmscript_torch_loop_split():
    x = torch.rand(128, 128).cuda()
    y = torch.zeros(128).cuda()

    result = torch.sum(x.cpu(), dim=1).numpy()

    # `config` is not defined in the original snippet; a default TuneConfig
    # matching the defaults in Example #1 is assumed here.
    config = TuneConfig(
        strategy="replay_trace",
        num_trials_per_iter=32,
        max_trials_per_task=32,
        max_trials_global=32,
    )
    loop_split.tune(config, Target("nvidia/geforce-rtx-3070"))
    loop_split(x, y)

    tvm.testing.assert_allclose(y.cpu().numpy(), result, atol=1e-5, rtol=1e-5)
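`loop_split` itself is not shown in the snippet; it is presumably a TVMScript PrimFunc wrapped for PyTorch interop, given that it is both tuned and called directly on torch tensors. A hedged sketch of what it might look like, assuming the `as_torch` wrapper from `tvm.contrib.torch` (the exact loop structure is illustrative):

import tvm
from tvm.script import tir as T
from tvm.contrib.torch import as_torch  # assumption: PyTorch interop wrapper


@as_torch
@T.prim_func
def loop_split(a: T.handle, b: T.handle) -> None:
    # Row-sum of a 128x128 matrix, matching the torch.sum reference above.
    A = T.match_buffer(a, [128, 128], dtype="float32")
    B = T.match_buffer(b, [128], dtype="float32")
    for i, ko in T.grid(128, 4):
        for ki in T.serial(32):
            with T.block("B"):
                vi = T.axis.spatial(128, i)
                vk = T.axis.reduce(128, ko * 32 + ki)
                with T.init():
                    B[vi] = T.float32(0)
                B[vi] = B[vi] + A[vi, vk]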
Example #4
def test_meta_schedule_tune_relay(
    model_name: str,
    input_shape: List[int],
    target: str,
):
    dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
    if model_name.startswith("bert"):
        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape),
                            dev)  # embedding size
    else:
        data = tvm.nd.array(
            np.random.randn(*input_shape).astype("float32"), dev)

    mod, params, (input_name, _, _) = get_network(name=model_name,
                                                  input_shape=input_shape)
    target = Target(target)
    with tempfile.TemporaryDirectory() as work_dir:
        rt_mod1: tvm.runtime.Module = tune_relay(
            mod=mod,
            params=params,
            target=target,
            config=TuneConfig(
                strategy="evolutionary",
                num_trials_per_iter=32,
                max_trials_per_task=20000,
                max_trials_global=20000,
                search_strategy_config={
                    "genetic_num_iters": 10,
                },
            ),
            work_dir=work_dir,
            database=JSONDatabase(
                osp.join(work_dir, "workload.json"),
                osp.join(work_dir, "records.json"),
            ),
        )
        # Compile without meta-scheduler for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=target, params=params)

        def get_output(data, lib):
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod1)
        expected_output = get_output(data, rt_mod2)
        assert np.allclose(actual_output,
                           expected_output,
                           rtol=1e-4,
                           atol=2e-4)
Example #5
def test_tune_matmul_cuda():
    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=matmul,
            target=Target("nvidia/geforce-rtx-3070"),
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                num_trials_total=32,
            ),
            work_dir=work_dir,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)
Example #6
def test_tune_matmul_cpu():
    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=matmul,
            target=Target("llvm --num-cores=16"),
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                max_trials_per_task=32,
                max_trials_global=32,
            ),
            work_dir=work_dir,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)
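As a follow-up, a hedged sketch of compiling and checking the tuned schedule; it assumes the `matmul` workload takes three 128x128 float32 buffers (a, b, c), which may not match the actual PrimFunc signature:

import numpy as np
import tvm


def build_and_run(sch, target="llvm --num-cores=16"):
    # Compile the tuned PrimFunc and execute it on random inputs.
    func = tvm.build(sch.mod["main"], target=target)
    dev = tvm.cpu()
    a = tvm.nd.array(np.random.rand(128, 128).astype("float32"), dev)
    b = tvm.nd.array(np.random.rand(128, 128).astype("float32"), dev)
    c = tvm.nd.array(np.zeros((128, 128), dtype="float32"), dev)
    func(a, b, c)
    np.testing.assert_allclose(c.numpy(), a.numpy() @ b.numpy(), rtol=1e-4)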
Example #7
def test_tune_matmul():
    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_te(
            tensors=te_workload.batch_matmul_nkkm(B=1, N=128, M=128, K=128),
            target=Target("llvm --num-cores=16"),
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                max_trials_per_task=32,
                max_trials_global=32,
            ),
            work_dir=work_dir,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)
Example #8
def test_meta_schedule_relay_lowering():
    data_shape = (1, 3, 16, 16)
    weight_shape = (8, 3, 5, 5)
    data = relay.var("data", relay.TensorType(data_shape, "float32"))
    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        kernel_layout="OIHW",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    mod = tvm.IRModule.from_expr(f)
    mod = relay.transform.InferType()(mod)

    data_sample = np.random.rand(*data_shape).astype("float32")
    weight_sample = np.random.rand(*weight_shape).astype("float32")
    params = {mod["main"].params[1].name_hint: weight_sample}

    input_name = "data"
    dev = tvm.cpu()
    target = Target("llvm --num-cores=16")
    data = tvm.nd.array(data_sample, dev)

    with tempfile.TemporaryDirectory() as work_dir:
        database = JSONDatabase(osp.join(work_dir, "workload.json"),
                                osp.join(work_dir, "records.json"))

        database.commit_tuning_record(
            TuningRecord(
                Trace([], {}),
                [0.0],
                database.commit_workload(
                    tvmgen_default_fused_nn_contrib_conv2d_NCHWc),
                target=target,
                args_info=[],
            ))

        with ApplyHistoryBest(database):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_meta_schedule": True},
            ):
                rt_mod1 = relay.build(mod, target=target, params=params)

        # Compile without meta-scheduler for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=target, params=params)

        def get_output(data, lib):
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod1)
        expected_output = get_output(data, rt_mod2)
        assert np.allclose(actual_output,
                           expected_output,
                           rtol=1e-4,
                           atol=2e-4)
Example #9
def test_meta_schedule_te2primfunc_argument_order():
    @derived_object
    class TestDummyDatabase(PyDatabase):
        def __init__(self):
            super().__init__()
            self.records = []
            self.workload_reg = []

        def has_workload(self, mod: IRModule) -> bool:
            for workload in self.workload_reg:
                if tvm.ir.structural_equal(workload.mod, mod):
                    return True
            # The database has already put in all correct workloads
            raise ValueError(
                "The workload searched for is not in the given database! "
                "Incorrect TIR was generated from the TE subgraph.")

        def commit_tuning_record(self, record: TuningRecord) -> None:
            self.records.append(record)

        def commit_workload(self, mod: IRModule) -> Workload:
            for workload in self.workload_reg:
                if tvm.ir.structural_equal(workload.mod, mod):
                    return workload
            workload = Workload(mod)
            self.workload_reg.append(workload)
            return workload

        def get_top_k(self, workload: Workload,
                      top_k: int) -> List[TuningRecord]:
            return list(
                filter(
                    lambda x: x.workload == workload,
                    sorted(self.records,
                           key=lambda x: sum(x.run_secs) / len(x.run_secs)),
                ))[:int(top_k)]

        def __len__(self) -> int:
            return len(self.records)

        def print_results(self) -> None:
            print("\n".join([str(r) for r in self.records]))

    data_shape = (1, 3, 16, 16)
    weight_shape = (8, 3, 5, 5)
    data = relay.var("data", relay.TensorType(data_shape, "float32"))
    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        kernel_layout="OIHW",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    mod = tvm.IRModule.from_expr(f)
    mod = relay.transform.InferType()(mod)

    data_sample = np.random.rand(*data_shape).astype("float32")
    weight_sample = np.random.rand(*weight_shape).astype("float32")
    params = {mod["main"].params[1].name_hint: weight_sample}

    input_name = "data"
    dev = tvm.cpu()
    target = Target("llvm --num-cores=16")
    data = tvm.nd.array(data_sample, dev)

    database = TestDummyDatabase()
    database.commit_workload(tvmgen_default_fused_layout_transform)
    database.commit_workload(tvmgen_default_fused_layout_transform_1)
    database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc)

    with ApplyHistoryBest(database):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_meta_schedule": True},
        ):
            rt_mod1 = relay.build(mod, target=target, params=params)

    # Compile without meta-scheduler for correctness check
    with tvm.transform.PassContext(opt_level=0):
        rt_mod2 = relay.build(mod, target=target, params=params)

    def get_output(data, lib):
        module = graph_executor.GraphModule(lib["default"](dev))
        module.set_input(input_name, data)
        module.run()
        return module.get_output(0).numpy()

    # Check correctness
    actual_output = get_output(data, rt_mod1)
    expected_output = get_output(data, rt_mod2)
    assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
Example #10

import pytest

import tvm
import tvm.testing
from tvm import relay
from tvm.target.target import Target
from tvm.relay.backend import Runtime, Executor, graph_executor_codegen
from tvm.relay.build_module import _reconstruct_from_deprecated_options


@pytest.mark.parametrize(
    "target,executor,runtime",
    [
        [Target("c"), None, None],
        [Target("c -runtime=c"), None,
         Runtime("crt")],
        [Target("c -system-lib"), None,
         Runtime("cpp", {"system-lib": True})],
        [
            Target("c -runtime=c -system-lib"), None,
            Runtime("crt", {"system-lib": True})
        ],
        [Target("c -executor=aot"),
         Executor("aot"), None],
        [
            Target("c -executor=aot -interface-api=c"),
            Executor("aot", {"interface-api": "c"}),
            None,
        ],
Example #11
def test_meta_schedule_tune_relay(model_name: str, batch_size: int, target: str):
    if model_name == "inception_v3" and batch_size == 1:
        pytest.skip("inception_v3 does not handle batch_size of 1")

    input_shape: Tuple[int, ...]
    input_name = "input0"
    dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda()
    if MODEL_TYPES[model_name] == MODEL_TYPE.TEXT_CLASSIFICATION:
        seq_length = 128
        input_name = "input_ids"
        input_shape = (batch_size, seq_length)
        data = tvm.nd.array(np.random.randint(0, 30521, size=input_shape), dev)  # embedding size
    else:
        if MODEL_TYPES[model_name] == MODEL_TYPE.IMAGE_CLASSIFICATION:
            input_shape = (batch_size, 3, 299, 299)
        elif MODEL_TYPES[model_name] == MODEL_TYPE.SEGMENTATION:
            input_shape = (batch_size, 3, 299, 299)
        elif MODEL_TYPES[model_name] == MODEL_TYPE.OBJECT_DETECTION:
            input_shape = (1, 3, 300, 300)
        elif MODEL_TYPES[model_name] == MODEL_TYPE.VIDEO_CLASSIFICATION:
            input_shape = (batch_size, 3, 3, 299, 299)
        else:
            raise ValueError("Unsupported model: " + model_name)
        data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev)

    output_shape: Tuple[int, int] = (batch_size, 1000)

    mod, params = get_torch_model(
        model_name=model_name,
        input_shape=input_shape,
        output_shape=output_shape,
        dtype="float32",
    )

    with tempfile.TemporaryDirectory() as work_dir:
        target = Target(target)
        database = DummyDatabase()
        rt_mod: tvm.runtime.Module = tune_relay(
            mod=mod,
            params=params,
            target=target,
            config=ReplayTraceConfig(
                num_trials_per_iter=32,
                num_trials_total=32,
            ),
            work_dir=work_dir,
            database=database,
        )
        # Compile without meta-scheduler for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=target, params=params)

        def get_output(data, lib):
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod)
        expected_output = get_output(data, rt_mod2)
        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
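The helper `get_torch_model` is not shown on this page. For the image-classification branch, a hedged sketch using torchvision and the Relay PyTorch frontend might look like the following; the function name and signature are inferred from the call site above, and `output_shape`/`dtype` are accepted only for compatibility:

import torch
from tvm import relay


def get_torch_model(model_name, input_shape, output_shape, dtype="float32"):
    # Hypothetical: trace a torchvision model and import it via the Relay
    # PyTorch frontend. Text and detection models would need different
    # handling (tokenized inputs, multiple outputs).
    import torchvision.models as models
    model = getattr(models, model_name)(pretrained=False).eval()
    example = torch.randn(*input_shape)
    scripted = torch.jit.trace(model, example)
    return relay.frontend.from_pytorch(scripted, [("input0", list(input_shape))])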
Example #12
def test_tune_matmul_cuda_tensor_core():
    n = 512
    mod = create_prim_func(te_workload.matmul_fp16(n, n, n))
    target = Target("nvidia/geforce-rtx-3070")
    config = ReplayTraceConfig(
        num_trials_per_iter=32,
        num_trials_total=320,
    )

    class DefaultTensorCore:
        @staticmethod
        def _sch_rules():
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                schedule_rule as M, )

            return [
                M.AutoInline(
                    into_producer=False,
                    into_consumer=True,
                    # into_cache_only=False,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.MultiLevelTiling(
                    structure="SSSRRSRS",
                    tile_binds=["blockIdx.x", "blockIdx.y", "threadIdx.y"],
                    # use_tensor_core=True,
                    max_innermost_factor=64,
                    vector_load_lens=[1, 2, 3, 4],
                    reuse_read=M.ReuseType(
                        req="must",
                        levels=[4],
                        scope="shared",
                    ),
                    reuse_write=M.ReuseType(
                        req="no",
                        levels=[],
                        scope="",
                    ),
                ),
                M.AutoInline(
                    into_producer=True,
                    into_consumer=True,
                    # into_cache_only=True,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.ParallelizeVectorizeUnroll(
                    max_jobs_per_core=-1,  # disable parallelize
                    max_vectorize_extent=-1,  # disable vectorize
                    unroll_max_steps=[0, 16, 64, 512, 1024],
                    unroll_explicit=True,
                ),
            ]

        @staticmethod
        def _postproc():
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                postproc as M, )

            return [
                # M.RewriteCooperativeFetch(),
                M.RewriteParallelVectorizeUnroll(),
                M.RewriteReductionBlock(),
                # M.RewriteTensorCore(),
                M.VerifyGPUCode(),
            ]

    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=mod,
            target=target,
            config=config,
            work_dir=work_dir,
            space=PostOrderApply(),
            sch_rules=DefaultTensorCore._sch_rules,
            postprocs=DefaultTensorCore._postproc,
            num_threads=None,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)

            from tvm.contrib import nvcc
            import numpy as np

            ctx = tvm.cuda(0)  # tvm.gpu(0) in older TVM releases
            if nvcc.have_tensorcore(ctx.compute_version):
                with tvm.transform.PassContext():
                    func = tvm.build(sch.mod["main"], [], "cuda")
                    print(sch.mod.script())
                    print(func.imported_modules[0].get_source())
                a_np = np.random.uniform(size=(n, n)).astype("float16")
                b_np = np.random.uniform(size=(n, n)).astype("float16")
                a = tvm.nd.array(a_np, ctx)
                b = tvm.nd.array(b_np, ctx)
                c = tvm.nd.array(np.zeros((n, n), dtype="float32"), ctx)
                evaluator = func.time_evaluator(func.entry_name,
                                                ctx,
                                                number=3,
                                                repeat=1,
                                                min_repeat_ms=40)
                print("matmul with tensor core: %f ms" %
                      (evaluator(a, b, c).mean * 1e3))

                np.testing.assert_allclose(
                    c.numpy(),
                    np.matmul(a_np.astype("float32"), b_np.astype("float32")),
                    rtol=1e-4,
                    atol=1e-4,
                )