def test_dense_dense():
    M, N, K = 128, 128, 128
    data_shape = (M, K)
    weight_shape = (N, K)
    relay_mod = tvm.IRModule.from_expr(get_dense_dense(data_shape, weight_shape))
    data_np = np.random.randn(*data_shape).astype("float32")
    weight1_np = np.random.randn(*weight_shape).astype("float32")
    weight2_np = np.random.randn(*weight_shape).astype("float32")
    target = "llvm"
    params = {"weight1": weight1_np, "weight2": weight2_np}

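    # apply_fixed_schedules invokes this callback for each task extracted from
    # the Relay module; returning True claims the task for the manual schedule,
    # returning False leaves it to the default lowering.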
    def schedule_fn(task, sch):
        if "nn_dense_nn_dense" in task.task_name:
            schedule_dense_dense(sch)
            return True
        return False

    database = apply_fixed_schedules(relay_mod, target, params, schedule_fn)
    with ms.ApplyHistoryBest(database):
        with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            lib = relay.build(relay_mod, target=target, params=params)

    dev = tvm.device(target, 0)
    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
    runtime.set_input("data", data_np)
    runtime.run()
    out = runtime.get_output(0).numpy()
    ref = get_ref(data_np, weight1_np, weight2_np)
    tvm.testing.assert_allclose(out, ref, atol=1e-4, rtol=1e-4)
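
# ---------------------------------------------------------------------------
# The snippets in this listing omit their import preamble. Judging from the
# names they use, they assume roughly the following (a sketch; helpers such as
# apply_fixed_schedules, get_network, schedule_dense, schedule_dense_dense,
# and the tvmgen_default_fused_* TVMScript modules are defined in the original
# test files, and their exact import paths vary by TVM version):
import os
import os.path as osp
import tempfile
from typing import List

import numpy as np

import tvm
import tvm.testing
from tvm import meta_schedule as ms
from tvm import relay
from tvm.contrib import graph_executor
from tvm.ir import IRModule
from tvm.script import tir as T
from tvm.target import Target
from tvm.tir import Schedule
from tvm.tir.schedule import Trace


# A minimal sketch of two helpers test_dense_dense relies on, assuming
# nn.dense semantics (out = data @ weight.T). The variable names mirror the
# "weight1"/"weight2" params bound above; schedule_dense_dense is a manual
# TIR schedule and is intentionally not reproduced here.
def get_dense_dense(data_shape, weight_shape):
    data = relay.var("data", shape=data_shape, dtype="float32")
    weight1 = relay.var("weight1", shape=weight_shape, dtype="float32")
    weight2 = relay.var("weight2", shape=weight_shape, dtype="float32")
    dense1 = relay.nn.dense(data, weight1)
    dense2 = relay.nn.dense(dense1, weight2)
    return relay.Function([data, weight1, weight2], dense2)


def get_ref(data_np, weight1_np, weight2_np):
    # NumPy reference for the double dense above
    return np.dot(np.dot(data_np, weight1_np.T), weight2_np.T)
# ---------------------------------------------------------------------------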
Example #2
def tune_each_task(
    mod,
    target,
    config,
    runner,
    work_dir,
    params,
):
    extracted_tasks = ms.extract_task_from_relay(mod, target, params)
    database = ms.database.JSONDatabase(
        path_workload=os.path.join(work_dir, "default_database_workload.json"),
        path_tuning_record=os.path.join(work_dir, "default_database_tuning_record.json"),
    )
    for task in extracted_tasks:
        # pylint: disable=protected-access
        tune_context = ms.tune.Parse._tune_context(
            tune_context=None,
            mod=ms.tune.Parse._mod(task.dispatched[0]),
            target=target,
            config=config,
            task_name=task.task_name,
            space_generator=None,
            sch_rules=None,
            postprocs=None,
            mutator_probs=None,
            num_threads=os.cpu_count(),
        )
        task_scheduler = ms.tune.Parse._task_scheduler(
            None,
            [tune_context],
            task_weights=[1.0],
            builder=ms.tune.Parse._builder(None),
            runner=ms.tune.Parse._runner(runner),
            database=database,
            max_trials=config.max_trials_per_task,
            cost_model=ms.tune.Parse._cost_model(None),
            measure_callbacks=ms.tune.Parse._callbacks(None),
        )
        # pylint: enable=protected-access
        task_scheduler.tune()
    with target, ms.ApplyHistoryBest(database):
        with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            return relay.build(mod, target=target, params=params)
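

# A hypothetical driver for tune_each_task (a sketch, not the original test's
# invocation). It assumes a Relay module `mod` and bound `params` in scope;
# ms.runner.LocalRunner is the stock in-process runner, and the trial counts
# are kept small purely for illustration.
def run_tune_each_task_example(mod, params):
    config = ms.TuneConfig(
        strategy="replay_trace",
        num_trials_per_iter=32,
        max_trials_per_task=32,
        max_trials_global=32,
    )
    with tempfile.TemporaryDirectory() as work_dir:
        return tune_each_task(
            mod,
            Target("llvm --num-cores=16"),  # `with target` above needs a Target object
            config=config,
            runner=ms.runner.LocalRunner(),
            work_dir=work_dir,
            params=params,
        )
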
def test_meta_schedule_integration_apply_history_best():
    mod, _, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224])
    database = ms.database.MemoryDatabase()
    env = ms.ApplyHistoryBest(database)
    target = Target("llvm")
    workload = database.commit_workload(MockModule)
    database.commit_tuning_record(
        ms.database.TuningRecord(
            trace=Schedule(MockModule).trace,
            workload=workload,
            run_secs=[1.0],
            target=target,
            args_info=[],
        ))
    mod = env.query(
        task_name="mock-task",
        mod=mod,
        target=target,
        dispatched=[MockModule],
    )
    assert tvm.ir.structural_equal(mod, workload.mod)
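
# `MockModule` is defined at the top of the original test file; a plausible
# TVMScript sketch (the block name and the body are assumptions, and any
# trivial single-block PrimFunc would exercise the same code path):
@tvm.script.ir_module
class MockModule:
    @T.prim_func
    def main(a: T.handle, b: T.handle) -> None:
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        A = T.match_buffer(a, (16,), "float32")
        B = T.match_buffer(b, (16,), "float32")
        for i in T.serial(0, 16):
            with T.block("copy"):
                vi = T.axis.remap("S", [i])
                B[vi] = A[vi]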
Example #4
def manual_tir_common(do_tune=False):
    M, N, K = 1024, 1024, 1024  # pylint: disable=invalid-name
    data_shape = (M, K)
    weight_shape = (N, K)

    data_dtype = "uint8"
    data = relay.var("data", shape=data_shape, dtype=data_dtype)
    weight = relay.var("weight", shape=weight_shape, dtype="int8")
    bias = relay.var("bias", shape=(weight_shape[0], ), dtype="int32")

    # dense is scheduled by the manual TIR schedule (schedule_dense, applied in
    # schedule_fn below); batch_matmul is scheduled by TE (topi/x86/batch_matmul.py)
    dense = relay.nn.dense(data, weight, out_dtype="int32")
    bias_add = relay.nn.bias_add(dense, bias) + relay.const(1, dtype="int32")
    out = relay.nn.batch_matmul(
        relay.cast(relay.expand_dims(bias_add, 0), "uint8"),
        relay.cast(relay.expand_dims(bias_add, 0), "int8"),
        out_dtype="int32",
    )

    relay_mod = tvm.IRModule.from_expr(out)

    target = "llvm -mcpu=cascadelake -num-cores 4"
    dev = tvm.device(target, 0)

    # Use a distinct name for the NumPy input so it does not shadow the Relay
    # var `data` defined above.
    data_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
    weight_np = np.random.uniform(1, 10, size=weight_shape).astype("int8")
    bias_np = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32")

    ref = (
        relay.create_executor("vm", mod=relay_mod, device=dev, target=target)
        .evaluate()(data_np, weight_np, bias_np)
        .numpy()
    )

    params = {"weight": weight_np, "bias": bias_np}

    if do_tune:
        extracted_tasks = ms.extract_task_from_relay(relay_mod, target, params)
        # Filter out tasks that we don't intend to schedule / tune with TIR.
        tune_tasks = [task for task in extracted_tasks if "dense" in task.task_name]
        config = ms.TuneConfig(
            strategy="replay_trace",
            num_trials_per_iter=64,
            max_trials_per_task=20000,
            max_trials_global=20000,
        )

        with tempfile.TemporaryDirectory() as work_dir:
            # postprocs=lambda: [] is important to prevent default post processors from
            # tampering with the manual schedule.
            database = ms.tune_extracted_tasks(
                tune_tasks,
                config,
                work_dir=work_dir,
                postprocs=lambda: [],
            )
    else:

        def schedule_fn(task, sch):
            if "dense" not in task.task_name:
                return False

            block = sch.get_block("compute")

            # Looks up schedule_rule annotation.
            # See the comment in test_tune_relay_manual_tir_vnni().
            schedule_rule = sch.get(block).annotations["schedule_rule"]

            assert "dense_vnni" in schedule_rule

            schedule_dense(block, M, False, sch)

            return True

        database = apply_fixed_schedules(relay_mod, target, params, schedule_fn)

    with ms.ApplyHistoryBest(database):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_meta_schedule": True},
        ):
            # pylint: disable=W0105
            """
            The log should say
            Warning: Cannot find workload: tvmgen_default_fused_expand_dims
            Warning: Cannot find workload: tvmgen_default_fused_cast
            Warning: Cannot find workload: tvmgen_default_fused_cast_1
            Warning: Cannot find workload: tvmgen_default_fused_nn_batch_matmul

            This means batch matmul and others are scheduled by TE, and dense (the one not warned)
            is found in the meta schedule tuning database during ApplyHistoryBest
            """
            # pylint: enable=W0105
            lib = relay.build(relay_mod, target=target, params=params)

    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))

    runtime.set_input("data", data_np)
    runtime.run()

    out = runtime.get_output(0).numpy()

    np.testing.assert_equal(out, ref)
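
# A plausible sketch of the driver referenced in the comment inside
# schedule_fn above. The VNNI instructions require a CascadeLake-class CPU,
# hence the gating decorator (assumed to be tvm.testing.requires_cascadelake,
# as in TVM's own test suite):
@tvm.testing.requires_cascadelake
def test_tune_relay_manual_tir_vnni():
    # Exercise both the fixed-schedule path and the tuning path
    manual_tir_common(do_tune=False)
    manual_tir_common(do_tune=True)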
Example #5
def test_meta_schedule_relay_lowering():
    data_shape = (1, 3, 16, 16)
    weight_shape = (8, 3, 5, 5)
    data = relay.var("data", relay.TensorType(data_shape, "float32"))
    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        kernel_layout="OIHW",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    mod = tvm.IRModule.from_expr(f)
    mod = relay.transform.InferType()(mod)

    data_sample = np.random.rand(*data_shape).astype("float32")
    weight_sample = np.random.rand(*weight_shape).astype("float32")
    params = {mod["main"].params[1].name_hint: weight_sample}

    input_name = "data"
    dev = tvm.cpu()
    target = Target("llvm --num-cores=16")
    data = tvm.nd.array(data_sample, dev)

    with tempfile.TemporaryDirectory() as work_dir:
        database = ms.database.JSONDatabase(
            osp.join(work_dir, "workload.json"),
            osp.join(work_dir, "records.json"))
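        # Commit a record with an empty trace: ApplyHistoryBest matches the
        # workload structurally, and replaying an empty trace leaves the
        # committed PrimFunc unchanged.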
        database.commit_tuning_record(
            ms.database.TuningRecord(
                Trace([], {}),
                database.commit_workload(
                    tvmgen_default_fused_nn_contrib_conv2d_NCHWc),
                [0.0],
                target=target,
                args_info=[],
            ))
        with ms.ApplyHistoryBest(database):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_meta_schedule": True},
            ):
                rt_mod1 = relay.build(mod, target=target, params=params)

        # Compile without meta-schedule for correctness check
        with tvm.transform.PassContext(opt_level=0):
            rt_mod2 = relay.build(mod, target=target, params=params)

        def get_output(data, lib):
            module = graph_executor.GraphModule(lib["default"](dev))
            module.set_input(input_name, data)
            module.run()
            return module.get_output(0).numpy()

        # Check correctness
        actual_output = get_output(data, rt_mod1)
        expected_output = get_output(data, rt_mod2)
        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
Example #6
def test_meta_schedule_te2primfunc_argument_order():
    @ms.derived_object
    class TestDummyDatabase(ms.database.PyDatabase):
        def __init__(self):
            super().__init__()
            self.records = []
            self.workload_reg = []

        def has_workload(self, mod: IRModule) -> bool:
            for workload in self.workload_reg:
                if tvm.ir.structural_equal(workload.mod, mod):
                    return True
            # All expected workloads were committed up front, so a miss means
            # incorrect TIR was generated from the TE subgraph.
            raise ValueError(
                "The workload searched for is not in the given database! "
                "Incorrect TIR was generated from the TE subgraph."
            )

        def commit_tuning_record(self, record: ms.database.TuningRecord) -> None:
            self.records.append(record)

        def commit_workload(self, mod: IRModule) -> ms.database.Workload:
            for workload in self.workload_reg:
                if tvm.ir.structural_equal(workload.mod, mod):
                    return workload
            workload = ms.database.Workload(mod)
            self.workload_reg.append(workload)
            return workload

        def get_top_k(
            self,
            workload: ms.database.Workload,
            top_k: int,
        ) -> List[ms.database.TuningRecord]:
            return sorted(
                (rec for rec in self.records if rec.workload == workload),
                key=lambda rec: sum(rec.run_secs) / len(rec.run_secs),
            )[: int(top_k)]

        def __len__(self) -> int:
            return len(self.records)

        def print_results(self) -> None:
            print("\n".join([str(r) for r in self.records]))

    data_shape = (1, 3, 16, 16)
    weight_shape = (8, 3, 5, 5)
    data = relay.var("data", relay.TensorType(data_shape, "float32"))
    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        kernel_layout="OIHW",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    mod = tvm.IRModule.from_expr(f)
    mod = relay.transform.InferType()(mod)

    data_sample = np.random.rand(*data_shape).astype("float32")
    weight_sample = np.random.rand(*weight_shape).astype("float32")
    params = {mod["main"].params[1].name_hint: weight_sample}

    input_name = "data"
    dev = tvm.cpu()
    target = Target("llvm --num-cores=16")
    data = tvm.nd.array(data_sample, dev)

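    # Pre-commit every PrimFunc that TE lowering is expected to produce;
    # has_workload raises on anything else, which is how this test pins down
    # the TE-to-TIR argument order.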
    database = TestDummyDatabase()
    database.commit_workload(tvmgen_default_fused_layout_transform)
    database.commit_workload(tvmgen_default_fused_layout_transform_1)
    database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc)

    with ms.ApplyHistoryBest(database):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_meta_schedule": True},
        ):
            rt_mod1 = relay.build(mod, target=target, params=params)

    # Compile without meta-schedule for correctness check
    with tvm.transform.PassContext(opt_level=0):
        rt_mod2 = relay.build(mod, target=target, params=params)

    def get_output(data, lib):
        module = graph_executor.GraphModule(lib["default"](dev))
        module.set_input(input_name, data)
        module.run()
        return module.get_output(0).numpy()

    # Check correctness
    actual_output = get_output(data, rt_mod1)
    expected_output = get_output(data, rt_mod2)
    assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)