Example #1
def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, postprocs):
    """Test tuning."""
    tgt = "cuda" if "nvidia" in target else target
    dev = tvm.device(tgt, 0)

    ref = (
        relay.create_executor("vm", mod=relay_mod, device=dev, target=tgt)
        .evaluate()(*[data_np, weight_np])
        .numpy()
    )

    params = {"weight": weight_np}

    extracted_tasks = extract_task_from_relay(relay_mod, target, params)

    tune_tasks = list(
        filter(
            lambda task: op_name in task.task_name,
            extracted_tasks,
        )
    )

    with tempfile.TemporaryDirectory() as work_dir:
        database = tune_extracted_tasks(
            tune_tasks,
            CONFIG,
            work_dir=work_dir,
            sch_rules=lambda: sch_rules,
            postprocs=lambda: postprocs,
        )

    with ApplyHistoryBest(database):
        with tvm.transform.PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            lib = relay.build(relay_mod, target=target, params=params)

    if "cascadelake" in target:
        asm = lib.lib.get_source("asm")
        assert "vpdpbusd" in asm

    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))

    runtime.set_input("data", data_np)
    runtime.run()

    out = runtime.get_output(0).numpy()

    np.testing.assert_equal(out, ref)
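Below is a minimal usage sketch for this helper. The dense workload, the shapes, and the target string are illustrative assumptions; `sch_rules` and `postprocs` are assumed to be the hardware-specific rule lists defined elsewhere in the test suite.

# Hypothetical invocation of tune_and_test (shapes and target are placeholders).
data = relay.var("data", shape=(32, 64), dtype="uint8")
weight = relay.var("weight", shape=(128, 64), dtype="int8")
relay_mod = tvm.IRModule.from_expr(relay.nn.dense(data, weight, out_dtype="int32"))

data_np = np.random.uniform(1, 10, size=(32, 64)).astype("uint8")
weight_np = np.random.uniform(1, 10, size=(128, 64)).astype("int8")

tune_and_test(
    relay_mod,
    data_np,
    weight_np,
    "dense",
    "llvm -mcpu=cascadelake -num-cores 4",
    sch_rules,  # assumed: VNNI-specific schedule rules
    postprocs,  # assumed: matching postprocessors
)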
Example #2
def tune_relay_auto(
    mod: IRModule,
    target: Union[str, Target],
    config: Optional[TuneConfig],
    work_dir: str,
    backend: str = "graph",
    params: Optional[Dict[str, NDArray]] = None,
) -> Union[Module, vm.Executable]:
    """A wrapper of `tune_relay` but provide a default setting for the config.

    Parameters
    ----------
    mod : IRModule
        The module to tune.
    target : Union[str, Target]
        The target to tune for.
    config : Optional[TuneConfig]
        The search strategy config. If None, a default config is created
        based on the number of extracted tasks.
    work_dir : str
        The working directory to save intermediate results.
    backend : str = "graph"
        The backend to use for relay compilation (graph / vm).
    params : Optional[Dict[str, tvm.runtime.NDArray]]
        The associated parameters of the program.

    Returns
    -------
    lib : Union[Module, tvm.runtime.vm.Executable]
        The built runtime module or vm Executable for the given relay workload.
    """
    target = default_config.target(target)
    extracted_tasks = extract_task_from_relay(mod, target, params)
    if config is None:
        config = TuneConfig(
            num_trials_per_iter=16,
            max_trials_global=16 * len(extracted_tasks),
        )
    database = tune_extracted_tasks(extracted_tasks, config, work_dir)
    relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend]
    with target, autotvm_silencer(), ApplyHistoryBest(database):
        with PassContext(
            opt_level=3,
            config={
                "relay.backend.use_meta_schedule": True,
                "relay.backend.use_meta_schedule_dispatch": target.kind.name != "cuda",
            },
        ):
            return relay_build(mod, target=target, params=params)
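For reference, a hedged usage sketch of this wrapper; `mod` and `params` stand in for an existing Relay module and its weights, and the target string is a placeholder.

# Hypothetical call: with config=None, a default TuneConfig is derived from
# the number of extracted tasks (16 trials per task).
lib = tune_relay_auto(
    mod,
    target="llvm -num-cores 4",
    config=None,
    work_dir="./ms_work_dir",
    backend="graph",
    params=params,
)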
def _test_bert_int8(target, sch_rules, postprocs):
    relay_mod, params, input_info = load_quantized_bert_base()

    relay_mod = relay.transform.FastMath()(relay_mod)

    extracted_tasks = extract_task_from_relay(relay_mod, target, params)

    tune_tasks = []

    for task in filter(
        lambda task: "dense" in task.task_name or "batch_matmul" in task.task_name,
        extracted_tasks,
    ):
        relay_func = list(task.mod.functions.values())[0]
        out_type = relay_func.body.checked_type

        if out_type.dtype != "float32":
            tune_tasks.append(task)

    with tempfile.TemporaryDirectory() as work_dir:
        database = tune_extracted_tasks(
            tune_tasks,
            config,
            work_dir=work_dir,
            sch_rules=lambda: sch_rules,
            postprocs=lambda: postprocs,
        )

    with ApplyHistoryBest(database):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_meta_schedule": True},
        ):
            lib = relay.build(relay_mod, target=target, params=params)

    dev = tvm.device("cuda" if "nvidia" in target else target, 0)
    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))

    inputs = []

    for name, shape in input_info:
        arr = np.random.uniform(1, 10, size=shape).astype("int64")
        runtime.set_input(name, arr)
        inputs.append(arr)

    print(runtime.benchmark(dev, number=1, repeat=50).mean)
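A hedged sketch of how this test might be driven; `vnni_sch_rules`/`vnni_postprocs` and the CUDA counterparts are assumed to be the rule lists defined next to this test, not part of the code above.

# Hypothetical invocations for an x86 VNNI target and an NVIDIA GPU target.
_test_bert_int8("llvm -mcpu=cascadelake -num-cores 4", vnni_sch_rules, vnni_postprocs)
_test_bert_int8("nvidia/geforce-rtx-3070", dp4a_sch_rules, dp4a_postprocs)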
def manual_tir_common(do_tune=False):
    M, N, K = 1024, 1024, 1024  # pylint: disable=invalid-name
    data_shape = (M, K)
    weight_shape = (N, K)

    data_dtype = "uint8"
    data = relay.var("data", shape=data_shape, dtype=data_dtype)
    weight = relay.var("weight", shape=weight_shape, dtype="int8")
    bias = relay.var("bias", shape=(weight_shape[0], ), dtype="int32")

    # dense is tuned by the TIR schedule above, bmm is scheduled by TE (topi/x86/batch_matmul.py)
    dense = relay.nn.dense(data, weight, out_dtype="int32")
    bias_add = relay.nn.bias_add(dense, bias) + relay.const(1, dtype="int32")
    out = relay.nn.batch_matmul(
        relay.cast(relay.expand_dims(bias_add, 0), "uint8"),
        relay.cast(relay.expand_dims(bias_add, 0), "int8"),
        out_dtype="int32",
    )

    relay_mod = tvm.IRModule.from_expr(out)

    target = "llvm -mcpu=cascadelake -num-cores 4"
    dev = tvm.device(target, 0)

    data = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
    weight_np = np.random.uniform(1, 10, size=weight_shape).astype("int8")
    bias_np = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32")

    ref = (
        relay.create_executor("vm", mod=relay_mod, device=dev, target=target)
        .evaluate()(*[data, weight_np, bias_np])
        .numpy()
    )

    params = {"weight": weight_np, "bias": bias_np}

    if do_tune:
        extracted_tasks = extract_task_from_relay(relay_mod, target, params)

        # Filter out tasks that we don't intend to schedule / tune with TIR.
        tune_tasks = list(
            filter(
                lambda task: "dense" in task.task_name,
                extracted_tasks,
            )
        )
        config = TuneConfig(
            strategy="replay_trace",
            num_trials_per_iter=64,
            max_trials_per_task=20000,
            max_trials_global=20000,
        )

        with tempfile.TemporaryDirectory() as work_dir:
            # postprocs=lambda: [] is important to prevent default post processors from
            # tampering with the manual schedule.
            database = tune_extracted_tasks(
                tune_tasks,
                config,
                work_dir=work_dir,
                postprocs=lambda: [],
            )
    else:

        def schedule_fn(task, sch):
            if "dense" not in task.task_name:
                return False

            block = sch.get_block("compute")

            # Looks up schedule_rule annotation.
            # See the comment in test_tune_relay_manual_tir_vnni().
            schedule_rule = sch.get(block).annotations["schedule_rule"]

            assert "dense_vnni" in schedule_rule

            schedule_dense(block, M, False, sch)

            return True

        database = apply_fixed_schedules(relay_mod, target, params, schedule_fn)

    with ApplyHistoryBest(database):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_meta_schedule": True},
        ):
            # pylint: disable=W0105
            """
            The log should say
            Warning: Cannot find workload: tvmgen_default_fused_expand_dims
            Warning: Cannot find workload: tvmgen_default_fused_cast
            Warning: Cannot find workload: tvmgen_default_fused_cast_1
            Warning: Cannot find workload: tvmgen_default_fused_nn_batch_matmul

            This means batch matmul and others are scheduled by TE, and dense (the one not warned)
            is found in the meta schedule tuning database during ApplyHistoryBest
            """
            # pylint: enable=W0105
            lib = relay.build(relay_mod, target=target, params=params)

    runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))

    runtime.set_input("data", data)
    runtime.run()

    out = runtime.get_output(0).numpy()

    np.testing.assert_equal(out, ref)
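A short usage note: the flag selects between the two code paths sketched above.

# do_tune=False applies the fixed manual TIR schedule via apply_fixed_schedules;
# do_tune=True runs replay-trace tuning over the extracted dense tasks instead.
manual_tir_common(do_tune=False)
manual_tir_common(do_tune=True)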