Example #1
def main():
    log_file = os.path.join(ARGS.log_dir, f"{ARGS.workload}.json")
    workload_func, params = CONFIGS[ARGS.workload]
    params = params[0]  # type: ignore
    workload_func = auto_scheduler.register_workload(workload_func)

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")
    task = auto_scheduler.SearchTask(
        func=workload_func,
        args=params,
        target=ARGS.target,
        hardware_params=hardware_params,
    )
    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=ARGS.rpc_workers,
        number=3,
        repeat=1,
        min_repeat_ms=100,
        enable_cpu_cache_flush=False,
    )

    # Inspect the computational graph
    print("Computational DAG:")
    print(task.compute_dag)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=ARGS.num_trials,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,
        runner=runner,
    )
    print("Running AutoTuning:")
    task.tune(tune_option)
    print("History Best:")
    print(task.print_best(log_file))
    sch, args = task.apply_best(log_file)
    print("Lowered TIR:")
    print(tvm.lower(sch, args, simple_mode=True))
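For context, a minimal sketch of the pieces this script assumes but does not define: a workload function suitable for auto_scheduler.register_workload, and a CONFIGS table mapping workload names to (function, argument list) pairs. The matmul definition and the CONFIGS entry below are illustrative assumptions, not part of the original script.

import tvm
from tvm import te

def matmul(N, L, M, dtype):
    # A te-based workload: returns the placeholder and result tensors.
    A = te.placeholder((N, L), name="A", dtype=dtype)
    B = te.placeholder((L, M), name="B", dtype=dtype)
    k = te.reduce_axis((0, L), name="k")
    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
    return [A, B, C]

# Hypothetical CONFIGS table; main() registers the function itself, so the raw
# callable is stored here together with a list of argument tuples.
CONFIGS = {
    "matmul": (matmul, [(1024, 1024, 1024, "float32")]),
}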
Example #2
def _autoscheduler_test_helper(model,
                               tmpdir_name,
                               early_stopping=1,
                               prior_records=None):
    tvmc_model = tvmc.frontends.load_model(model)
    log_file = os.path.join(tmpdir_name, "autoscheduler.json")

    hardware_params = auto_scheduler.HardwareParams(num_cores=4, target="llvm")

    tvmc.tune(
        tvmc_model,
        target="llvm",
        tuning_records=log_file,
        prior_records=prior_records,
        early_stopping=early_stopping,
        enable_autoscheduler=True,
        trials=2,
        hardware_params=hardware_params,
    )

    # testing whether the log file was produced
    assert path.exists(log_file), "autoscheduler log file should exist"

    with auto_scheduler.ApplyHistoryBest(log_file) as best:
        assert isinstance(
            best, auto_scheduler.dispatcher.ApplyHistoryBest
        ), "unable to load the best results of tuning"

    return log_file
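A plausible way this helper is driven from a test, assuming pytest's built-in tmp_path fixture and a model fixture named keras_simple provided elsewhere in the suite (both names are assumptions here):

import os

def test_tune_autoscheduler(keras_simple, tmp_path):
    # keras_simple is assumed to be a fixture yielding a model path/object that
    # tvmc.frontends.load_model understands.
    log_file = _autoscheduler_test_helper(keras_simple, str(tmp_path))
    assert os.path.getsize(log_file) > 0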
Example #3
def generate_sketches(
    workload_func, args, target, print_for_debug=False, init_search_callbacks=None
):
    # NOTE: test_cpu_matmul_sketch and test_cpu_max_pool2d_sketch assume 4 cores to trigger all
    # possible sketch generations.
    task = auto_scheduler.SearchTask(
        func=workload_func,
        args=args,
        target=target,
        hardware_params=auto_scheduler.HardwareParams(num_cores=4, target=target),
    )
    policy = auto_scheduler.SketchPolicy(
        task, verbose=0, init_search_callbacks=init_search_callbacks
    )
    return policy.generate_sketches(print_for_debug)
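A typical call site, assuming matmul_auto_scheduler_test is a workload already registered by the shared test utilities; how many sketches come back depends on the hardware parameters above.

def test_cpu_matmul_sketch():
    # Generate sketches for a 512x512x512 matmul on an LLVM (CPU) target and dump them.
    sketches = generate_sketches(matmul_auto_scheduler_test, (512, 512, 512), "llvm")
    for i, sketch in enumerate(sketches):
        print(f"=== Sketch {i} ===")
        print(sketch)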
Example #4
def drive_tune(args):
    """Invoke auto-tuning with command line arguments

    Parameters
    ----------
    args: argparse.Namespace
        Arguments from command line parser.
    """
    tvmc_model = frontends.load_model(args.FILE,
                                      args.model_format,
                                      shape_dict=args.input_shapes)

    # Specify hardware parameters, although they'll only be used if autoscheduling.
    hardware_params = auto_scheduler.HardwareParams(
        num_cores=args.num_cores,
        vector_unit_bytes=args.vector_unit_bytes,
        cache_line_bytes=args.cache_line_bytes,
        max_shared_memory_per_block=args.max_shared_memory_per_block,
        max_local_memory_per_block=args.max_local_memory_per_block,
        max_threads_per_block=args.max_threads_per_block,
        max_vthread_extent=args.max_vthread_extent,
        warp_size=args.warp_size,
        target=args.target,
        target_host=args.target_host,
    )

    if args.rpc_tracker:
        parsed_url = urlparse("//%s" % args.rpc_tracker)
        rpc_hostname = parsed_url.hostname
        rpc_port = parsed_url.port or 9090
        logger.info("RPC tracker hostname: %s", rpc_hostname)
        logger.info("RPC tracker port: %s", rpc_port)

        if not args.rpc_key:
            raise common.TVMCException(
                "need to provide an RPC tracker key (--rpc-key) for remote tuning"
            )
    else:
        rpc_hostname = None
        rpc_port = None

    tune_model(
        tvmc_model,
        args.target,
        tuning_records=args.output,
        prior_records=args.tuning_records,
        enable_autoscheduler=args.enable_autoscheduler,
        rpc_key=args.rpc_key,
        hostname=rpc_hostname,
        port=rpc_port,
        trials=args.trials,
        target_host=args.target_host,
        tuner=args.tuner,
        min_repeat_ms=args.min_repeat_ms,
        early_stopping=args.early_stopping,
        desired_layout=args.desired_layout,
        timeout=args.timeout,
        repeat=args.repeat,
        number=args.number,
        parallel=args.parallel,
        hardware_params=hardware_params,
        include_simple_tasks=args.include_simple_tasks,
        log_estimated_latency=args.log_estimated_latency,
        additional_target_options=reconstruct_target_args(args),
    )
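The RPC tracker handling above leans on urlparse accepting a bare host:port string once a "//" prefix is prepended; a small standalone illustration with made-up values:

from urllib.parse import urlparse

parsed = urlparse("//192.168.1.10:9190")
print(parsed.hostname)  # 192.168.1.10
print(parsed.port)      # 9190
# Without an explicit port, the code above falls back to 9090:
print(urlparse("//192.168.1.10").port or 9090)  # 9090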
Example #5
def test_gpu_feature():
    # Use records to build a complicated GPU program
    json_records = "\n".join(
        (
            """{"i": [["[\\"matmul_auto_scheduler_test\\", 512, 512, 512]", "cuda"], [[], [["CHW", 2, "local"], ["SP", 2, 0, 512, [1, 16, 32, 1], 1], ["SP", 2, 5, 512, [4, 1, 1, 16], 1], ["SP", 2, 10, 512, [1, 2], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 3, 0, 1, 3], ["FSP", 3, 4, 2, 3], ["RE", 3, [0, 4, 1, 5, 2, 6, 3, 7]], ["FU", 2, [0, 1]], ["FU", 3, [0, 1]], ["FU", 2, [1, 2]], ["FU", 3, [1, 2]], ["FU", 2, [2, 3]], ["FU", 3, [2, 3]], ["CA", 2, 3, 2], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 3], ["FU", 2, [0, 1]], ["FFSP", 2, 0, [1, 2], 1, 1], ["AN", 2, 1, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 3], ["FU", 1, [0, 1]], ["FFSP", 1, 0, [1, 2], 1, 1], ["AN", 1, 1, 6], ["AN", 5, 0, 5], ["AN", 5, 1, 4], ["AN", 5, 2, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00536798], 0, 2.49277, 1585564852], "v": "v0.1"}""",
        )
    )

    # load states
    with tempfile.NamedTemporaryFile(mode="w") as f:
        f.write(json_records)
        f.flush()
        inputs, _ = auto_scheduler.RecordReader(f.name).read_lines()

        inp = inputs[0]
        task = auto_scheduler.SearchTask(
            workload_key=inp.task.workload_key,
            target=inp.task.target,
            hardware_params=auto_scheduler.HardwareParams(
                100000, 16, 64, 1 << 30, 1 << 30, 1 << 30, 1 << 30, 1 << 30
            ),
        )

        state = task.compute_dag.infer_bound_from_state(inputs[0].state)
        fea = auto_scheduler.feature.get_per_store_features_from_states([state], task)[0]
        names = auto_scheduler.feature.get_per_store_feature_names()

        # build feature dict
        fea_dicts = []
        for i in range(len(fea)):
            tmp_dict = {}
            for j in range(len(names)):
                tmp_dict[names[j]] = fea[i][j]
            fea_dicts.append(tmp_dict)

        """
        lowered IR:

        Placeholder: A, B
        blockIdx.x i.0@j.0@ (0,8)
          vthread i.1@j.1@ (0,4)
            threadIdx.x i.2@j.2@ (0,16)
              C.local auto_unroll: 1024
              for k.0 (0,256)
                for ax0@ax1@.0 (0,8)
                  threadIdx.x ax0@ax1@.1 (0,16)
                    B.shared = ...
                for ax0@ax1@.0 (0,64)
                  threadIdx.x ax0@ax1@.1 (0,16)
                    A.shared = ...
                for i_c.3 (0,32)
                  for k.2 (0,2)
                    for j_c.4 (0,16)
                      C.local = ...
              for i.3 (0,32)
                for j.3 (0,16)
                  C = ...
        """

        # check gpu-related features
        assert fequal(fea_dicts[0]["blockIdx_x_len"], math.log2(8 + 1))
        assert fequal(fea_dicts[0]["vthread_len"], math.log2(4 + 1))
        assert fequal(fea_dicts[1]["threadIdx_x_len"], math.log2(16 + 1))
        assert fequal(fea_dicts[0]["threadIdx_y_len"], math.log2(1 + 1))
        assert fequal(fea_dicts[2]["blockIdx_z_len"], math.log2(1 + 1))
        assert fequal(fea_dicts[0]["is_gpu"], 1.0)
Example #6
def drive_tune(args):
    """Invoke auto-tuning with command line arguments

    Parameters
    ----------
    args: argparse.Namespace
        Arguments from command line parser.
    """
    # Validate extra arguments before importing the model, so that obvious
    # errors are surfaced early.
    if args.rpc_tracker:
        parsed_url = urlparse("//%s" % args.rpc_tracker)
        rpc_hostname = parsed_url.hostname
        rpc_port = parsed_url.port or 9090
        logger.info("RPC tracker hostname: %s", rpc_hostname)
        logger.info("RPC tracker port: %s", rpc_port)

        if not args.rpc_key:
            raise common.TVMCException(
                "need to provide an RPC tracker key (--rpc-key) for remote tuning"
            )

    target, extra_targets = common.target_from_cli(args.target)
    target_host = args.target_host
    target, target_host = Target.check_and_update_host_consist(
        target, target_host)
    mod, params = frontends.load_model(args.FILE,
                                       args.model_format,
                                       shape_dict=args.input_shapes)

    for codegen_from_cli in extra_targets:
        codegen = composite_target.get_codegen_by_target(
            codegen_from_cli["name"])
        partition_function = codegen["pass_pipeline"]
        mod = partition_function(mod, params, **codegen_from_cli["opts"])

    # min_repeat_ms should be:
    # a. the value provided by the user, if any, or
    # b. 0ms in case target is "cpu"; otherwise 1000ms
    if args.min_repeat_ms is not None:
        min_repeat_ms = args.min_repeat_ms
    else:
        min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000
        logger.debug("Default --min-repeat-ms for this target is %s",
                     min_repeat_ms)

    if args.rpc_tracker:
        runner_ctor = auto_scheduler.RPCRunner if args.enable_autoscheduler else autotvm.RPCRunner
        runner = runner_ctor(
            key=args.rpc_key,
            host=rpc_hostname,
            port=rpc_port,
            number=args.number,
            repeat=args.repeat,
            n_parallel=args.parallel,
            timeout=args.timeout,
            min_repeat_ms=min_repeat_ms,
        )
    else:
        logger.info("starting localhost tuning")
        runner_ctor = (auto_scheduler.LocalRunner
                       if args.enable_autoscheduler else autotvm.LocalRunner)
        runner = runner_ctor(
            number=args.number,
            repeat=args.repeat,
            timeout=args.timeout,
            min_repeat_ms=min_repeat_ms,
        )

    if args.enable_autoscheduler:
        # Specify hardware parameters
        hardware_params = auto_scheduler.HardwareParams(
            args.num_cores,
            args.vector_unit_bytes,
            args.cache_line_bytes,
            args.max_shared_memory_per_block,
            args.max_local_memory_per_block,
            args.max_threads_per_block,
            args.max_vthread_extent,
            args.warp_size,
        )
        tasks, weights = autoscheduler_get_tuning_tasks(
            mod=mod,
            params=params,
            target=target,
            alter_layout=args.desired_layout,
            hardware_params=hardware_params,
            include_simple_tasks=args.include_simple_tasks,
        )

        # Create the autoscheduler tuning options
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=args.trials,
            measure_callbacks=[auto_scheduler.RecordToFile(args.output)],
            runner=runner,
            early_stopping=args.early_stopping,
        )

        # Schedule the tasks (i.e., produce a schedule for each task)
        schedule_tasks(tasks, weights, tuning_options, args.tuning_records,
                       args.log_estimated_latency)
    else:
        tasks = autotvm_get_tuning_tasks(
            mod=mod,
            params=params,
            target=target,
            alter_layout=args.desired_layout,
        )

        tuning_option = {
            "tuner": args.tuner,
            "trials": args.trials,
            "early_stopping": args.early_stopping,
            "measure_option": autotvm.measure_option(
                builder=autotvm.LocalBuilder(build_func="default"), runner=runner
            ),
            "tuning_records": args.tuning_records,
        }
        logger.debug(" tuning options: %s", tuning_option)

        tune_tasks(tasks, args.output, **tuning_option)
Example #7
def test_stage_order():
    """Test if the stage order is preserved when recovering a DAG."""
    N = 512
    A, B, C, D, E = parallel_matmul_auto_scheduler_test(N)
    sch = te.create_schedule([D.op, E.op])
    (D_local, ) = sch.cache_write([D], "local")
    (E_local, ) = sch.cache_write([E], "local")
    sch.cache_read(A, "shared", [D_local])
    sch.cache_read(B, "shared", [D_local])
    sch.cache_read(A, "shared", [E_local])
    sch.cache_read(C, "shared", [E_local])

    dag = auto_scheduler.ComputeDAG(sch)
    stage_ops_1 = dag.get_init_state().stage_ops

    # 3 placeholders, 4 x.shared, 2 {D,E}.local, 2 {D,E} compute
    assert len(stage_ops_1) == 11

    # Cache read stage should follow the source stage
    for idx, op in enumerate(stage_ops_1):
        if op.name == "A":
            assert (stage_ops_1[idx + 1].name == "A.d.shared"
                    and stage_ops_1[idx + 2].name == "A.shared")
        elif op.name in ["B", "C"]:
            assert stage_ops_1[idx + 1].name == "%s.shared" % op.name

    # Serialize and deserialize the ComputeDAG constructed by a schedule.
    loaded_dag = pickle.loads(pickle.dumps(dag))
    assert str(loaded_dag.get_init_state()) == str(dag.get_init_state())
    assert len(loaded_dag.get_init_state().stage_ops) == len(
        dag.get_init_state().stage_ops)

    # Apply the same schedule to Ansor state and it should have the same stage order
    dag = auto_scheduler.ComputeDAG([A, B, C, D, E])
    state = dag.get_init_state()

    D_local = state.cache_write(D, "local")
    E_local = state.cache_write(E, "local")
    state.cache_read(A, "shared", [D_local])
    state.cache_read(B, "shared", [D_local])
    state.cache_read(A, "shared", [E_local])
    state.cache_read(C, "shared", [E_local])

    stage_ops_2 = state.stage_ops
    assert len(stage_ops_1) == len(stage_ops_2)

    # Cache read stage should follow the source stage
    for op1, op2 in zip(stage_ops_1, stage_ops_2):
        assert op1.name == op2.name

    # Serialize and deserialize the ComputeDAG constructed by a list of tensor ops.
    loaded_dag = pickle.loads(pickle.dumps(dag))
    assert str(loaded_dag.get_init_state()) == str(dag.get_init_state())
    assert len(loaded_dag.get_init_state().stage_ops) == len(
        dag.get_init_state().stage_ops)

    # Serialize and deserialize the search task.
    task = auto_scheduler.SearchTask(
        dag,
        json.dumps(("test-key", )),
        tvm.target.Target("llvm"),
        hardware_params=auto_scheduler.HardwareParams(100000, 16, 64, 0, 0, 0,
                                                      0, 0),
    )

    task2 = pickle.loads(pickle.dumps(task))
    assert "test-key" in auto_scheduler.workload_registry.WORKLOAD_FUNC_REGISTRY
    assert str(task.dag.get_init_state()) == str(task2.dag.get_init_state())
    assert len(task.dag.get_init_state().stage_ops) == len(
        task2.dag.get_init_state().stage_ops)
    assert task.workload_key == task2.workload_key
    assert str(task.target) == str(task2.target)
    assert task.hardware_params.num_cores == task2.hardware_params.num_cores
    assert task.hardware_params.vector_unit_bytes == task2.hardware_params.vector_unit_bytes
    assert task.hardware_params.cache_line_bytes == task2.hardware_params.cache_line_bytes
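parallel_matmul_auto_scheduler_test comes from the shared auto_scheduler test utilities and is not shown here; a plausible minimal definition, assumed for illustration, builds two matmuls D = A x B and E = A x C that share the input A, which is why the DAG above has three placeholders.

from tvm import te, auto_scheduler

@auto_scheduler.register_workload
def parallel_matmul_auto_scheduler_test(N):
    # Two square matmuls sharing the input A.
    A = te.placeholder((N, N), name="A")
    B = te.placeholder((N, N), name="B")
    C = te.placeholder((N, N), name="C")
    k1 = te.reduce_axis((0, N), name="k")
    D = te.compute((N, N), lambda i, j: te.sum(A[i, k1] * B[k1, j], axis=k1), name="D")
    k2 = te.reduce_axis((0, N), name="k")
    E = te.compute((N, N), lambda i, j: te.sum(A[i, k2] * C[k2, j], axis=k2), name="E")
    return A, B, C, D, E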
Example #8
def main():
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")

    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
        timeout=ARGS.rpc_config.session_timeout_sec,
    )

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.model_name}")
    onnx_model = onnx.load(ARGS.onnx_path)
    shape_dict = {}
    for item in ARGS.input_shape:
        print(f"  input_name : {item['name']}")
        print(f"  input_shape: {item['shape']}")
        print(f"  input_dtype: {item['dtype']}")
        shape_dict[item["name"]] = item["shape"]
    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
    input_data = {
        item["name"]: generate_input_data(item["shape"], item["dtype"])
        for item in ARGS.input_shape
    }

    with ms.Profiler() as profiler:
        tasks, task_weights = auto_scheduler.extract_tasks(
            mod["main"],
            params,
            target=ARGS.target,
            hardware_params=hardware_params,
        )
        for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
            print(f"==== Task {idx}: {task.desc} "
                  f"(weight {task_weight} key: {task.workload_key}) =====")
            print(task.compute_dag)

        if ARGS.num_trials > 0:
            tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
            tuner.tune(
                auto_scheduler.TuningOptions(
                    num_measure_trials=ARGS.num_trials,
                    runner=runner,
                    measure_callbacks=[
                        auto_scheduler.RecordToFile(log_file),
                    ],
                ),
                adaptive_training=ARGS.adaptive_training,
            )

        relay_build = {
            "graph": relay.build,
            "vm": relay.vm.compile
        }[ARGS.backend]
        with auto_scheduler.ApplyHistoryBest(log_file):
            with tvm.transform.PassContext(
                    opt_level=3,
                    config={"relay.backend.use_auto_scheduler": True},
            ):
                lib = relay_build(
                    mod,
                    target=ARGS.target,
                    params=params,
                )
    print("Tuning Time:")
    print(profiler.table())

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=create_timer(ARGS.backend),
        backend=ARGS.backend,
    )
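generate_input_data is not shown in this script; a minimal stand-in, consistent with the inline input generation used in the next example, might be:

import numpy as np

def generate_input_data(input_shape, input_dtype):
    # Random floats for floating-point dtypes, bounded random integers otherwise.
    if input_dtype.startswith("float"):
        return np.random.uniform(size=input_shape).astype(input_dtype)
    return np.random.randint(low=0, high=10000, size=input_shape, dtype=input_dtype)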
Example #9
def main():
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.model_name}.json")

    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
    )

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(
                ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(
                ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.model_name}")
    onnx_model = onnx.load(ARGS.onnx_path)
    shape_dict = {}
    for item in ARGS.input_shape:
        print(f"  input_name: {item['name']}")
        print(f"  input_shape: {item['shape']}")
        print(f"  input_dtype: {item['dtype']}")
        shape_dict[item["name"]] = item["shape"]
    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)
    tasks, task_weights = auto_scheduler.extract_tasks(
        mod["main"],
        params,
        target=ARGS.target,
        hardware_params=hardware_params,
    )
    for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)):
        print(
            f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) ====="
        )
        print(task.compute_dag)

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tuner.tune(
        auto_scheduler.TuningOptions(
            num_measure_trials=ARGS.num_trials,
            runner=runner,
            measure_callbacks=[
                auto_scheduler.RecordToFile(log_file),
            ],
        ))

    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
                opt_level=3,
                config={"relay.backend.use_auto_scheduler": True},
        ):
            lib = relay.build(
                mod,
                target=ARGS.target,
                params=params,
            )
    graph, rt_mod, params = lib.graph_json, lib.lib, lib.params
    input_data = {}
    for item in ARGS.input_shape:
        input_name, input_shape, input_dtype = item["name"], item["shape"], item["dtype"]
        if input_dtype.startswith("float"):
            input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype)
        else:
            input_data[input_name] = np.random.randint(
                low=0, high=10000, size=input_shape, dtype=input_dtype
            )

    def f_timer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.graph_executor import GraphModule

        # pylint: enable=import-outside-toplevel

        mod = GraphModule(rt_mod["default"](dev))
        for input_name, input_value in input_data.items():
            mod.set_input(input_name, input_value)
        ftimer = mod.module.time_evaluator(
            "run",
            dev,
            min_repeat_ms=500,
            repeat=3,
        )
        results = list(np.array(ftimer().results) * 1000.0)  # type: ignore
        print("Running time in time_evaluator: ", results)

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_timer,
    )

    def f_per_layer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.debugger.debug_executor import create

        # pylint: enable=import-outside-toplevel
        mod = create(graph, rt_mod, dev)
        for input_name, input_value in input_data.items():
            mod.set_input(input_name, input_value)
        graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]]
        graph_time = mod.run_individual(number=10,
                                        repeat=1,
                                        min_repeat_ms=5000)
        print("|graph_nodes| = ", len(graph_nodes))
        print("|graph_time| = ", len(graph_time))
        graph_nodes_time = {
            k: float(v)
            for k, v in zip(graph_nodes, graph_time)
        }
        for k, v in graph_nodes_time.items():
            print(f"{k} : {v:.3f}")

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=rt_mod,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_per_layer,
    )
Example #10
def main():
    log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json")

    runner = auto_scheduler.RPCRunner(
        key=ARGS.rpc_key,
        host=ARGS.rpc_host,
        port=ARGS.rpc_port,
        n_parallel=cpu_count(logical=True),
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
        timeout=ARGS.rpc_config.session_timeout_sec,
    )

    if ARGS.target.kind.name == "llvm":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=int(ARGS.target.attrs["num-cores"]),
            target=ARGS.target,
        )
    elif ARGS.target.kind.name == "cuda":
        hardware_params = auto_scheduler.HardwareParams(
            num_cores=-1,
            vector_unit_bytes=16,
            cache_line_bytes=64,
            max_shared_memory_per_block=int(ARGS.target.attrs["max_shared_memory_per_block"]),
            max_threads_per_block=int(ARGS.target.attrs["max_threads_per_block"]),
            # The value `max_local_memory_per_block` is not used in AutoScheduler,
            # but is required by the API.
            max_local_memory_per_block=12345678,
            max_vthread_extent=8,
            warp_size=32,
        )
    else:
        raise NotImplementedError(f"Unsupported target {ARGS.target}")

    describe()
    print(f"Workload: {ARGS.workload}")
    with ms.Profiler() as profiler:
        # Same as MetaSchedule Tune TE
        # Does not count ApplyHistoryBest time

        workload_func, params = CONFIGS[ARGS.workload]
        params = params[0]  # type: ignore
        workload_func = auto_scheduler.register_workload(workload_func)

        task = auto_scheduler.SearchTask(
            func=workload_func,
            args=params,
            target=ARGS.target,
            hardware_params=hardware_params,
        )
        # Inspect the computational graph
        print("Computational DAG:")
        print(task.compute_dag)
        tune_option = auto_scheduler.TuningOptions(
            num_measure_trials=ARGS.num_trials,
            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
            verbose=2,
            runner=runner,
        )
        if ARGS.num_trials > 0:
            print("Running AutoTuning:")
            task.tune(tune_option, adaptive_training=ARGS.adaptive_training)

    print("Tuning Time:")
    print(profiler.table())

    print("History Best:")
    print(task.print_best(log_file))

    sch, args = task.apply_best(log_file)
    print("Lowered TIR:")
    print(tvm.lower(sch, args, simple_mode=True))
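A natural follow-up, not part of the original script, is to build the schedule returned by apply_best and time it; the shapes and dtype below assume a 1024x1024 float32 matmul-style workload and are purely illustrative.

import numpy as np

# Build the best schedule found during tuning and measure it on the target device.
func = tvm.build(sch, args, target=ARGS.target)
dev = tvm.device(str(ARGS.target), 0)

a = tvm.nd.array(np.random.uniform(size=(1024, 1024)).astype("float32"), device=dev)
b = tvm.nd.array(np.random.uniform(size=(1024, 1024)).astype("float32"), device=dev)
c = tvm.nd.empty((1024, 1024), device=dev)

evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
print("Mean execution time: %.3f ms" % (evaluator(a, b, c).mean * 1000))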