Example #1
    def __setstate__(self, state):
        # Register the workload if needed
        try:
            workload = json.loads(state["workload_key"])
        except Exception:  # pylint: disable=broad-except
            raise RuntimeError("Invalid workload key %s" %
                               state["workload_key"])

        # workload[0] is either the compute function name or the ComputeDAG hash.
        # The compute functions are already registered when importing TVM, so here
        # we only register the ComputeDAG workloads. If the same workload has
        # already been registered, the later registration overrides the previous one.
        if workload[0] not in WORKLOAD_FUNC_REGISTRY:
            register_workload_tensors(state["workload_key"],
                                      state["compute_dag"].tensors)

        state["target"], state["target_host"] = Target.canon_target_and_host(
            state["target"], state["target_host"])
        self.__init_handle_by_constructor__(
            _ffi_api.SearchTask,
            state["compute_dag"],
            state["workload_key"],
            state["target"],
            state["target"].host,
            state["hardware_params"],
            state["layout_rewrite_option"],
            state["task_input_names"],
            state["desc"],
        )
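
The __setstate__ above runs whenever a SearchTask is unpickled. As a rough illustration (not part of the original example), the following sketch round-trips a SearchTask through pickle; the matmul workload registered here is a hypothetical stand-in that simply gives the task something to describe.

# Hedged sketch: pickling a SearchTask goes through __getstate__/__setstate__,
# and Target.canon_target_and_host normalizes target/target_host on the way.
# The "matmul" workload below is a hypothetical stand-in.
import pickle

from tvm import auto_scheduler, te


@auto_scheduler.register_workload
def matmul(N, M, K):
    A = te.placeholder((N, K), name="A")
    B = te.placeholder((K, M), name="B")
    k = te.reduce_axis((0, K), name="k")
    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
    return [A, B, C]


task = auto_scheduler.SearchTask(func=matmul, args=(128, 128, 128), target="llvm")
restored = pickle.loads(pickle.dumps(task))  # round-trips through the methods above
assert str(restored.target) == str(task.target)
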
Example #2
def get_sample_task(target=tvm.target.cuda(), target_host=None):
    """return a sample task for testing"""
    target, target_host = Target.canon_target_and_host(target, target_host)
    task = autotvm.task.create("testing/conv2d_no_batching",
                               args=(1, 7, 7, 512, 512, 3, 3),
                               target=target)
    return task, target
Example #3
def _build_func_common(measure_input,
                       runtime=None,
                       check_gpu=None,
                       build_option=None):
    """Common part for building a configuration"""
    target, task, config = measure_input
    target, task.target_host = Target.canon_target_and_host(
        target, task.target_host)

    with target:
        s, args = task.instantiate(config)

        # Check template validity and code hash consistency
        if not config.valid():
            raise InstantiationError(config.errors)

        opts = build_option or {}
        if check_gpu:  # Add verify pass to filter out invalid configs in advance.
            opts["tir.add_lower_pass"] = [(2, gpu_verify_pass(**check_gpu))]

        # if target is vta, we need to use vta build
        if (hasattr(measure_input.target, "device_name")
                and measure_input.target.device_name == "vta"):
            # pylint: disable=import-outside-toplevel
            import vta

            func = vta.build(s, args, target_host=task.target_host)
        else:
            with tvm.ir.transform.PassContext(config=opts):
                func = build(s,
                             args,
                             target_host=task.target_host,
                             runtime=runtime)
    return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
Example #4
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    parser.add_argument("--target",
                        type=str,
                        default="llvm",
                        help="The build target")
    parser.add_argument("--target-host",
                        type=str,
                        default=None,
                        help="The host code compilation target")
    parser.add_argument("--rpc-host",
                        type=str,
                        default="127.0.0.1",
                        help="the hostname of the server")
    parser.add_argument("--rpc-port",
                        type=int,
                        default=9090,
                        help="The port of the RPC")

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    args.target, args.target_host = Target.canon_target_and_host(
        args.target, args.target_host)
    measure_peak_all(args.target, args.target_host, args.rpc_host,
                     args.rpc_port)
Example #5
def test_canon_target_and_host_1():
    target = None
    host = "llvm"
    with pytest.raises(
            AssertionError,
            match=r"Target host is not empty when target is empty."):
        target, host = Target.canon_target_and_host(target, host)
Example #6
    def __init__(
        self,
        func=None,
        args=None,
        compute_dag=None,
        workload_key=None,
        target=None,
        target_host=None,
        hardware_params=None,
        layout_rewrite_option=None,
        task_inputs=None,
        task_inputs_overwrite=False,
        task_inputs_save_to_file=False,
        desc="",
    ):
        assert (
            func is not None or workload_key is not None
        ), "Either a workload generation function or a workload key should be provided"

        if func is not None:
            workload_key = make_workload_key(func, args)
        if compute_dag is None:
            compute_dag = ComputeDAG(workload_key)

        assert target is not None, "Must specify a target."

        target, target_host = Target.canon_target_and_host(target, target_host)

        if layout_rewrite_option is None:
            layout_rewrite_option = LayoutRewriteOption.get_target_default(
                target)

        task_input_names = []
        if isinstance(task_inputs, list):
            task_input_names = task_inputs
        elif isinstance(task_inputs, dict):
            for input_name in task_inputs:
                register_task_input_buffer(
                    workload_key,
                    input_name,
                    task_inputs[input_name],
                    task_inputs_overwrite,
                    task_inputs_save_to_file,
                )
                task_input_names.append(input_name)
        elif task_inputs is not None:
            raise ValueError("task_inputs should be a dict or a list.")

        self.__init_handle_by_constructor__(
            _ffi_api.SearchTask,
            compute_dag,
            workload_key,
            target,
            target_host,
            hardware_params,
            layout_rewrite_option,
            task_input_names,
            desc,
        )
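
Once a SearchTask has been constructed as above, the usual next steps are tune() and apply_best(). A minimal sketch, assuming `task` was built as in the sketch after Example #1 (or with any registered workload); the log file name is hypothetical.

# Hedged sketch of the typical follow-up; "matmul.json" is a hypothetical log file.
import tvm
from tvm import auto_scheduler

log_file = "matmul.json"
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=10,  # deliberately tiny, illustration only
    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    verbose=0,
)
task.tune(tune_option)                 # run the search and append records to the log
sch, args = task.apply_best(log_file)  # load the best schedule found so far
print(tvm.lower(sch, args, simple_mode=True))
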
Example #7
def test_canon_target_and_host_3():
    target = Target(target="cuda", host="llvm")
    host = None
    target, host = Target.canon_target_and_host(target, host)
    assert target.kind.name == "cuda"
    assert target.host.kind.name == "llvm"
    assert host.kind.name == "llvm"
    assert target.host == host
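
Together with the related tests in Examples #5, #18 and #19, this pins down the canonicalization contract. A compact sketch of the same behavior outside pytest, mirroring only what the tests assert:

# Hedged summary of the behavior asserted by the canon_target_and_host tests.
from tvm.target import Target

# Both empty: nothing to canonicalize.
target, host = Target.canon_target_and_host(None, None)
assert target is None and host is None

# Separate target and host: the host gets attached to the returned target.
target, host = Target.canon_target_and_host(Target("cuda"), Target("llvm"))
assert target.kind.name == "cuda" and target.host.kind.name == "llvm"

# Host embedded in the target: it is also returned as the second value.
target, host = Target.canon_target_and_host(Target(target="cuda", host="llvm"), None)
assert target.host == host
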
Example #8
    def __setstate__(self, state):
        import cloudpickle  # pylint: disable=import-outside-toplevel

        self.name = state["name"]
        self.args = state["args"]
        self.kwargs = state["kwargs"]
        self.config_space = state["config_space"]
        self.func = cloudpickle.loads(state["func"])
        self.flop = state["flop"]
        self.target, self.target_host = Target.canon_target_and_host(
            state["target"], state["target_host"])
Example #9
def _build_func_common(measure_input,
                       runtime=None,
                       check_gpu=None,
                       build_option=None):
    """Common part for building a configuration"""
    target, task, config = measure_input
    target, task.target_host = Target.canon_target_and_host(
        target, task.target_host)

    with target:
        s, args = task.instantiate(config)

        # Check template validity and code hash consistency
        if not config.valid():
            raise InstantiationError(config.errors)

        # if target is vta, we need to use vta build
        if (hasattr(measure_input.target, "device_name")
                and measure_input.target.device_name == "vta"):
            # pylint: disable=import-outside-toplevel
            import vta

            func = vta.build(s, args, target_host=task.target_host)
        else:
            current_pass_context: tvm.ir.transform.PassContext = (
                tvm.ir.transform.PassContext.current())
            current_config = dict(current_pass_context.config)
            if build_option is not None:
                current_config.update(build_option)

            if "tir.add_lower_pass" in current_config:
                current_add_lower_pass = list(
                    current_config["tir.add_lower_pass"])
            else:
                current_add_lower_pass = []
            if check_gpu:
                current_add_lower_pass.append(
                    (2, gpu_verify_pass(**check_gpu)))
            current_config["tir.add_lower_pass"] = current_add_lower_pass

            with tvm.ir.transform.PassContext(
                    opt_level=current_pass_context.opt_level,
                    required_pass=current_pass_context.required_pass,
                    disabled_pass=current_pass_context.disabled_pass,
                    instruments=current_pass_context.instruments,
                    config=current_config,
            ):
                func = build(s,
                             args,
                             target_host=task.target_host,
                             runtime=runtime)
    return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
Example #10
    def __getstate__(self):
        self.target, self.target_host = Target.canon_target_and_host(
            self.target, self.target_host)
        return {
            "compute_dag": self.compute_dag,
            "workload_key": self.workload_key,
            "target": self.target,
            "target_host": self.target_host,
            "hardware_params": self.hardware_params,
            "layout_rewrite_option": self.layout_rewrite_option,
            "task_input_names": self.task_input_names,
            "desc": self.desc,
        }
Example #11
def autoscheduler_get_tuning_tasks(
    mod: tvm.IRModule,
    params: Dict[str, tvm.nd.NDArray],
    target: str,
    target_host: Optional[str] = None,
    alter_layout: Optional[str] = None,
    hardware_params: Optional[HardwareParams] = None,
    include_simple_tasks: bool = False,
):
    """Get the autoscheduler tuning tasks for a given relay module.

    Parameters
    ----------
    mod : tvm.IRModule
        The relay module from which to extract tuning tasks.
    params : dict
        The params for the relay module.
    target : str
        The compilation target.
    target_host : str, optional
        The compilation target for the host.
    alter_layout : str, optional
        The layout to convert the graph to. Note, the convert layout
        pass doesn't currently guarantee the whole of the graph will
        be converted to the chosen layout.
    hardware_params : Optional[HardwareParams]
        Hardware parameters used for the search tasks

    Returns
    -------
    tasks : list of auto_scheduler.SearchTask
        The list of tasks to be tuned.
    weights : List[int]
        The weight (i.e. the number of appearances) of the extracted tasks.
    """
    target, target_host = Target.canon_target_and_host(target, target_host)

    if alter_layout:
        mod = convert_graph_layout(mod, alter_layout)

    # Extract the tasks
    tasks, task_weights = auto_scheduler.extract_tasks(
        mod["main"],
        params,
        target=target,
        hardware_params=hardware_params,
        include_simple_tasks=include_simple_tasks,
    )

    return tasks, task_weights
Example #12
    def __getstate__(self):
        # custom pickle implementation is required for
        # some unpicklable local task functions.
        # So we only pickle the name of the function
        # and restore the function by name when unpickling it.
        import cloudpickle  # pylint: disable=import-outside-toplevel

        self.target, self.target_host = Target.canon_target_and_host(
            self.target, self.target_host)
        return {
            "name": self.name,
            "args": self.args,
            "kwargs": self.kwargs,
            "config_space": self.config_space,
            "flop": self.flop,
            "target": self.target,
            "target_host": self.target_host,
            "func": cloudpickle.dumps(self.func),
        }
Example #13
File: measure.py Project: were/tvm
def _local_build_worker(inp_serialized, build_func, verbose):
    tic = time.time()
    inp = MeasureInput.deserialize(inp_serialized)
    task = inp.task
    task.target, task.target_host = Target.canon_target_and_host(
        task.target, task.target_host)

    error_no = MeasureErrorNo.NO_ERROR
    error_msg = None
    args = []

    try:
        sch, args = task.compute_dag.apply_steps_from_state(
            inp.state, layout_rewrite=task.layout_rewrite_option)
    # pylint: disable=broad-except
    except Exception:
        error_no = MeasureErrorNo.INSTANTIATION_ERROR
        error_msg = make_traceback_info()

    if error_no == 0:
        dirname = tempfile.mkdtemp()
        filename = os.path.join(dirname,
                                "tmp_func." + build_func.output_format)

        try:
            with transform.PassContext():
                func = build_module.build(sch, args, target=task.target)
            func.export_library(filename, build_func)
        # pylint: disable=broad-except
        except Exception:
            error_no = MeasureErrorNo.COMPILE_HOST
            error_msg = make_traceback_info()
    else:
        filename = ""

    if verbose >= 1:
        if error_no == MeasureErrorNo.NO_ERROR:
            print(".", end="", flush=True)
        else:
            print(".E", end="", flush=True)  # Build error

    return filename, args, error_no, error_msg, time.time() - tic
Example #14
def autotvm_get_tuning_tasks(
    mod: tvm.IRModule,
    params: Dict[str, tvm.nd.NDArray],
    target: str,
    target_host: Optional[str] = None,
    alter_layout: Optional[str] = None,
):
    """Get the autotvm tuning tasks for a given relay module.

    Parameters
    ----------
    mod : tvm.IRModule
        The relay module from which to extract tuning tasks.
    params : dict
        The params for the relay module.
    target : str
        The compilation target.
    target_host : str, optional
        The compilation target for the host.
    alter_layout : str, optional
        The layout to convert the graph to. Note, the convert layout
        pass doesn't currently guarantee the whole of the graph will
        be converted to the chosen layout.

    Returns
    -------
    tasks : list of autotvm.Tasks
        list of tasks to be tuned
    """
    target, target_host = Target.canon_target_and_host(target, target_host)

    if alter_layout:
        mod = convert_graph_layout(mod, alter_layout)

    tasks = autotvm.task.extract_from_program(
        mod["main"],
        target=target,
        params=params,
    )

    return tasks
Example #15
File: measure.py Project: were/tvm
def recover_measure_input(inp, rebuild_state=False):
    """
    Recover a deserialized MeasureInput by rebuilding the missing fields.
    1. Rebuild the compute_dag in inp.task
    2. (Optional) Rebuild the stages in inp.state

    Parameters
    ----------
    inp: MeasureInput
        The deserialized MeasureInput
    rebuild_state: bool = False
        Whether to rebuild the stages in MeasureInput.State

    Returns
    -------
    new_input: MeasureInput
        The fully recovered MeasureInput with all fields rebuilt.
    """
    # pylint: disable=import-outside-toplevel
    from .search_task import SearchTask  # lazily import to avoid recursive dependency

    task = inp.task
    task.target, task.target_host = Target.canon_target_and_host(
        task.target, task.target_host)
    new_task = SearchTask(
        workload_key=task.workload_key,
        target=task.target,
        hardware_params=task.hardware_params,
        layout_rewrite_option=task.layout_rewrite_option,
        task_inputs=list(task.task_input_names),
    )

    if rebuild_state:
        new_state = new_task.compute_dag.infer_bound_from_state(inp.state)
    else:
        new_state = inp.state

    return MeasureInput(new_task, new_state)
Example #16
def create(task_name, args, target, target_host=None):
    """Create a tuning task and initialize its search space

    Parameters
    ----------
    task_name : str
        The AutoTVM task name
    args : List
        Positional arguments
    target : Target
        The compilation target
    target_host: Target, optional
        The compilation target for host side

    Returns
    -------
    tsk: Task
        a task object
    """
    args = serialize_args(args)
    ret = Task(task_name, args)

    target, target_host = Target.canon_target_and_host(target, target_host)

    # init config space
    ret.config_space = ConfigSpace()

    ctx = ApplyConfig(ret.config_space)
    with ctx:
        with target:
            sch, _ = ret.func(*args)
            ret.config_space.code_hash = getattr(sch, "code_hash", None)

    ret.flop = ret.config_space.flop or compute_flop(sch)
    ret.target = target
    ret.target_host = target_host

    return ret
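
A short usage sketch for create(), mirroring the task name and arguments from Example #2; it assumes the "testing/conv2d_no_batching" template is registered in the current session, and the log file name is hypothetical.

# Hedged sketch: create an AutoTVM task and run a tiny tuning session on it.
from tvm import autotvm

task = autotvm.task.create(
    "testing/conv2d_no_batching",
    args=(1, 7, 7, 512, 512, 3, 3),
    target="cuda",
)
print(task.config_space)  # the ConfigSpace initialized by create()

measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(number=5, timeout=10),
)
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(
    n_trial=8,  # deliberately tiny, illustration only
    measure_option=measure_option,
    callbacks=[autotvm.callback.log_to_file("conv2d.log")],  # hypothetical file name
)
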
Example #17
def compile_model(
    tvmc_model: TVMCModel,
    target: str,
    opt_level: int = 3,
    executor: Optional[Executor] = Executor("graph"),
    runtime: Optional[Runtime] = Runtime("cpp"),
    tuning_records: Optional[str] = None,
    package_path: Optional[str] = None,
    cross: Optional[Union[str, Callable]] = None,
    cross_options: Optional[str] = None,
    output_format: str = "so",
    dump_code: Optional[List[str]] = None,
    target_host: Optional[str] = None,
    desired_layout: Optional[str] = None,
    disabled_pass: Optional[str] = None,
    pass_context_configs: Optional[List[str]] = None,
    additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None,
    use_vm: bool = False,
    mod_name: Optional[str] = "default",
):
    """Compile a model from a supported framework into a TVM module.

    This function takes a union of the arguments of both frontends.load_model
    and compiler.compile_relay. The resulting TVM module can be executed using
    the graph executor.

    Parameters
    ----------
    tvmc_model : TVMCModel
        The model object that should be compiled.
    target : str
        The target for which to compile. Can be a plain string or
        a path.
    opt_level : int
        The option that controls various sorts of optimizations.
    tuning_records : str
        A path to tuning records produced using tvmc.tune. When provided,
        compilation will use more optimized kernels leading to better results.
    package_path : str, optional
        The path to export the compiled model to. If not provided it will
        be saved in a temporary directory.
    cross : str or callable object, optional
        Function that performs the actual compilation
    cross_options : str, optional
        Command line options to be passed to the cross compiler.
    output_format : str
        What format to use when saving the function library. Must be one of "so" or "tar".
        When compiling for a remote device without a cross compiler, "tar" will likely work better.
    dump_code : list, optional
        Dump the generated code for the specified source types, on
        the requested target.
    target_host : str, optional
        The target of the host machine if host-side code
        needs to be generated.
    desired_layout: str, optional
        The layout to convert the graph to. Note, the convert layout
        pass doesn't currently guarantee the whole of the graph will
        be converted to the chosen layout.
    disabled_pass: str, optional
        Comma-separated list of passes which need to be disabled
        during compilation
    pass_context_configs: list[str], optional
        List of strings containing a set of configurations to be passed to the
        PassContext.
    additional_target_options: Optional[Dict[str, Dict[str, Any]]]
        Additional target options in a dictionary to combine with initial Target arguments
    use_vm: bool
        Whether to use the VM to compile the model as opposed to the graph executor
    mod_name: str, optional
        The module name

    Returns
    -------
    compiled_model : TVMCPackage
        The compiled TVMCModel ready to be run.

    """
    mod, params = tvmc_model.mod, tvmc_model.params

    config = parse_configs(pass_context_configs)

    if desired_layout:
        mod = convert_graph_layout(mod, desired_layout)

    tvm_target, extra_targets = target_from_cli(target, additional_target_options)
    tvm_target, target_host = Target.canon_target_and_host(tvm_target, target_host)

    for codegen_from_cli in extra_targets:
        codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"])
        partition_function = codegen["pass_pipeline"]

        if codegen["config_key"] is not None:
            config[codegen["config_key"]] = codegen_from_cli["opts"]
        with tvm.transform.PassContext(config=config):
            mod = partition_function(mod, params, mod_name=mod_name, **codegen_from_cli["opts"])

    if tuning_records and os.path.exists(tuning_records):
        logger.debug("tuning records file provided: %s", tuning_records)

        use_autoscheduler = True
        try:
            auto_scheduler.load_records(tuning_records)
        except tvm._ffi.base.TVMError:
            use_autoscheduler = False

        if use_autoscheduler:
            with auto_scheduler.ApplyHistoryBest(tuning_records):
                config["relay.backend.use_auto_scheduler"] = True
                with tvm.transform.PassContext(
                    opt_level=opt_level, config=config, disabled_pass=disabled_pass
                ):
                    logger.debug("building relay graph with autoscheduler")
                    graph_module = build(
                        mod,
                        tvm_target=tvm_target,
                        executor=executor,
                        runtime=runtime,
                        params=params,
                        use_vm=use_vm,
                        mod_name=mod_name,
                    )
        else:
            with autotvm.apply_history_best(tuning_records):
                with tvm.transform.PassContext(
                    opt_level=opt_level, config=config, disabled_pass=disabled_pass
                ):
                    logger.debug("building relay graph with tuning records")
                    graph_module = build(
                        mod,
                        tvm_target=tvm_target,
                        executor=executor,
                        runtime=runtime,
                        params=params,
                        use_vm=use_vm,
                        mod_name=mod_name,
                    )
    else:
        with tvm.transform.PassContext(
            opt_level=opt_level, config=config, disabled_pass=disabled_pass
        ):
            logger.debug("building relay graph (no tuning records provided)")
            graph_module = build(
                mod,
                tvm_target=tvm_target,
                executor=executor,
                runtime=runtime,
                params=params,
                use_vm=use_vm,
                mod_name=mod_name,
            )

    # Generate output dump files with sources
    if dump_code is None:
        dump_code = []
    if not isinstance(dump_code, list):
        dump_code = [dump_code]
    dumps = {}
    for source_type in dump_code:
        if use_vm:
            lib = graph_module.lib
        else:
            lib = graph_module.get_lib()
        # TODO lib.get_source call has inconsistent behavior for unsupported
        #      formats (@leandron).
        source = str(mod) if source_type == "relay" else lib.get_source(source_type)
        dumps[source_type] = source

    # Create a new tvmc model package object from the graph definition.
    package_path = tvmc_model.export_package(
        graph_module, package_path, cross, cross_options, output_format
    )

    # Write dumps to file.
    if dumps:
        save_dumps(package_path, dumps)

    return TVMCPackage(package_path)
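
compile_model is typically reached through tvmc.compile in the Python driver API. A minimal sketch with hypothetical file paths:

# Hedged sketch: driving compile_model through the tvmc Python API.
# "sample.onnx" and "module.tar" are hypothetical paths.
from tvm.driver import tvmc

tvmc_model = tvmc.load("sample.onnx")  # frontends.load_model under the hood
package = tvmc.compile(
    tvmc_model,
    target="llvm",
    package_path="module.tar",
    output_format="tar",
)
result = tvmc.run(package, device="cpu")  # execute with the graph executor
print(result)
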
Example #18
def test_canon_target_and_host_2():
    target = Target("cuda")
    host = Target("llvm")
    target, host = Target.canon_target_and_host(target, host)
    assert target.kind.name == "cuda"
    assert target.host.kind.name == "llvm"
Example #19
def test_canon_target_and_host_0():
    target = None
    host = None
    target, host = Target.canon_target_and_host(target, host)
    assert target is None
    assert host is None
Example #20
def extract_tasks(
    mod,
    params,
    target,
    target_host=None,
    hardware_params=None,
    include_simple_tasks=False,
    dump_workload_to_dag_log=None,
    opt_level=3,
):
    """Extract tuning tasks from a relay program.

    Parameters
    ----------
    mod: tvm.IRModule or relay.function.Function
        The module or function to tune
    params: dict of str to numpy array
        The associated parameters of the program
    target: Union[tvm.target.Target, str]
        The compilation target
    target_host: Optional[Union[tvm.target.Target, str]]
        The host compilation target
    hardware_params : Optional[HardwareParams]
        Hardware parameters used for the search tasks
    include_simple_tasks: bool
        Whether to extract simple tasks that do not include complicated ops.
    dump_workload_to_dag_log: Optional[str]
        A file to dump an association between the workload keys and the actual DAG
    opt_level : Optional[int]
        The optimization level of the task extractions.

    Returns
    -------
    tasks: List[SearchTask]
        The tasks in this network
    weights: List[int]
        The weight (i.e. the number of appearances) of the extracted tasks
    """
    # pylint: disable=import-outside-toplevel
    target, target_host = Target.canon_target_and_host(target, target_host)

    # Run the compiler to collect all TOPI calls during compilation.
    env = TracingEnvironment(
        TracingMode.EXTRACT_TASK if include_simple_tasks else TracingMode.EXTRACT_COMPLEX_TASK_ONLY
    )

    dispatch_ctx = DispatchContext.current
    old_verbose = dispatch_ctx.verbose
    dispatch_ctx.verbose = 0

    errors = []
    with env:
        # Wrap build call in a new thread to avoid the conflict
        # between python's multiprocessing and tvm's thread pool
        build_thread = threading.Thread(
            target=call_all_topi_funcs, args=(mod, params, target, errors, opt_level)
        )
        build_thread.start()
        build_thread.join()

    if errors:
        error_strings = ["Task extraction had the following errors:"] + errors
        raise TVMError("\n".join(error_strings))

    dispatch_ctx.verbose = old_verbose

    # create search tasks
    tasks = []
    weights = []
    for wkl_key, (weight, func_names) in env.wkl_key_to_weight.items():
        tasks.append(
            SearchTask(
                workload_key=wkl_key,
                target=target,
                hardware_params=hardware_params,
                # When the auto-scheduler is used on an end-to-end network, try to apply
                # layout rewrite to improve the overall performance
                layout_rewrite_option=LayoutRewriteOption.get_target_default(target, True),
                task_inputs=(
                    env.wkl_key_to_input_names[wkl_key]
                    if wkl_key in env.wkl_key_to_input_names
                    else None
                ),
                task_inputs_save_to_file=True,
                desc=",".join(func_names),
            )
        )
        weights.append(int(weight))

    if dump_workload_to_dag_log is not None:
        with open(dump_workload_to_dag_log, "w") as f:
            json.dump({task.workload_key: str(task.compute_dag) for task in tasks}, f)

    return tasks, weights
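
A minimal end-to-end sketch of extract_tasks, using the small MLP from tvm.relay.testing purely as a stand-in model; the log file name and trial budget are illustrative only.

# Hedged sketch: extract tasks from a Relay model and tune them jointly.
import tvm
from tvm import auto_scheduler
from tvm.relay import testing

mod, params = testing.mlp.get_workload(batch_size=1)
target = tvm.target.Target("llvm")

tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
for idx, task in enumerate(tasks):
    print("Task %d, workload key %s" % (idx, task.workload_key))

# Extracted tasks are usually handed to a TaskScheduler for joint tuning.
task_scheduler = auto_scheduler.TaskScheduler(tasks, task_weights)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=2 * len(tasks),  # deliberately tiny, illustration only
    measure_callbacks=[auto_scheduler.RecordToFile("mlp_tuning.json")],  # hypothetical
    verbose=0,
)
task_scheduler.tune(tune_option)
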
Example #21
def tune_model(
    tvmc_model: TVMCModel,
    target: str,
    tuning_records: Optional[str] = None,
    prior_records: Optional[str] = None,
    enable_autoscheduler: bool = False,
    rpc_key: Optional[str] = None,
    hostname: Optional[str] = None,
    port: Optional[Union[int, str]] = 9090,
    trials: int = 10000,
    target_host: Optional[str] = None,
    tuner: str = "xgb",
    min_repeat_ms: Optional[int] = None,
    early_stopping: Optional[int] = None,
    desired_layout: Optional[str] = None,
    timeout: int = 10,
    repeat: int = 1,
    number: int = 10,
    parallel: int = 4,
    hardware_params: Optional[HardwareParams] = None,
    include_simple_tasks: bool = False,
    log_estimated_latency: bool = False,
    additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None,
):
    """Use tuning to automatically optimize the functions in a model.

    Parameters
    ----------
    tvmc_model : TVMCModel
        The model to be optimized.
    target : str
        Compilation target as plain string, inline JSON or path to a JSON file.
    tuning_records: str, optional
        The path to a file that tuning results will be saved to. If not specified,
        a temporary file will be used.
    prior_records: str, optional
        A path to previous tuning results that will be used to hot-start the tuning
        cost model if provided.
    enable_autoscheduler : bool, optional
        When true, use autoscheduling rather than autotvm. This should produce
        faster kernels for compatible model-target pairs.
    rpc_key : str, optional
        The RPC tracker key of the target device. Required when tuning remotely via an RPC tracker.
    hostname : str, optional
        The IP address of an RPC tracker, used when benchmarking remotely.
    port : int or str, optional
        The port of the RPC tracker to connect to. Defaults to 9090.
    trials : int, optional
        The number of schedules to try out for the entire model. Note that the default
        value is chosen as a decent average for most models, but larger models may need
        more trials to reach a good result while smaller models will converge with fewer
        trials.
    tuner : str, optional
        The type of tuner to use when tuning with autotvm. Can be one of
        "ga", "gridsearch", "random", "xgb", "xgb_knob", and "xgb-rank".
    min_repeat_ms : int, optional
        Minimum time to run each trial. Defaults to 0 on x86 and 1000 on other targets.
    early_stopping : int, optional
        When specified, stop tuning after this number of trials if results aren't improving.
    desired_layout : str, optional
        Can be one of "NCHW" or "NHWC". When specified, compatible operations in the graph
        will have their layout set to this format. Tasks will then be tuned using this
        specified layout.
    timeout : int, optional
        If a kernel trial lasts longer than this duration in seconds, it will be
        considered a failure.
    repeat : int, optional
        How many times each measurement should be repeated.
    number : int, optional
        The number of runs a single repeat is made of.
    parallel : int, optional
        The maximum number of parallel devices to use when tuning.
    hardware_params : auto_scheduler.HardwareParams, optional
        When using the autoscheduler, this object defines the configuration of the target hardware.
    include_simple_tasks : bool, optional
        Whether to extract simple operations or only computationally intensive ones when using
        the autoscheduler.
    log_estimated_latency : bool, optional
        If using the autoscheduler, write the estimated latency at each step of tuning to file.
    additional_target_options: Optional[Dict[str, Dict[str, Any]]]
        Additional target options in a dictionary to combine with initial Target arguments

    Returns
    -------
    tuning_records : str
        The path to the produced tuning log file.
    """
    target, extra_targets = target_from_cli(target, additional_target_options)
    target, target_host = Target.canon_target_and_host(target, target_host)
    # TODO(jwfromm) Remove this deepcopy once AlterOpLayout bug that mutates source
    # model is fixed. For now, creating a clone avoids the issue.
    mod = deepcopy(tvmc_model.mod)
    params = tvmc_model.params
    if tuning_records is None:
        tuning_records = tvmc_model.default_tuning_records_path()

    for codegen_from_cli in extra_targets:
        codegen = composite_target.get_codegen_by_target(
            codegen_from_cli["name"])
        partition_function = codegen["pass_pipeline"]
        mod = partition_function(mod, params, **codegen_from_cli["opts"])

    # min_repeat_ms should be:
    # a. the value provided by the user, if any, or
    # b. 0ms in case target is "cpu"; otherwise 1000ms
    if min_repeat_ms is None:
        min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000
        logger.info("Default --min-repeat-ms for this target is %s",
                    min_repeat_ms)

    if rpc_key:
        if hostname is None or port is None:
            raise TVMCException(
                "You must provide a hostname and port to connect to a remote RPC device."
            )
        if isinstance(port, str):
            port = int(port)

        logger.info("Tuning will be performed on device %s at %s:%d.", rpc_key,
                    hostname, port)

        runner_ctor = auto_scheduler.RPCRunner if enable_autoscheduler else autotvm.RPCRunner
        runner = runner_ctor(
            key=rpc_key,
            host=hostname,
            port=port,
            number=number,
            repeat=repeat,
            n_parallel=parallel,
            timeout=timeout,
            min_repeat_ms=min_repeat_ms,
        )
    else:
        logger.info("Starting localhost tuning.")
        runner_ctor = (auto_scheduler.LocalRPCMeasureContext
                       if enable_autoscheduler else autotvm.LocalRunner)
        local_server = runner_ctor(
            number=number,
            repeat=repeat,
            timeout=timeout,
            min_repeat_ms=min_repeat_ms,
        )

        # For autoscheduling on some devices, we need to maintain a LocalRPCMeasureContext object.
        if enable_autoscheduler:
            runner = local_server.runner
        else:
            runner = local_server

    if enable_autoscheduler:

        tasks, weights = autoscheduler_get_tuning_tasks(
            mod=mod,
            params=params,
            target=target,
            alter_layout=desired_layout,
            hardware_params=hardware_params,
            include_simple_tasks=include_simple_tasks,
        )

        # Create the autoscheduler tuning options
        tuning_options = auto_scheduler.TuningOptions(
            num_measure_trials=trials,
            measure_callbacks=[auto_scheduler.RecordToFile(tuning_records)],
            runner=runner,
            early_stopping=early_stopping,
        )

        logger.info("Autoscheduling with configuration: %s", tuning_options)

        # Schedule the tasks (i.e., produce a schedule for each task)
        schedule_tasks(tasks, weights, tuning_options, prior_records,
                       log_estimated_latency)
    else:
        tasks = autotvm_get_tuning_tasks(
            mod=mod,
            params=params,
            target=target,
            alter_layout=desired_layout,
        )

        # In autotvm, trials is specified per task. We can convert the per-model input
        # provided to per-task trials by dividing by the number of tasks.
        trials = int(trials / max(len(tasks), 1))
        logger.info("Autotuning with %d trials per task.", trials)

        tuning_options = {
            "tuner": tuner,
            "trials": trials,
            "early_stopping": early_stopping,
            "measure_option": autotvm.measure_option(
                builder=autotvm.LocalBuilder(build_func="default"),
                runner=runner),
            "tuning_records": prior_records,
        }
        logger.info("Autotuning with configuration: %s", tuning_options)

        tune_tasks(tasks, tuning_records, **tuning_options)

    return tuning_records
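
tune_model is likewise reachable from Python as tvmc.tune. A brief sketch that feeds the produced records back into compilation, with hypothetical paths and a deliberately small trial budget:

# Hedged sketch: tune with tvmc, then reuse the tuning records when compiling.
from tvm.driver import tvmc

tvmc_model = tvmc.load("sample.onnx")  # hypothetical model path
records = tvmc.tune(
    tvmc_model,
    target="llvm",
    trials=64,                   # deliberately tiny, illustration only
    enable_autoscheduler=False,  # AutoTVM; set True to use the auto-scheduler
)
package = tvmc.compile(tvmc_model, target="llvm", tuning_records=records)
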
Example #22
    def benchmark_layout_transform(
        self,
        min_exec_num=100,
        timeout=10,
        use_rpc=False,
        device_key=None,
        host="127.0.0.1",
        port=9190,
        n_parallel=1,
        build_func="default",
        layout_records=None,
        target_host=None,
        infer_layout=False,
        runner=None,
    ):
        """Benchmark all possible layout transformation in the graph,
        given a set of schedule candidates for each workload of target operator.

        Parameters
        ----------
        min_exec_num : int, optional
            Minimum number of executions. The final execution time is the
            average over all executions.

        timeout : int, optional
            Timeout for each execution.

        use_rpc : boolean, optional
            Whether to use rpc mode for benchmarking.

        device_key : str, optional
            Remote device key which can be queried by
            python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190

        host : str, optional
            IP address used to create RPC tracker on host machine.

        port : int, optional
            Port number used to create RPC tracker on host machine.

        n_parallel: int, optional
            The number of measurement tasks that can run in parallel.
            Set this according to the number of CPU cores (for compilation) and
            the number of devices you have (for measuring the generated code).

        build_func: str or callable, optional
            'default': call default builder. This works for normal target (llvm, cuda)

            'ndk': use Android NDK to create shared library. Use this for android target.

            callable: customized build function for other backends (e.g. VTA).
                      See autotvm/measure/measure_methods.py::default_build_func for example.

        layout_records : str or iterator of (MeasureInput, MeasureResult), optional
            Collection of layout_transform benchmarking records.
            If it is a str, it should be the filename of a records log file;
            each row of this file is an encoded record pair.
            Otherwise, it is an iterator.

            If this argument is set, the graph tuner will first check whether a
            layout_transform workload already exists in the records and skip
            benchmarking if possible.

        target_host : str or :any:`tvm.target.Target`, optional
            Host compilation target, if the target is a device.
            When TVM compiles a device-specific program such as CUDA,
            we also need host (CPU) side code to interact with the driver
            and set up the dimensions and parameters correctly.
            target_host is used to specify the host-side codegen target.
            By default, llvm is used if it is enabled;
            otherwise a stackvm interpreter is used.

        infer_layout : bool, optional
            Whether to infer layout transformation time if it doesn't exist in records, instead
            of benchmarking on target device.

            This might bring a performance loss compared to benchmarking the layout transformation.
        runner : Runner, optional
            Accept a user-supplied runner
        """
        self._logger.info("Start to benchmark layout transformation...")
        self._target, target_host = Target.canon_target_and_host(self._target, target_host)

        if layout_records is None and infer_layout:
            raise RuntimeError("Requires some records to infer layout transformation time.")

        if isinstance(layout_records, str):
            layout_records = load_from_file(layout_records)
            if not layout_records and infer_layout:
                raise RuntimeError("Records must be non-empty to infer layout transformation time.")

        num_flops, total_time = 0, 0
        if layout_records is not None:
            for record in layout_records:
                ltf_wkl = record[0].task.workload
                self._layout_transform_perf_records[ltf_wkl] = record
                input_shape = ltf_wkl[1][1]
                flops = np.prod(input_shape)
                num_flops += flops
                total_time += record[1].costs[0]
        avg_time = total_time / num_flops if num_flops > 0 else 0

        args_list = []

        def _fetch_args_callback(from_node_idx, to_node_idx, from_sch_idx, to_sch_idx, args):
            """Callback function to fetch layout transform args"""
            _, in_layout, out_layout = args
            if in_layout != out_layout:
                args_list.append(args)

        self._iterate_layout_transform(_fetch_args_callback)

        def _log_to_list(record_list):
            """Callback to log result to a list."""

            def _callback(_, inputs, results):
                """Callback implementation"""
                record_list.append((inputs[0], results[0]))

            return _callback

        builder = autotvm.LocalBuilder(n_parallel=n_parallel, build_func=build_func)
        if use_rpc:
            if device_key is None:
                raise RuntimeError("device_key need to be set to use rpc tracker mode.")
            runner = autotvm.measure.RPCRunner(
                device_key,
                host,
                port,
                n_parallel=n_parallel,
                number=min_exec_num,
                repeat=1,
                timeout=timeout,
            )
        elif not runner:
            runner = autotvm.LocalRunner(number=min_exec_num, repeat=1, timeout=timeout)
        measure_option = autotvm.measure_option(builder=builder, runner=runner)
        for args in args_list:
            data, in_layout, out_layout = args
            ltf_workload = autotvm.task.args_to_workload(args, "layout_transform")
            if ltf_workload in self._layout_transform_perf_records:
                continue

            if infer_layout:
                input_shape = ltf_workload[1][1]
                flops = 1
                for i in input_shape:
                    flops *= i

                # Rule out invalid layout transformations
                out = topi.layout_transform(data, in_layout, out_layout)
                out_flops = 1
                for i in topi.utils.get_const_tuple(out.shape):
                    out_flops *= i

                if flops != out_flops:
                    inferred_time = INVALID_LAYOUT_TIME
                else:
                    inferred_time = flops * avg_time

                record_input = MeasureInput(target=self._target, task=None, config=None)
                record_output = MeasureResult(
                    costs=(inferred_time,), error_no=0, all_cost=-1, timestamp=-1
                )
                self._layout_transform_perf_records[ltf_workload] = (record_input, record_output)
                continue

            records = []
            task = autotvm.task.create("layout_transform", args=args, target=self._target)
            tuner = autotvm.tuner.GridSearchTuner(task)
            tuner.tune(n_trial=1, measure_option=measure_option, callbacks=[_log_to_list(records)])
            if not isinstance(records[0][1].costs[0], float):
                records[0] = (records[0][0], records[0][1]._replace(costs=(INVALID_LAYOUT_TIME,)))
            self._layout_transform_perf_records[ltf_workload] = records[0]

        self._iterate_layout_transform(self._create_matrix_callback)
        self._logger.info("Benchmarking layout transformation successful.")
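
benchmark_layout_transform is normally called through a graph tuner. A sketch following the x86 graph-tuning tutorial flow; the AutoTVM kernel records file and the output schedule file are hypothetical, and resnet-18 from tvm.relay.testing stands in for a real model.

# Hedged sketch: using benchmark_layout_transform through the DP graph tuner.
# "kernel_records.log" (AutoTVM kernel tuning results) and "graph_opt.log" are
# hypothetical file names.
from tvm import relay
from tvm.autotvm.graph_tuner import DPTuner
from tvm.relay import testing

mod, params = testing.resnet.get_workload(num_layers=18, batch_size=1)
dshape = (1, 3, 224, 224)
target = "llvm"
target_ops = [relay.op.get("nn.conv2d")]

executor = DPTuner(mod["main"], {"data": dshape}, "kernel_records.log", target_ops, target)
executor.benchmark_layout_transform(min_exec_num=100)  # the method shown above
executor.run()                                         # run the graph-level DP search
executor.write_opt_sch2record_file("graph_opt.log")    # dump graph-optimal schedules
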