def __setstate__(self, state):
    # Register the workload if needed
    try:
        workload = json.loads(state["workload_key"])
    except Exception:  # pylint: disable=broad-except
        raise RuntimeError("Invalid workload key %s" % state["workload_key"])

    # workload[0] is either the compute function name or the ComputeDAG hash.
    # The compute functions are already registered when importing TVM, so here
    # we only register the ComputeDAG workloads. If the same workload has
    # already been registered, the later registration overrides the previous one.
    if workload[0] not in WORKLOAD_FUNC_REGISTRY:
        register_workload_tensors(state["workload_key"], state["compute_dag"].tensors)

    state["target"], state["target_host"] = Target.canon_target_and_host(
        state["target"], state["target_host"]
    )
    self.__init_handle_by_constructor__(
        _ffi_api.SearchTask,
        state["compute_dag"],
        state["workload_key"],
        state["target"],
        state["target"].host,
        state["hardware_params"],
        state["layout_rewrite_option"],
        state["task_input_names"],
        state["desc"],
    )
def get_sample_task(target=tvm.target.cuda(), target_host=None):
    """Return a sample task for testing"""
    target, target_host = Target.canon_target_and_host(target, target_host)
    task = autotvm.task.create(
        "testing/conv2d_no_batching", args=(1, 7, 7, 512, 512, 3, 3), target=target
    )
    return task, target
def _build_func_common(measure_input, runtime=None, check_gpu=None, build_option=None): """Common part for building a configuration""" target, task, config = measure_input target, task.target_host = Target.canon_target_and_host( target, task.target_host) with target: s, args = task.instantiate(config) # check invalidity of template and code hash consistency if not config.valid(): raise InstantiationError(config.errors) opts = build_option or {} if check_gpu: # Add verify pass to filter out invalid configs in advance. opts["tir.add_lower_pass"] = [(2, gpu_verify_pass(**check_gpu))] # if target is vta, we need to use vta build if (hasattr(measure_input.target, "device_name") and measure_input.target.device_name == "vta"): # pylint: disable=import-outside-toplevel import vta func = vta.build(s, args, target_host=task.target_host) else: with tvm.ir.transform.PassContext(config=opts): func = build(s, args, target_host=task.target_host, runtime=runtime) return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
def main():
    """Main function"""
    parser = argparse.ArgumentParser()
    parser.add_argument("--target", type=str, default="llvm", help="The build target")
    parser.add_argument(
        "--target-host", type=str, default=None, help="The host code compilation target"
    )
    parser.add_argument(
        "--rpc-host", type=str, default="127.0.0.1", help="the hostname of the server"
    )
    parser.add_argument("--rpc-port", type=int, default=9090, help="The port of the RPC")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    args.target, args.target_host = Target.canon_target_and_host(args.target, args.target_host)
    measure_peak_all(args.target, args.target_host, args.rpc_host, args.rpc_port)
def test_canon_target_and_host_1():
    target = None
    host = "llvm"
    with pytest.raises(
        AssertionError, match=r"Target host is not empty when target is empty."
    ):
        target, host = Target.canon_target_and_host(target, host)
def __init__(
    self,
    func=None,
    args=None,
    compute_dag=None,
    workload_key=None,
    target=None,
    target_host=None,
    hardware_params=None,
    layout_rewrite_option=None,
    task_inputs=None,
    task_inputs_overwrite=False,
    task_inputs_save_to_file=False,
    desc="",
):
    assert (
        func is not None or workload_key is not None
    ), "Either a workload generation function or a workload key should be provided"

    if func is not None:
        workload_key = make_workload_key(func, args)
    if compute_dag is None:
        compute_dag = ComputeDAG(workload_key)

    assert target is not None, "Must specify a target."

    target, target_host = Target.canon_target_and_host(target, target_host)

    if layout_rewrite_option is None:
        layout_rewrite_option = LayoutRewriteOption.get_target_default(target)

    task_input_names = []
    if isinstance(task_inputs, list):
        task_input_names = task_inputs
    elif isinstance(task_inputs, dict):
        for input_name in task_inputs:
            register_task_input_buffer(
                workload_key,
                input_name,
                task_inputs[input_name],
                task_inputs_overwrite,
                task_inputs_save_to_file,
            )
            task_input_names.append(input_name)
    elif task_inputs is not None:
        raise ValueError("task_inputs should be a dict or a list.")

    self.__init_handle_by_constructor__(
        _ffi_api.SearchTask,
        compute_dag,
        workload_key,
        target,
        target_host,
        hardware_params,
        layout_rewrite_option,
        task_input_names,
        desc,
    )
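# Illustrative usage sketch for the constructor above, following the pattern of
# the auto-scheduler matmul tutorial; the workload function "matmul_add" and its
# argument tuple are placeholders (any function registered with
# @auto_scheduler.register_workload works the same way).
from tvm import te, auto_scheduler


@auto_scheduler.register_workload
def matmul_add(N, L, M, dtype):
    # Describe the computation as TE tensors; the returned list is the workload I/O.
    A = te.placeholder((N, L), name="A", dtype=dtype)
    B = te.placeholder((L, M), name="B", dtype=dtype)
    C = te.placeholder((N, M), name="C", dtype=dtype)
    k = te.reduce_axis((0, L), name="k")
    matmul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="matmul")
    out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out")
    return [A, B, C, out]


# The target string is canonicalized internally via Target.canon_target_and_host.
task = auto_scheduler.SearchTask(
    func=matmul_add, args=(1024, 1024, 1024, "float32"), target="llvm"
)
print(task.compute_dag)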
def test_canon_target_and_host_3():
    target = Target(target="cuda", host="llvm")
    host = None
    target, host = Target.canon_target_and_host(target, host)
    assert target.kind.name == "cuda"
    assert target.host.kind.name == "llvm"
    assert host.kind.name == "llvm"
    assert target.host == host
def __setstate__(self, state):
    import cloudpickle  # pylint: disable=import-outside-toplevel

    self.name = state["name"]
    self.args = state["args"]
    self.kwargs = state["kwargs"]
    self.config_space = state["config_space"]
    self.func = cloudpickle.loads(state["func"])
    self.flop = state["flop"]
    self.target, self.target_host = Target.canon_target_and_host(
        state["target"], state["target_host"]
    )
def _build_func_common(measure_input, runtime=None, check_gpu=None, build_option=None): """Common part for building a configuration""" target, task, config = measure_input target, task.target_host = Target.canon_target_and_host( target, task.target_host) with target: s, args = task.instantiate(config) # check invalidity of template and code hash consistency if not config.valid(): raise InstantiationError(config.errors) # if target is vta, we need to use vta build if (hasattr(measure_input.target, "device_name") and measure_input.target.device_name == "vta"): # pylint: disable=import-outside-toplevel import vta func = vta.build(s, args, target_host=task.target_host) else: current_pass_context: tvm.ir.transform.PassContext = ( tvm.ir.transform.PassContext.current()) current_config = dict(current_pass_context.config) if build_option is not None: current_config.update(build_option) if "tir.add_lower_pass" in current_config: current_add_lower_pass = list( current_config["tir.add_lower_pass"]) else: current_add_lower_pass = [] if check_gpu: current_add_lower_pass.append( (2, gpu_verify_pass(**check_gpu))) current_config["tir.add_lower_pass"] = current_add_lower_pass with tvm.ir.transform.PassContext( opt_level=current_pass_context.opt_level, required_pass=current_pass_context.required_pass, disabled_pass=current_pass_context.disabled_pass, instruments=current_pass_context.instruments, config=current_config, ): func = build(s, args, target_host=task.target_host, runtime=runtime) return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
def __getstate__(self):
    self.target, self.target_host = Target.canon_target_and_host(self.target, self.target_host)
    return {
        "compute_dag": self.compute_dag,
        "workload_key": self.workload_key,
        "target": self.target,
        "target_host": self.target_host,
        "hardware_params": self.hardware_params,
        "layout_rewrite_option": self.layout_rewrite_option,
        "task_input_names": self.task_input_names,
        "desc": self.desc,
    }
def autoscheduler_get_tuning_tasks(
    mod: tvm.IRModule,
    params: Dict[str, tvm.nd.NDArray],
    target: str,
    target_host: Optional[str] = None,
    alter_layout: Optional[str] = None,
    hardware_params: Optional[HardwareParams] = None,
    include_simple_tasks: bool = False,
):
    """Get the autoscheduler tuning tasks for a given relay module.

    Parameters
    ----------
    mod : tvm.IRModule
        The relay module from which to extract tuning tasks.
    params : dict
        The params for the relay module.
    target : tvm.target.Target
        The compilation target.
    target_host : str, optional
        The compilation target for the host.
    alter_layout : str, optional
        The layout to convert the graph to. Note, the convert layout
        pass doesn't currently guarantee the whole of the graph will
        be converted to the chosen layout.
    hardware_params : Optional[HardwareParams]
        Hardware parameters used for the search tasks.

    Returns
    -------
    tasks : list of auto_scheduler.SearchTask
        The list of tasks to be tuned.
    weights : List[int]
        The weight (i.e. the number of appearances) of the extracted tasks.
    """
    target, target_host = Target.canon_target_and_host(target, target_host)

    if alter_layout:
        mod = convert_graph_layout(mod, alter_layout)

    # Extract the tasks
    tasks, task_weights = auto_scheduler.extract_tasks(
        mod["main"],
        params,
        target=target,
        hardware_params=hardware_params,
        include_simple_tasks=include_simple_tasks,
    )

    return tasks, task_weights
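# Illustrative usage sketch for the helper above: extract auto-scheduler tasks
# from a small Relay test model. The model choice (relay.testing.mlp) and the
# plain "llvm" target are arbitrary placeholders.
from tvm.relay import testing

mod, params = testing.mlp.get_workload(batch_size=1)
tasks, weights = autoscheduler_get_tuning_tasks(mod=mod, params=params, target="llvm")
print(f"Extracted {len(tasks)} tasks")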
def __getstate__(self):
    # A custom pickle implementation is required for
    # some unpicklable local task functions.
    # So we only pickle the name of the function
    # and restore the function by name when unpickling it.
    import cloudpickle  # pylint: disable=import-outside-toplevel

    self.target, self.target_host = Target.canon_target_and_host(self.target, self.target_host)
    return {
        "name": self.name,
        "args": self.args,
        "kwargs": self.kwargs,
        "config_space": self.config_space,
        "flop": self.flop,
        "target": self.target,
        "target_host": self.target_host,
        "func": cloudpickle.dumps(self.func),
    }
def _local_build_worker(inp_serialized, build_func, verbose):
    tic = time.time()
    inp = MeasureInput.deserialize(inp_serialized)
    task = inp.task
    task.target, task.target_host = Target.canon_target_and_host(task.target, task.target_host)

    error_no = MeasureErrorNo.NO_ERROR
    error_msg = None
    args = []

    try:
        sch, args = task.compute_dag.apply_steps_from_state(
            inp.state, layout_rewrite=task.layout_rewrite_option
        )
    # pylint: disable=broad-except
    except Exception:
        error_no = MeasureErrorNo.INSTANTIATION_ERROR
        error_msg = make_traceback_info()

    if error_no == 0:
        dirname = tempfile.mkdtemp()
        filename = os.path.join(dirname, "tmp_func." + build_func.output_format)

        try:
            with transform.PassContext():
                func = build_module.build(sch, args, target=task.target)
            func.export_library(filename, build_func)
        # pylint: disable=broad-except
        except Exception:
            error_no = MeasureErrorNo.COMPILE_HOST
            error_msg = make_traceback_info()
    else:
        filename = ""

    if verbose >= 1:
        if error_no == MeasureErrorNo.NO_ERROR:
            print(".", end="", flush=True)
        else:
            print(".E", end="", flush=True)  # Build error

    return filename, args, error_no, error_msg, time.time() - tic
def autotvm_get_tuning_tasks(
    mod: tvm.IRModule,
    params: Dict[str, tvm.nd.NDArray],
    target: str,
    target_host: Optional[str] = None,
    alter_layout: Optional[str] = None,
):
    """Get the autotvm tuning tasks for a given relay module.

    Parameters
    ----------
    mod : tvm.IRModule
        The relay module from which to extract tuning tasks.
    params : dict
        The params for the relay module.
    target : tvm.target.Target
        The compilation target.
    target_host : str, optional
        The compilation target for the host.
    alter_layout : str, optional
        The layout to convert the graph to. Note, the convert layout
        pass doesn't currently guarantee the whole of the graph will
        be converted to the chosen layout.

    Returns
    -------
    tasks : list of autotvm.Tasks
        The list of tasks to be tuned.
    """
    target, target_host = Target.canon_target_and_host(target, target_host)

    if alter_layout:
        mod = convert_graph_layout(mod, alter_layout)

    tasks = autotvm.task.extract_from_program(
        mod["main"],
        target=target,
        params=params,
    )

    return tasks
def recover_measure_input(inp, rebuild_state=False):
    """
    Recover a deserialized MeasureInput by rebuilding the missing fields.
    1. Rebuild the compute_dag in inp.task
    2. (Optional) Rebuild the stages in inp.state

    Parameters
    ----------
    inp: MeasureInput
        The deserialized MeasureInput
    rebuild_state: bool = False
        Whether to rebuild the stages in MeasureInput.State

    Returns
    -------
    new_input: MeasureInput
        The fully recovered MeasureInput with all fields rebuilt.
    """
    # pylint: disable=import-outside-toplevel
    from .search_task import SearchTask  # lazily import to avoid recursive dependency

    task = inp.task
    task.target, task.target_host = Target.canon_target_and_host(task.target, task.target_host)
    new_task = SearchTask(
        workload_key=task.workload_key,
        target=task.target,
        hardware_params=task.hardware_params,
        layout_rewrite_option=task.layout_rewrite_option,
        task_inputs=list(task.task_input_names),
    )

    if rebuild_state:
        new_state = new_task.compute_dag.infer_bound_from_state(inp.state)
    else:
        new_state = inp.state

    return MeasureInput(new_task, new_state)
def create(task_name, args, target, target_host=None):
    """Create a tuning task and initialize its search space

    Parameters
    ----------
    task_name : str
        The AutoTVM task name
    args : List
        Positional arguments
    target : Target
        The compilation target
    target_host : Target, optional
        The compilation target for the host side

    Returns
    -------
    tsk: Task
        a task object
    """
    args = serialize_args(args)
    ret = Task(task_name, args)

    target, target_host = Target.canon_target_and_host(target, target_host)

    # init config space
    ret.config_space = ConfigSpace()

    ctx = ApplyConfig(ret.config_space)
    with ctx:
        with target:
            sch, _ = ret.func(*args)
            ret.config_space.code_hash = getattr(sch, "code_hash", None)

    ret.flop = ret.config_space.flop or compute_flop(sch)
    ret.target = target
    ret.target_host = target_host

    return ret
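# Illustrative usage sketch for create() above. It mirrors get_sample_task()
# earlier in this collection and assumes a template named
# "testing/conv2d_no_batching" has been registered with @autotvm.template.
from tvm import autotvm

task = autotvm.task.create(
    "testing/conv2d_no_batching",
    args=(1, 7, 7, 512, 512, 3, 3),
    target="cuda",
    target_host="llvm",
)
print(task.config_space)  # the initialized search space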
def compile_model(
    tvmc_model: TVMCModel,
    target: str,
    opt_level: int = 3,
    executor: Optional[Executor] = Executor("graph"),
    runtime: Optional[Runtime] = Runtime("cpp"),
    tuning_records: Optional[str] = None,
    package_path: Optional[str] = None,
    cross: Optional[Union[str, Callable]] = None,
    cross_options: Optional[str] = None,
    output_format: str = "so",
    dump_code: Optional[List[str]] = None,
    target_host: Optional[str] = None,
    desired_layout: Optional[str] = None,
    disabled_pass: Optional[str] = None,
    pass_context_configs: Optional[List[str]] = None,
    additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None,
    use_vm: bool = False,
    mod_name: Optional[str] = "default",
):
    """Compile a model from a supported framework into a TVM module.

    This function takes a union of the arguments of both frontends.load_model
    and compiler.compile_relay. The resulting TVM module can be executed using
    the graph executor.

    Parameters
    ----------
    tvmc_model : TVMCModel
        The model object that should be compiled.
    target : str
        The target for which to compile. Can be a plain string or a path.
    opt_level : int
        The option that controls various sorts of optimizations.
    tuning_records : str
        A path to tuning records produced using tvmc.tune. When provided,
        compilation will use more optimized kernels leading to better results.
    package_path : str, optional
        The path to export the compiled model to. If not provided it will
        be saved in a temporary directory.
    cross : str or callable object, optional
        Function that performs the actual compilation.
    cross_options : str, optional
        Command line options to be passed to the cross compiler.
    output_format : str
        What format to use when saving the function library. Must be one of
        "so" or "tar". When compiling for a remote device without a cross
        compiler, "tar" will likely work better.
    dump_code : list, optional
        Dump the generated code for the specified source types, on
        the requested target.
    target_host : str, optional
        The target of the host machine if host-side code needs to be generated.
    desired_layout : str, optional
        The layout to convert the graph to. Note, the convert layout
        pass doesn't currently guarantee the whole of the graph will
        be converted to the chosen layout.
    disabled_pass : str, optional
        Comma-separated list of passes which need to be disabled
        during compilation.
    pass_context_configs : list[str], optional
        List of strings containing a set of configurations to be passed to the
        PassContext.
    additional_target_options : Optional[Dict[str, Dict[str, Any]]]
        Additional target options in a dictionary to combine with initial
        Target arguments.
    use_vm : bool
        Whether to use the VM to compile the model as opposed to the graph executor.
    mod_name : str, optional
        The module name.

    Returns
    -------
    compiled_model : TVMCPackage
        The compiled TVMCModel ready to be run.
""" mod, params = tvmc_model.mod, tvmc_model.params config = parse_configs(pass_context_configs) if desired_layout: mod = convert_graph_layout(mod, desired_layout) tvm_target, extra_targets = target_from_cli(target, additional_target_options) tvm_target, target_host = Target.canon_target_and_host(tvm_target, target_host) for codegen_from_cli in extra_targets: codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"]) partition_function = codegen["pass_pipeline"] if codegen["config_key"] is not None: config[codegen["config_key"]] = codegen_from_cli["opts"] with tvm.transform.PassContext(config=config): mod = partition_function(mod, params, mod_name=mod_name, **codegen_from_cli["opts"]) if tuning_records and os.path.exists(tuning_records): logger.debug("tuning records file provided: %s", tuning_records) use_autoscheduler = True try: auto_scheduler.load_records(tuning_records) except tvm._ffi.base.TVMError: use_autoscheduler = False if use_autoscheduler: with auto_scheduler.ApplyHistoryBest(tuning_records): config["relay.backend.use_auto_scheduler"] = True with tvm.transform.PassContext( opt_level=opt_level, config=config, disabled_pass=disabled_pass ): logger.debug("building relay graph with autoscheduler") graph_module = build( mod, tvm_target=tvm_target, executor=executor, runtime=runtime, params=params, use_vm=use_vm, mod_name=mod_name, ) else: with autotvm.apply_history_best(tuning_records): with tvm.transform.PassContext( opt_level=opt_level, config=config, disabled_pass=disabled_pass ): logger.debug("building relay graph with tuning records") graph_module = build( mod, tvm_target=tvm_target, executor=executor, runtime=runtime, params=params, use_vm=use_vm, mod_name=mod_name, ) else: with tvm.transform.PassContext( opt_level=opt_level, config=config, disabled_pass=disabled_pass ): logger.debug("building relay graph (no tuning records provided)") graph_module = build( mod, tvm_target=tvm_target, executor=executor, runtime=runtime, params=params, use_vm=use_vm, mod_name=mod_name, ) # Generate output dump files with sources if dump_code is None: dump_code = [] if not isinstance(dump_code, list): dump_code = [dump_code] dumps = {} for source_type in dump_code: if use_vm: lib = graph_module.lib else: lib = graph_module.get_lib() # TODO lib.get_source call have inconsistent behavior for unsupported # formats (@leandron). source = str(mod) if source_type == "relay" else lib.get_source(source_type) dumps[source_type] = source # Create a new tvmc model package object from the graph definition. package_path = tvmc_model.export_package( graph_module, package_path, cross, cross_options, output_format ) # Write dumps to file. if dumps: save_dumps(package_path, dumps) return TVMCPackage(package_path)
def test_canon_target_and_host_2():
    target = Target("cuda")
    host = Target("llvm")
    target, host = Target.canon_target_and_host(target, host)
    assert target.kind.name == "cuda"
    assert target.host.kind.name == "llvm"
def test_canon_target_and_host_0():
    target = None
    host = None
    target, host = Target.canon_target_and_host(target, host)
    assert target is None
    assert host is None
def extract_tasks(
    mod,
    params,
    target,
    target_host=None,
    hardware_params=None,
    include_simple_tasks=False,
    dump_workload_to_dag_log=None,
    opt_level=3,
):
    """Extract tuning tasks from a relay program.

    Parameters
    ----------
    mod: tvm.IRModule or relay.function.Function
        The module or function to tune
    params: dict of str to numpy array
        The associated parameters of the program
    target: Union[tvm.target.Target, str]
        The compilation target
    target_host: Optional[Union[tvm.target.Target, str]]
        The host compilation target
    hardware_params : Optional[HardwareParams]
        Hardware parameters used for the search tasks
    include_simple_tasks: bool
        Whether to extract simple tasks that do not include complicated ops.
    dump_workload_to_dag_log: Optional[str]
        A file to dump an association between the workload keys and the actual DAG
    opt_level : Optional[int]
        The optimization level of the task extractions.

    Returns
    -------
    tasks: List[SearchTask]
        The tasks in this network
    weights: List[int]
        The weight (i.e. the number of appearance) of extracted tasks
    """
    # pylint: disable=import-outside-toplevel

    target, target_host = Target.canon_target_and_host(target, target_host)

    # Run the compiler to collect all TOPI calls during compilation.
    env = TracingEnvironment(
        TracingMode.EXTRACT_TASK if include_simple_tasks else TracingMode.EXTRACT_COMPLEX_TASK_ONLY
    )

    dispatch_ctx = DispatchContext.current
    old_verbose = dispatch_ctx.verbose
    dispatch_ctx.verbose = 0

    errors = []
    with env:
        # Wrap build call in a new thread to avoid the conflict
        # between python's multiprocessing and tvm's thread pool
        build_thread = threading.Thread(
            target=call_all_topi_funcs, args=(mod, params, target, errors, opt_level)
        )
        build_thread.start()
        build_thread.join()

    if errors:
        error_strings = ["Task extraction had the following errors:"] + errors
        raise TVMError("\n".join(error_strings))

    dispatch_ctx.verbose = old_verbose

    # create search tasks
    tasks = []
    weights = []
    for wkl_key, (weight, func_names) in env.wkl_key_to_weight.items():
        tasks.append(
            SearchTask(
                workload_key=wkl_key,
                target=target,
                hardware_params=hardware_params,
                # When auto scheduler is used in end to end network, try to apply layout rewrite
                # to improve the overall performance
                layout_rewrite_option=LayoutRewriteOption.get_target_default(target, True),
                task_inputs=(
                    env.wkl_key_to_input_names[wkl_key]
                    if wkl_key in env.wkl_key_to_input_names
                    else None
                ),
                task_inputs_save_to_file=True,
                desc=",".join(func_names),
            )
        )
        weights.append(int(weight))

    if dump_workload_to_dag_log is not None:
        with open(dump_workload_to_dag_log, "w") as f:
            json.dump({task.workload_key: str(task.compute_dag) for task in tasks}, f)

    return tasks, weights
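# Illustrative usage sketch for extract_tasks() above, following the pattern of
# the auto-scheduler network tuning tutorials (the ResNet-18 test model and the
# plain "llvm" target are arbitrary choices):
from tvm import auto_scheduler
from tvm.relay import testing

mod, params = testing.resnet.get_workload(num_layers=18, batch_size=1)
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target="llvm")
for idx, task in enumerate(tasks):
    print(f"Task {idx} (weight {task_weights[idx]}): {task.desc}")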
def tune_model(
    tvmc_model: TVMCModel,
    target: str,
    tuning_records: Optional[str] = None,
    prior_records: Optional[str] = None,
    enable_autoscheduler: bool = False,
    rpc_key: Optional[str] = None,
    hostname: Optional[str] = None,
    port: Optional[Union[int, str]] = 9090,
    trials: int = 10000,
    target_host: Optional[str] = None,
    tuner: str = "xgb",
    min_repeat_ms: Optional[int] = None,
    early_stopping: Optional[int] = None,
    desired_layout: Optional[str] = None,
    timeout: int = 10,
    repeat: int = 1,
    number: int = 10,
    parallel: int = 4,
    hardware_params: Optional[HardwareParams] = None,
    include_simple_tasks: bool = False,
    log_estimated_latency: bool = False,
    additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None,
):
    """Use tuning to automatically optimize the functions in a model.

    Parameters
    ----------
    tvmc_model : TVMCModel
        The model to be optimized.
    target : str
        Compilation target as plain string, inline JSON or path to a JSON file.
    tuning_records : str, optional
        The path to a file that tuning results will be saved to. If not specified,
        a temporary file will be used.
    prior_records : str, optional
        A path to previous tuning results that will be used to hot-start the tuning
        cost model if provided.
    enable_autoscheduler : bool, optional
        When true, use autoscheduling rather than autotvm. This should produce
        faster kernels for compatible model-target pairs.
    rpc_key : str, optional
        The RPC tracker key of the target device. Required when rpc_tracker is provided.
    hostname : str, optional
        The IP address of an RPC tracker, used when benchmarking remotely.
    port : int or str, optional
        The port of the RPC tracker to connect to. Defaults to 9090.
    trials : int, optional
        The number of schedules to try out for the entire model. Note that the default
        value is chosen as a decent average for most models, but larger models may need
        more trials to reach a good result while smaller models will converge with fewer
        trials.
    tuner : str, optional
        The type of tuner to use when tuning with autotvm. Can be one of
        "ga", "gridsearch", "random", "xgb", "xgb_knob", and "xgb-rank".
    min_repeat_ms : int, optional
        Minimum time to run each trial. Defaults to 0 on x86 and 1000 on other targets.
    early_stopping : int, optional
        When specified, stop tuning after this number of trials if results aren't improving.
    desired_layout : str, optional
        Can be one of "NCHW" or "NHWC". When specified, compatible operations in the graph
        will have their layout set to this format. Tasks will then be tuned using this
        specified layout.
    timeout : int, optional
        If a kernel trial lasts longer than this duration in seconds, it will be
        considered a failure.
    repeat : int, optional
        How many times each measurement should be repeated.
    number : int, optional
        The number of runs a single repeat is made of.
    parallel : int, optional
        The maximum number of parallel devices to use when tuning.
    hardware_params : auto_scheduler.HardwareParams, optional
        When using the autoscheduler, this object defines the configuration of the target
        hardware.
    include_simple_tasks : bool, optional
        Whether to extract simple operations or only computationally intensive ones when
        using the autoscheduler.
    log_estimated_latency : bool, optional
        If using the autoscheduler, write the estimated latency at each step of tuning
        to file.
    additional_target_options : Optional[Dict[str, Dict[str, Any]]]
        Additional target options in a dictionary to combine with initial Target arguments.

    Returns
    -------
    tuning_records : str
        The path to the produced tuning log file.
""" target, extra_targets = target_from_cli(target, additional_target_options) target, target_host = Target.canon_target_and_host(target, target_host) # TODO(jwfromm) Remove this deepcopy once AlterOpLayout bug that mutates source # model is fixed. For now, creating a clone avoids the issue. mod = deepcopy(tvmc_model.mod) params = tvmc_model.params if tuning_records is None: tuning_records = tvmc_model.default_tuning_records_path() for codegen_from_cli in extra_targets: codegen = composite_target.get_codegen_by_target( codegen_from_cli["name"]) partition_function = codegen["pass_pipeline"] mod = partition_function(mod, params, **codegen_from_cli["opts"]) # min_repeat_ms should be: # a. the value provided by the user, if any, or # b. 0ms in case target is "cpu"; otherwise 1000ms if min_repeat_ms is None: min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000 logger.info("Default --min-repeat-ms for this target is %s", min_repeat_ms) if rpc_key: if hostname is None or port is None: raise TVMCException( "You must provide a hostname and port to connect to a remote RPC device." ) if isinstance(port, str): port = int(port) logger.info("Tuning will be performed on device %s at %s:%d.", rpc_key, hostname, port) runner_ctor = auto_scheduler.RPCRunner if enable_autoscheduler else autotvm.RPCRunner runner = runner_ctor( key=rpc_key, host=hostname, port=port, number=number, repeat=repeat, n_parallel=parallel, timeout=timeout, min_repeat_ms=min_repeat_ms, ) else: logger.info("Starting localhost tuning.") runner_ctor = (auto_scheduler.LocalRPCMeasureContext if enable_autoscheduler else autotvm.LocalRunner) local_server = runner_ctor( number=number, repeat=repeat, timeout=timeout, min_repeat_ms=min_repeat_ms, ) # For autoscheduling on some devices, we need to maintain a LocalRPCMeasureContext object. if enable_autoscheduler: runner = local_server.runner else: runner = local_server if enable_autoscheduler: tasks, weights = autoscheduler_get_tuning_tasks( mod=mod, params=params, target=target, alter_layout=desired_layout, hardware_params=hardware_params, include_simple_tasks=include_simple_tasks, ) # Create the autoscheduler tuning options tuning_options = auto_scheduler.TuningOptions( num_measure_trials=trials, measure_callbacks=[auto_scheduler.RecordToFile(tuning_records)], runner=runner, early_stopping=early_stopping, ) logger.info("Autoscheduling with configuration: %s", tuning_options) # Schedule the tasks (i.e., produce a schedule for each task) schedule_tasks(tasks, weights, tuning_options, prior_records, log_estimated_latency) else: tasks = autotvm_get_tuning_tasks( mod=mod, params=params, target=target, alter_layout=desired_layout, ) # In autotvm, trials is specified per task. We can convert the per-model input # provided to per-task trials by dividing by the number of tasks. trials = int(trials / max(len(tasks), 1)) logger.info("Autotuning with %d trials per task.", trials) tuning_options = { "tuner": tuner, "trials": trials, "early_stopping": early_stopping, "measure_option": autotvm.measure_option( builder=autotvm.LocalBuilder(build_func="default"), runner=runner), "tuning_records": prior_records, } logger.info("Autotuning with configuration: %s", tuning_options) tune_tasks(tasks, tuning_records, **tuning_options) return tuning_records
def benchmark_layout_transform(
    self,
    min_exec_num=100,
    timeout=10,
    use_rpc=False,
    device_key=None,
    host="127.0.0.1",
    port=9190,
    n_parallel=1,
    build_func="default",
    layout_records=None,
    target_host=None,
    infer_layout=False,
    runner=None,
):
    """Benchmark all possible layout transformations in the graph,
    given a set of schedule candidates for each workload of the target operator.

    Parameters
    ----------
    min_exec_num : int, optional
        Minimum number of executions. The final execution time is the average of
        all execution times.
    timeout : int, optional
        Time out for each execution.
    use_rpc : boolean, optional
        Whether to use rpc mode for benchmarking.
    device_key : str, optional
        Remote device key which can be queried by
        python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
    host : str, optional
        IP address used to create RPC tracker on host machine.
    port : int, optional
        Port number used to create RPC tracker on host machine.
    n_parallel : int, optional
        The number of measurement tasks that can run in parallel.
        Set this according to the number of cpu cores (for compilation) and
        the number of devices you have (for measuring the generated code).
    build_func : str or callable, optional
        'default': call default builder. This works for normal target (llvm, cuda)
        'ndk': use Android NDK to create shared library. Use this for android target.
        callable: customized build function for other backends (e.g. VTA).
        See autotvm/measure/measure_methods.py::default_build_func for example.
    layout_records : str or iterator of (MeasureInput, MeasureResult), optional
        Collection of layout_transform benchmarking records.
        If it is a str, it should be the filename of a records log file.
        Each row of this file is an encoded record pair. Otherwise, it is an iterator.
        If this argument is set, the graph tuner will first check whether a
        layout_transform workload already exists in the records and skip
        benchmarking if possible.
    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles a device specific program such as CUDA,
        we also need host (CPU) side code to interact with the driver and
        set up the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.
    infer_layout : bool, optional
        Whether to infer layout transformation time if it doesn't exist in records,
        instead of benchmarking on the target device. This might bring performance
        loss compared to benchmarking layout transformation.
    runner : Runner, optional
        Accept a user-supplied runner.
    """
    self._logger.info("Start to benchmark layout transformation...")
    self._target, target_host = Target.canon_target_and_host(self._target, target_host)

    if layout_records is None and infer_layout:
        raise RuntimeError("Requires some records to infer layout transformation time.")

    if isinstance(layout_records, str):
        layout_records = load_from_file(layout_records)

    if not layout_records and infer_layout:
        raise RuntimeError("Records must be non-empty to infer layout transformation time.")

    num_flops, total_time = 0, 0
    if layout_records is not None:
        for record in layout_records:
            ltf_wkl = record[0].task.workload
            self._layout_transform_perf_records[ltf_wkl] = record
            input_shape = ltf_wkl[1][1]
            flops = np.prod(input_shape)
            num_flops += flops
            total_time += record[1].costs[0]
    avg_time = total_time / num_flops if num_flops > 0 else 0

    args_list = []

    def _fetch_args_callback(from_node_idx, to_node_idx, from_sch_idx, to_sch_idx, args):
        """Callback function to fetch layout transform args"""
        _, in_layout, out_layout = args
        if in_layout != out_layout:
            args_list.append(args)

    self._iterate_layout_transform(_fetch_args_callback)

    def _log_to_list(record_list):
        """Callback to log result to a list."""

        def _callback(_, inputs, results):
            """Callback implementation"""
            record_list.append((inputs[0], results[0]))

        return _callback

    builder = autotvm.LocalBuilder(n_parallel=n_parallel, build_func=build_func)
    if use_rpc:
        if device_key is None:
            raise RuntimeError("device_key needs to be set to use rpc tracker mode.")
        runner = autotvm.measure.RPCRunner(
            device_key,
            host,
            port,
            n_parallel=n_parallel,
            number=min_exec_num,
            repeat=1,
            timeout=timeout,
        )
    elif not runner:
        runner = autotvm.LocalRunner(number=min_exec_num, repeat=1, timeout=timeout)
    measure_option = autotvm.measure_option(builder=builder, runner=runner)

    for args in args_list:
        data, in_layout, out_layout = args
        ltf_workload = autotvm.task.args_to_workload(args, "layout_transform")
        if ltf_workload in self._layout_transform_perf_records:
            continue

        if infer_layout:
            input_shape = ltf_workload[1][1]
            flops = 1
            for i in input_shape:
                flops *= i

            # Rule out invalid layout transformations
            out = topi.layout_transform(data, in_layout, out_layout)
            out_flops = 1
            for i in topi.utils.get_const_tuple(out.shape):
                out_flops *= i

            if flops != out_flops:
                inferred_time = INVALID_LAYOUT_TIME
            else:
                inferred_time = flops * avg_time

            record_input = MeasureInput(target=self._target, task=None, config=None)
            record_output = MeasureResult(
                costs=(inferred_time,), error_no=0, all_cost=-1, timestamp=-1
            )
            self._layout_transform_perf_records[ltf_workload] = (record_input, record_output)
            continue

        records = []
        task = autotvm.task.create("layout_transform", args=args, target=self._target)
        tuner = autotvm.tuner.GridSearchTuner(task)
        tuner.tune(n_trial=1, measure_option=measure_option, callbacks=[_log_to_list(records)])
        if not isinstance(records[0][1].costs[0], float):
            records[0] = (records[0][0], records[0][1]._replace(costs=(INVALID_LAYOUT_TIME,)))
        self._layout_transform_perf_records[ltf_workload] = records[0]

    self._iterate_layout_transform(self._create_matrix_callback)
    self._logger.info("Benchmarking layout transformation successful.")
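# Illustrative usage sketch for benchmark_layout_transform() above, via the
# graph tuner that owns this method (following the x86 graph tuning tutorial;
# "kernel_records.log" is a placeholder for prior autotvm kernel tuning records):
from tvm import relay
from tvm.autotvm.graph_tuner import DPTuner
from tvm.relay import testing

mod, params = testing.resnet.get_workload(num_layers=18, batch_size=1)
target_ops = [relay.op.get("nn.conv2d")]
executor = DPTuner(
    mod["main"], {"data": (1, 3, 224, 224)}, "kernel_records.log", target_ops, "llvm"
)
executor.benchmark_layout_transform(min_exec_num=2000)
executor.run()
executor.write_opt_sch2record_file("graph_opt_schedule.log")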