def test_canon_target_and_host_0():
    target = None
    host = None
    target, host = Target.canon_target_and_host(target, host)
    assert target is None
    assert host is None
def schedule_hwnc_tensorcore_cuda(cfg, s, Conv): """Schedule tensorcore template""" packed_data, packed_kernel = s[Conv].op.input_tensors ic, kh, kw, ii = s[Conv].op.reduce_axis pad_data = s[packed_data].op.input_tensors[0] block_x = te.thread_axis('blockIdx.x') block_y = te.thread_axis('blockIdx.y') block_z = te.thread_axis('blockIdx.z') thread_x = te.thread_axis('threadIdx.x') thread_y = te.thread_axis('threadIdx.y') thread_z = te.thread_axis('threadIdx.z') # Designate the memory hierarchy AS = s.cache_read(packed_data, 'shared', [Conv]) WS = s.cache_read(packed_kernel, 'shared', [Conv]) AF = s.cache_read(AS, 'wmma.matrix_a', [Conv]) WF = s.cache_read(WS, 'wmma.matrix_b', [Conv]) ConvF = s.cache_write(Conv, 'wmma.accumulator') if Conv.op in s.outputs: output = Conv ConvS = s.cache_read(ConvF, 'shared', [Conv]) OL = ConvS else: output = s.outputs[0].output(0) s[Conv].set_scope('shared') OL = Conv out_dtype = Conv.dtype if isinstance(packed_kernel.op, te.tensor.ComputeOp) and packed_kernel.name == "packed_kernel": if autotvm.GLOBAL_SCOPE.in_tuning: s[packed_kernel].pragma( s[packed_kernel].op.axis[0], "debug_skip_region") else: with Target('cuda'): schedule_injective_from_existing(s, packed_kernel) if isinstance(pad_data.op, te.tensor.ComputeOp) and "pad" in pad_data.op.tag: s[pad_data].compute_inline() data = pad_data.op.input_tensors[0] if autotvm.GLOBAL_SCOPE.in_tuning: # skip this part during tuning to make recrods accurate # this part will be pre-computed during NNVM's pre-compute optimization pass s[pad_data].pragma(s[pad_data].op.axis[0], "debug_skip_region") else: data = pad_data s[data].compute_inline() data_dtype = data.dtype kernel_dtype = packed_kernel.dtype # Schedule for autotvm cfg.define_knob("block_row_warps", [1, 2, 4]) cfg.define_knob("block_col_warps", [1, 2, 4]) cfg.define_knob("warp_row_tiles", [1, 2, 4, 8, 16]) cfg.define_knob("warp_col_tiles", [1, 2, 4, 8, 16]) cfg.define_knob("chunk", [1, 2, 4, 8]) cfg.define_knob("fuse_pack", [0, 1]) cfg.define_knob("split_block_k_nums", [1, 2, 4, 8, 16, 32]) cfg.define_knob("vector_ws", [1, 8]) cfg.define_knob("vector_as", [1, 8, 16]) block_row_warps = cfg["block_row_warps"].val block_col_warps = cfg["block_col_warps"].val warp_row_tiles = cfg["warp_row_tiles"].val warp_col_tiles = cfg["warp_col_tiles"].val chunk = cfg["chunk"].val vector_as = cfg["vector_as"].val vector_ws = cfg["vector_ws"].val split_block_k_nums = cfg["split_block_k_nums"].val fuse_pack = cfg["fuse_pack"].val if not fuse_pack: s[packed_data].compute_inline() else: with Target('cuda'): schedule_injective_from_existing(s, packed_data) if data_dtype in ['int4', 'uint4']: wmma_m = wmma_n = 8 wmma_k = 32 else: wmma_m = 8 wmma_n = 32 wmma_k = 16 warp_size = 32 # Schedule for output if len(s[output].op.axis) == 4: hc, wc, nc, oc, = output.op.axis nc, nnc = s[output].split(nc, factor=wmma_m) oc, ooc = s[output].split(oc, factor=wmma_n) else: hc, wc, nc, oc, nnc, ooc = output.op.axis kernel_scope, hc = s[output].split(hc, nparts=1) block_k = s[output].fuse(hc, wc) block_k, split_block_k = s[output].split( block_k, factor=split_block_k_nums) nc, nci = s[output].split(nc, factor=warp_row_tiles) block_i, nc = s[output].split(nc, factor=block_row_warps) oc, oci = s[output].split(oc, factor=warp_col_tiles) block_j, oc = s[output].split(oc, factor=block_col_warps) s[output].reorder(block_k, split_block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc) t = s[output].fuse(nnc, ooc) _, tx = s[output].split(t, factor=warp_size) s[output].bind(block_k, block_z) 
s[output].bind(block_i, block_x) s[output].bind(block_j, block_y) s[output].bind(tx, thread_x) s[output].bind(nc, thread_y) s[output].bind(oc, thread_z) # Schedule wmma store s[OL].compute_at(s[output], block_j) hc, wc, nc, oc, nnc, ooc = OL.op.axis oc, oci = s[OL].split(oc, factor=warp_col_tiles) _, oc = s[OL].split(oc, factor=block_col_warps) nc, nci = s[OL].split(nc, factor=warp_row_tiles) _, nc = s[OL].split(nc, factor=block_row_warps) s[OL].reorder(nc, oc, nci, oci, nnc, ooc) s[OL].bind(nc, thread_y) s[OL].bind(oc, thread_z) # Schedule local computation s[ConvF].compute_at(s[OL], oc) _, _, n, o, nnf, oof = ConvF.op.axis ko, ki = s[ConvF].split(ic, factor=chunk) s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii) cfg.define_reorder("reorder_inner", [ko, kh], policy="all") cfg["reorder_inner"].apply(s, ConvF, [ko, kh]) cfg["reorder_inner"].apply(s, ConvF, [ki, kw]) cfg.define_knob("compute_at_AS", [0, 1, 2, 3]) cfg.define_knob("compute_at_WS", [0, 1, 2, 3]) compute_at_AS = cfg["compute_at_AS"].val compute_at_WS = cfg["compute_at_WS"].val # Move intermediate computation into each output compute tile s[AF].compute_at(s[ConvF], kw) s[WF].compute_at(s[ConvF], kw) # Schedule for A's share memory if compute_at_AS == 0: s[AS].compute_at(s[ConvF], ki) elif compute_at_AS == 1: s[AS].compute_at(s[ConvF], kw) elif compute_at_AS == 2: s[AS].compute_at(s[ConvF], ko) else: s[AS].compute_at(s[ConvF], kh) _, _, n, _, nn, ii = AS.op.axis tx, xo = s[AS].split(n, nparts=block_row_warps) ty, _ = s[AS].split(xo, nparts=block_col_warps) t = s[AS].fuse(nn, ii) to, ti = s[AS].split(t, nparts=warp_size) ti, _t = s[AS].split(ti, factor=vector_as) s[AS].bind(tx, thread_y) s[AS].bind(ty, thread_z) s[AS].bind(to, thread_x) s[AS].vectorize(_t) # Schedule for W's share memory if compute_at_WS == 0: s[WS].compute_at(s[ConvF], ki) elif compute_at_WS == 1: s[WS].compute_at(s[ConvF], kw) elif compute_at_WS == 2: s[WS].compute_at(s[ConvF], ko) else: s[WS].compute_at(s[ConvF], kh) s[WS].compute_at(s[ConvF], kw) kh, kw, ic, o, ii, oo = WS.op.axis tx, xo = s[WS].split(o, nparts=block_row_warps) ty, _ = s[WS].split(xo, nparts=block_col_warps) t = s[WS].fuse(ii, oo) to, ti = s[WS].split(t, nparts=warp_size) ti, _t = s[WS].split(ti, factor=vector_ws) s[WS].bind(tx, thread_y) s[WS].bind(ty, thread_z) s[WS].bind(to, thread_x) s[WS].vectorize(ti) # double buffer cfg.define_knob('AS_double_buffer', [0, 1]) cfg.define_knob('WS_double_buffer', [0, 1]) if cfg['AS_double_buffer'].val: s[AS].double_buffer() if cfg['WS_double_buffer'].val: s[WS].double_buffer() # unroll cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', False) shape = (wmma_m, wmma_n, wmma_k) AS_shape = (wmma_m, wmma_k) AL_shape = (wmma_m, wmma_k) WS_shape = (wmma_n, wmma_k) WL_shape = (wmma_n, wmma_k) CL_shape = (wmma_m, wmma_n) CS_shape = (wmma_m, wmma_n) AL_gemm = te.placeholder(AL_shape, name='A', dtype=data_dtype) WL_gemm = te.placeholder(WL_shape, name='B', dtype=kernel_dtype) k_gemm = te.reduce_axis((0, wmma_k), name="k") CL_compute = te.compute(CL_shape, lambda ii, jj: te.sum((AL_gemm[ii, k_gemm].astype( 'int32') * WL_gemm[jj, k_gemm].astype('int32')), axis=k_gemm), name='C') AL_strides = [wmma_k, 1] AS_strides = [wmma_k, 1] WL_strides = [wmma_k, 1] WS_strides = [wmma_k, 1] CL_strides = [wmma_n, 1] CS_strides = [wmma_n, 1] s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix_A(AL_strides, AS_strides, shape, 
"row_major", AS_shape, AL_shape, data_dtype)) s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix_W(WL_strides, WS_strides, shape, "col_major", WS_shape, WL_shape, kernel_dtype)) s[OL].tensorize(nnc, intrin_wmma_store_matrix(CS_strides, CL_strides, shape, out_dtype, CL_shape, CS_shape)) s[ConvF].tensorize(nnf, intrin_wmma_gemm(AL_gemm, WL_gemm, CL_compute, AL_strides, WL_strides, CL_strides, shape)) return s
def build( ir_mod, target=None, target_host=None, executor=Executor("graph"), runtime=Runtime("cpp"), workspace_memory_pools=None, params=None, mod_name="default", ): # fmt: off # pylint: disable=line-too-long """Helper function that builds a Relay function to run on TVM graph executor. Parameters ---------- ir_mod : :py:class:`~tvm.IRModule` The IR module to build. Using relay.Function is deprecated. target : str, :any:`tvm.target.Target`, or dict of str(i.e. device/context name) to str/tvm.target.Target, optional For heterogeneous compilation, it is a dictionary indicating context to target mapping. For homogeneous compilation, it is a build target. target_host : str or :any:`tvm.target.Target`, optional Host compilation target, if target is device. When TVM compiles device specific program such as CUDA, we also need host(CPU) side code to interact with the driver setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, otherwise a stackvm interpreter is used. executor : Optional[Executor] The executor configuration with which to build the model. Defaults to "graph" if no executor specified. runtime : Optional[Runtime] Runtime configuration to use when building the model. Defaults to "cpp" if no runtime specified. workspace_memory_pools : Optional[WorkspaceMemoryPools] The object that contains an Array of PoolInfo objects that hold properties of workspace pools that could be used by the inference. params : dict of str to NDArray Input parameters to the graph that do not change during inference time. Used for constant folding. mod_name: Optional[str] The module name we will build Returns ------- factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule The runtime factory for the TVM graph executor. """ # pylint: enable=line-too-long # fmt: on if not isinstance(ir_mod, (IRModule, _function.Function)): raise ValueError("Type of input parameter mod must be tvm.IRModule") if isinstance(ir_mod, _function.Function): if params: ir_mod = bind_params_by_name(ir_mod, params) ir_mod = IRModule.from_expr(ir_mod) warnings.warn( "Please use input parameter mod (tvm.IRModule) " "instead of deprecated parameter mod (tvm.relay.function.Function)", DeprecationWarning, ) if target_host is not None: warnings.warn( "target_host parameter is going to be deprecated. " "Please pass in tvm.target.Target(target, host=target_host) instead." 
) target, target_host = Target.check_and_update_host_consist( target, target_host, target_is_dict_key=False ) target = build_target_by_device_type_map(target) if isinstance(target_host, (str, Target)): target_host = Target(target_host) elif target_host: raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") # All of this logic is to raise deprecation warnings for various parameters # TODO(Mousius) Remove these after some time deprecated_params_target = target_host or list(target.values())[0] deprecated_executor, deprecated_runtime = _reconstruct_from_deprecated_options( deprecated_params_target ) if deprecated_executor: executor = deprecated_executor if deprecated_runtime: runtime = deprecated_runtime # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): tophub_context = autotvm.tophub.context(list(target.values())) else: tophub_context = autotvm.utils.EmptyContext() with tophub_context: bld_mod = BuildModule() graph_json, runtime_mod, params = bld_mod.build( mod=ir_mod, target=target, params=params, executor=executor, runtime=runtime, workspace_memory_pools=workspace_memory_pools, mod_name=mod_name, ) func_metadata = bld_mod.get_function_metadata() devices = bld_mod.get_devices() lowered_ir_mods = bld_mod.get_irmodule() executor_codegen_metadata = bld_mod.get_executor_codegen_metadata() if str(executor) == "aot": executor_factory = _executor_factory.AOTExecutorFactoryModule( ir_mod, lowered_ir_mods, target, executor, runtime, runtime_mod, mod_name, params, func_metadata, executor_codegen_metadata, devices, ) elif str(executor) == "graph": executor_factory = _executor_factory.GraphExecutorFactoryModule( ir_mod, target, executor, graph_json, runtime_mod, mod_name, params, func_metadata ) else: assert False, "Executor " + executor + " not supported" return executor_factory
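# A short usage sketch for the build() helper above (its public alias is
# relay.build); the module, shapes, and input values are illustrative, and an
# LLVM-enabled TVM build is assumed.
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor

def _relay_build_sketch():
    x = relay.var("x", shape=(1, 4), dtype="float32")
    fn = relay.Function([x], relay.add(x, relay.const(1.0)))
    mod = tvm.IRModule.from_expr(fn)
    lib = relay.build(mod, target="llvm")  # returns a GraphExecutorFactoryModule
    dev = tvm.cpu(0)
    module = graph_executor.GraphModule(lib["default"](dev))
    module.set_input("x", np.zeros((1, 4), dtype="float32"))
    module.run()
    return module.get_output(0).numpy()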
def extract_from_multiple_program(mods, params, target, target_host=None, ops=None): """Extract tuning tasks from multiple relay programs. This function collects tuning tasks by building a list of programs with a "tracing" target and tracing all the calls to topi. Parameters ---------- mods: List[tvm.IRModule] or List[relay.function.Function] The list of modules or functions to tune params: List of dict of str to numpy array The associated parameters of the programs target: tvm.target.Target The compilation target target_host: tvm.target.Target The host compilation target ops: List[tvm.ir.Op] or None List of relay ops to be tuned. If not specified, all tunable ops will be extracted. Returns ------- task: Array of autotvm.task.Task collected tasks """ # pylint: disable=import-outside-toplevel from tvm import relay from tvm import topi env = TaskExtractEnv.get() # merge target and target host target, target_host = Target.check_and_update_host_consist(target, target_host) # run compiler to collect all TOPI calls during compilation env.reset(ops) with env: # disable logger temporarily old_state = logger.disabled logger.disabled = True for mod, param in zip(mods, params): if isinstance(mod, relay.function.Function): mod = tvm.IRModule.from_expr(mod) assert isinstance( mod, tvm.IRModule ), "only support relay Module or Function to be tuned" relay.backend.te_compiler.get().clear() # wrap build call in thread to avoid multiprocessing problems build_thread = threading.Thread(target=_lower, args=(mod, target, param)) build_thread.start() build_thread.join() relay.backend.te_compiler.get().clear() # Clear the warning message cache in FallbackContext if isinstance(DispatchContext.current, FallbackContext): DispatchContext.current.memory = {} DispatchContext.warning_messages = set() logger.disabled = old_state # create tasks for target tasks = [] for task_name, args in env.get_tasks(): try: tsk = create(task_name, args, target=target) tasks.append(tsk) except topi.InvalidShapeError: logger.warning("Invalid shape during AutoTVM task creation") return tasks
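# A hedged usage sketch for extract_from_multiple_program(); the two dense
# workloads and the empty parameter dicts are illustrative only.
import tvm
from tvm import relay
from tvm.target import Target

def _extract_tasks_sketch():
    def small_dense(units):
        data = relay.var("data", shape=(1, 16), dtype="float32")
        weight = relay.var("weight", shape=(units, 16), dtype="float32")
        return tvm.IRModule.from_expr(relay.Function([data, weight], relay.nn.dense(data, weight)))

    mods = [small_dense(8), small_dense(32)]
    params_list = [{}, {}]
    return extract_from_multiple_program(mods, params_list, target=Target("llvm"))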
def test_check_and_update_host_consist_2():
    target = Target("cuda")
    host = Target("llvm")
    target, host = Target.check_and_update_host_consist(target, host)
    assert target.kind.name == "cuda"
    assert target.host.kind.name == "llvm"
def test_resnet_subgraph(algorithm, workspace_size): target = Target("c") global_workspace_pool = usmp_utils.PoolInfo( pool_name="global_workspace", target_access={target: usmp_utils.PoolInfo.READ_WRITE_ACCESS}, ) tir_mod = ResnetStructure tir_mod = _assign_targets_to_primfuncs_irmodule(tir_mod, target) tir_mod = _assign_poolinfos_to_allocates_in_irmodule( tir_mod, [global_workspace_pool]) main_func = tir_mod["tvmgen_default_run_model"] buffer_info_analysis = tvm.tir.usmp.analysis.extract_buffer_info( main_func, tir_mod) assert buffer_info_analysis.memory_pressure == 7200256 fcreate_array_bi = tvm.get_global_func("tir.usmp.CreateArrayBufferInfo") buffer_info_arr = fcreate_array_bi(buffer_info_analysis.buffer_info_stmts) fusmp_algo = tvm.get_global_func(f"tir.usmp.algo.{algorithm}") buffer_pool_allocations = fusmp_algo(buffer_info_arr, buffer_info_analysis.memory_pressure) buffer_info_map_names = dict() for buf_info in buffer_info_arr: buffer_info_map_names[buf_info.name_hint] = buf_info # check conflicts _verify_conflicts( "sid_7", [ "PaddedInput_1", "sid_2", "Conv2dOutput_1", "PaddedInput_2", ], buffer_info_map_names, ) _verify_conflicts( "Conv2dOutput_3", [ "PaddedInput_3", "sid_6", ], buffer_info_map_names, ) _verify_conflicts( "sid_6", [ "Conv2dOutput_2", "PaddedInput_2", "sid_2", "PaddedInput_3", "Conv2dOutput_3", ], buffer_info_map_names, ) _verify_conflicts( "Conv2dOutput", [ "sid_8", "sid_2", "PaddedInput", ], buffer_info_map_names, ) _verify_conflicts( "PaddedInput_3", [ "sid_6", "sid_2", "Conv2dOutput_3", ], buffer_info_map_names, ) _verify_conflicts( "Conv2dOutput_2", [ "PaddedInput_2", "sid_2", "sid_6", ], buffer_info_map_names, ) _verify_conflicts( "PaddedInput_1", [ "sid_8", "sid_2", "sid_7", "Conv2dOutput_1", ], buffer_info_map_names, ) _verify_conflicts( "Conv2dOutput_1", [ "sid_7", "PaddedInput_1", "sid_2", ], buffer_info_map_names, ) _verify_conflicts( "PaddedInput", [ "sid_2", "sid_8", "Conv2dOutput", ], buffer_info_map_names, ) _verify_conflicts( "sid_8", [ "PaddedInput", "sid_2", "Conv2dOutput", "PaddedInput_1", ], buffer_info_map_names, ) _verify_conflicts( "sid_2", [ "PaddedInput", "sid_8", "Conv2dOutput", "PaddedInput_1", "sid_7", "Conv2dOutput_1", "PaddedInput_2", "Conv2dOutput_2", "sid_6", "PaddedInput_3", ], buffer_info_map_names, ) _verify_conflicts( "PaddedInput_2", [ "sid_7", "sid_2", "Conv2dOutput_2", "sid_6", ], buffer_info_map_names, ) _check_max_workspace_size(buffer_pool_allocations, global_workspace_pool, workspace_size)
def _target() -> Target: return Target("cuda --max_threads_per_block=1024", host="llvm")
def test_canon_multi_target_and_host_3():
    raw_targets = Target.canon_multi_target_and_host(["llvm", "cuda"])
    assert len(raw_targets) == 2
    assert raw_targets[0].kind.name == "llvm"
    assert raw_targets[1].kind.name == "cuda"

def test_canon_multi_target_and_host_5():
    raw_targets = Target.canon_multi_target_and_host("cuda", "llvm")
    assert len(raw_targets) == 1
    assert raw_targets[0].kind.name == "cuda"
    assert raw_targets[0].host.kind.name == "llvm"

def test_canon_multi_target_and_host_1():
    raw_targets = Target.canon_multi_target_and_host({"kind": "llvm"})
    assert len(raw_targets) == 1
    assert raw_targets[0].kind.name == "llvm"

def test_canon_multi_target_and_host_2():
    raw_targets = Target.canon_multi_target_and_host({1: "llvm", 2: "cuda"})
    assert len(raw_targets) == 2
    assert raw_targets[0].kind.name == "llvm"
    assert raw_targets[1].kind.name == "cuda"

def test_canon_multi_target_and_host_0():
    with pytest.raises(AssertionError):
        Target.canon_multi_target_and_host(None)

def test_canon_target_and_host_2():
    target = Target("cuda")
    host = Target("llvm")
    target, host = Target.canon_target_and_host(target, host)
    assert target.kind.name == "cuda"
    assert target.host.kind.name == "llvm"

def test_canon_target_and_host_1():
    target = None
    host = "llvm"
    with pytest.raises(AssertionError, match=r"Target host is not empty when target is empty."):
        target, host = Target.canon_target_and_host(target, host)
def _target() -> Target: return Target("cuda", host="llvm")
def _target(target: Union[str, Target]) -> Target:
    if isinstance(target, str):
        target = Target(target)
    if not isinstance(target, Target):
        raise TypeError(f"Expected `target` to be str or Target, but gets: {target}")
    return target
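# Two illustrative checks of the normalizer above: a string and an existing
# Target are both accepted and canonicalized to a Target instance.
assert _target("llvm").kind.name == "llvm"
assert isinstance(_target(Target("cuda")), Target)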
def test_fanout(algorithm, workspace_size):
    """
    The test case here represents BufferInfo objects
    that could get generated for a fanout topology
    such as:

            (Op A)
               |
             bi_a
          ---------
          |       |
       (Op B)  (Op C)
          |       |
        bi_b    bi_c
          |       |
       (Op D)  (Op E)
          |       |
        bi_d    bi_e
          |       |
       (Op F) ------
          |
        bi_f
          |
       (Op G)
          |
        bi_g
    """
    target = Target("c")
    global_workspace_pool = usmp_utils.PoolInfo(
        pool_name="global_workspace",
        target_access={target: usmp_utils.PoolInfo.READ_WRITE_ACCESS},
    )
    bi_a = usmp_utils.BufferInfo(name_hint="bi_a", size_bytes=10, pool_candidates=[global_workspace_pool])
    bi_b = usmp_utils.BufferInfo(name_hint="bi_b", size_bytes=20, pool_candidates=[global_workspace_pool])
    bi_c = usmp_utils.BufferInfo(name_hint="bi_c", size_bytes=100, pool_candidates=[global_workspace_pool])
    bi_d = usmp_utils.BufferInfo(name_hint="bi_d", size_bytes=40, pool_candidates=[global_workspace_pool])
    bi_e = usmp_utils.BufferInfo(name_hint="bi_e", size_bytes=50, pool_candidates=[global_workspace_pool])
    bi_f = usmp_utils.BufferInfo(name_hint="bi_f", size_bytes=60, pool_candidates=[global_workspace_pool])
    bi_g = usmp_utils.BufferInfo(name_hint="bi_g", size_bytes=70, pool_candidates=[global_workspace_pool])

    # Create the conflicts for the fanout graph shown above
    bi_a.set_conflicts([bi_b, bi_c])
    bi_b.set_conflicts([bi_a, bi_c, bi_e])
    bi_c.set_conflicts([bi_e, bi_a, bi_b, bi_d])
    bi_d.set_conflicts([bi_b, bi_f, bi_c, bi_e])
    bi_e.set_conflicts([bi_c, bi_f, bi_b, bi_d])
    bi_f.set_conflicts([bi_d, bi_e, bi_f])
    bi_g.set_conflicts([bi_f])

    buffer_info_arr = [bi_a, bi_b, bi_c, bi_d, bi_e, bi_f, bi_g]
    fusmp_algo = tvm.get_global_func(f"tir.usmp.algo.{algorithm}")
    buffer_pool_allocations = fusmp_algo(buffer_info_arr, 0)
    _check_max_workspace_size(buffer_pool_allocations, global_workspace_pool, workspace_size)
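# A hedged sketch of how the allocations returned by the USMP algorithm above can
# be inspected: each BufferInfo maps to a PoolAllocation carrying a byte_offset, so
# the pool extent is the largest offset plus buffer size. Field names follow
# tvm.tir.usmp.utils as used in this test; the helper name below is illustrative.
def _pool_extent_sketch(buffer_pool_allocations):
    extent = 0
    for buf_info, pool_allocation in buffer_pool_allocations.items():
        extent = max(extent, int(pool_allocation.byte_offset) + int(buf_info.size_bytes))
    return extent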
def create_executor(kind="debug", mod=None, device=None, target="llvm", params=None):
    """Factory function to create an executor.

    Example
    -------
    .. code-block:: python

        import tvm.relay
        import numpy as np

        x = tvm.relay.var("x", tvm.relay.TensorType([1], dtype="float32"))
        expr = tvm.relay.add(x, tvm.relay.Constant(tvm.nd.array(np.array([1], dtype="float32"))))
        tvm.relay.create_executor(
            kind="vm", mod=tvm.IRModule.from_expr(tvm.relay.Function([x], expr))
        ).evaluate()(np.array([2], dtype="float32"))
        # returns `array([3.], dtype=float32)`

    Parameters
    ----------
    kind : str
        The type of executor. Available options are `debug` for the interpreter,
        `graph` for the graph executor, and `vm` for the virtual machine.

    mod : :py:class:`~tvm.IRModule`
        The Relay module containing a collection of functions.

    device : :py:class:`Device`
        The device on which to execute the code.

    target : :py:class:`tvm.Target`
        The corresponding compilation target.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time.

    Returns
    -------
    executor : :py:class:`~tvm.relay.backend.interpreter.Executor`
    """
    if mod is None:
        mod = IRModule()
    if device is not None:
        assert device.device_type == _nd.device(str(target), 0).device_type
    else:
        device = _nd.device(str(target), 0)

    if params is not None:
        mod = IRModule.from_expr(bind_params_by_name(mod["main"], params))

    if isinstance(target, str):
        target = Target(target)
    if kind == "debug":
        return _interpreter.Interpreter(mod, device, target)
    if kind == "graph":
        return GraphExecutor(mod, device, target)
    if kind == "vm":
        return VMExecutor(mod, device, target)
    raise RuntimeError("unknown execution strategy: {0}".format(kind))
def compile_model( tvmc_model: TVMCModel, target: str, tuning_records: Optional[str] = None, package_path: Optional[str] = None, cross: Optional[Union[str, Callable]] = None, cross_options: Optional[str] = None, export_format: str = "so", dump_code: Optional[List[str]] = None, target_host: Optional[str] = None, desired_layout: Optional[str] = None, disabled_pass: Optional[str] = None, ): """Compile a model from a supported framework into a TVM module. This function takes a union of the arguments of both frontends.load_model and compiler.compile_relay. The resulting TVM module can be executed using the graph executor. Parameters ---------- tvmc_model : TVMCModel The model object that should be compiled. target : str The target for which to compile. Can be a plain string or a path. tuning_records : str A path to tuning records produced using tvmc.tune. When provided, compilation will use more optimized kernels leading to better results. package_path : str, optional The path to export the compiled model to. If not provided it will be saved in a temporary directory. cross : str or callable object, optional Function that performs the actual compilation cross_options : str, optional Command line options to be passed to the cross compiler. export_format : str What format to use when saving the function library. Must be one of "so" or "tar". When compiling for a remote device without a cross compiler, "tar" will likely work better. dump_code : list, optional Dump the generated code for the specified source types, on the requested target. target_host : str, optional The target of the host machine if host-side code needs to be generated. desired_layout: str, optional The layout to convert the graph to. Note, the convert layout pass doesn't currently guarantee the whole of the graph will be converted to the chosen layout. disabled_pass: str, optional Comma-separated list of passes which needs to be disabled during compilation Returns ------- compiled_model : TVMCPackage The compiled TVMCModel ready to be run. 
""" mod, params = tvmc_model.mod, tvmc_model.params config = {} if desired_layout: mod = common.convert_graph_layout(mod, desired_layout) tvm_target, extra_targets = common.target_from_cli(target) tvm_target, target_host = Target.check_and_update_host_consist( tvm_target, target_host) for codegen_from_cli in extra_targets: codegen = composite_target.get_codegen_by_target( codegen_from_cli["name"]) partition_function = codegen["pass_pipeline"] mod = partition_function(mod, params, **codegen_from_cli["opts"]) if codegen["config_key"] is not None: config[codegen["config_key"]] = codegen_from_cli["opts"] if tuning_records and os.path.exists(tuning_records): logger.debug("tuning records file provided: %s", tuning_records) use_autoscheduler = True try: auto_scheduler.load_records(tuning_records) except tvm._ffi.base.TVMError: use_autoscheduler = False if use_autoscheduler: with auto_scheduler.ApplyHistoryBest(tuning_records): config["relay.backend.use_auto_scheduler"] = True with tvm.transform.PassContext(opt_level=3, config=config, disabled_pass=disabled_pass): logger.debug("building relay graph with autoscheduler") graph_module = relay.build(mod, target=tvm_target, params=params) else: with autotvm.apply_history_best(tuning_records): with tvm.transform.PassContext(opt_level=3, config=config, disabled_pass=disabled_pass): logger.debug("building relay graph with tuning records") graph_module = relay.build(mod, target=tvm_target, params=params) else: with tvm.transform.PassContext(opt_level=3, config=config, disabled_pass=disabled_pass): logger.debug("building relay graph (no tuning records provided)") graph_module = relay.build(mod, target=tvm_target, params=params) # Generate output dump files with sources if dump_code is None: dump_code = [] if not isinstance(dump_code, list): dump_code = [dump_code] dumps = {} for source_type in dump_code: lib = graph_module.get_lib() # TODO lib.get_source call have inconsistent behavior for unsupported # formats (@leandron). source = str(mod) if source_type == "relay" else lib.get_source( source_type) dumps[source_type] = source # Create a new tvmc model package object from the graph definition. package_path = tvmc_model.export_package(graph_module, package_path, cross, cross_options, export_format) # Write dumps to file. if dumps: save_dumps(package_path, dumps) return TVMCPackage(package_path)
def build( self, mod, target=None, target_host=None, params=None, executor="graph", mod_name=None ): """ Parameters ---------- mod : :py:class:`~tvm.IRModule` The IRModule to build. target : str, :any:`tvm.target.Target`, or dict of str(i.e. device/context name) to str/tvm.target.Target, optional For heterogeneous compilation, it is a dictionary indicating context to target mapping. For homogeneous compilation, it is a build target. target_host : str or :any:`tvm.target.Target`, optional Host compilation target, if target is device. When TVM compiles device specific program such as CUDA, we also need host(CPU) side code to interact with the driver to setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, otherwise a stackvm intepreter is used. params : dict of str to NDArray Input parameters to the graph that do not change during inference time. Used for constant folding. executor: str[Optional] The type of executor to be used in order to run the model: - If "graph" is specified, then the graph_executor will be used - If "aot" is specified, then the aot_executor will be used mod_name: Optional[str] The module name we will build Returns ------- graph_json : str The json string that can be accepted by graph executor. mod : tvm.Module The module containing necessary libraries. params : dict The parameters of the final graph. """ target = build_target_by_device_type_map(target) target, target_host = Target.check_and_update_host_consist( target, target_host, target_is_dict_key=False ) # Setup the params. if params: self._set_params(params) # Build the IR module. If auto_scheduler is not enabled, # then use the TOPI-defined schedule. use_auto_scheduler = PassContext.current().config.get( "relay.backend.use_auto_scheduler", False ) # Turn off AutoTVM config not found warnings if auto_scheduler is enabled. old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler mod_name = mangle_module_name(mod_name) self._build(mod, target, target_host, executor, mod_name) autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent # Get artifacts mod = self.get_module() params = self.get_params() executor_config = self.get_graph_json() if executor == "graph" else None return executor_config, mod, params
def enabled(): return "cmsis-nn" in Target.list_kinds()
def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"): # fmt: off # pylint: disable=line-too-long """Helper function that builds a Relay function to run on TVM graph executor. Parameters ---------- ir_mod : :py:class:`~tvm.IRModule` The IR module to build. Using relay.Function is deprecated. target : str, :any:`tvm.target.Target`, or dict of str(i.e. device/context name) to str/tvm.target.Target, optional For heterogeneous compilation, it is a dictionary indicating context to target mapping. For homogeneous compilation, it is a build target. target_host : str or :any:`tvm.target.Target`, optional Host compilation target, if target is device. When TVM compiles device specific program such as CUDA, we also need host(CPU) side code to interact with the driver setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, otherwise a stackvm intepreter is used. params : dict of str to NDArray Input parameters to the graph that do not change during inference time. Used for constant folding. mod_name: Optional[str] The module name we will build Returns ------- factory_module : tvm.relay.backend.executor_factory.ExecutorFactoryModule The runtime factory for the TVM graph executor. """ # pylint: enable=line-too-long # fmt: on if not isinstance(ir_mod, (IRModule, _function.Function)): raise ValueError("Type of input parameter mod must be tvm.IRModule") if isinstance(ir_mod, _function.Function): if params: ir_mod = bind_params_by_name(ir_mod, params) ir_mod = IRModule.from_expr(ir_mod) warnings.warn( "Please use input parameter mod (tvm.IRModule) " "instead of deprecated parameter mod (tvm.relay.function.Function)", DeprecationWarning, ) target = build_target_by_device_type_map(target) if isinstance(target_host, (str, Target)): target_host = Target(target_host) elif target_host: raise ValueError("target host must be the type of str, " + "tvm.target.Target, or None") target, target_host = Target.check_and_update_host_consist( target, target_host, target_is_dict_key=False ) # Retrieve the executor from the target executor = get_executor_from_target(target, target_host) # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): tophub_context = autotvm.tophub.context(list(target.values())) else: tophub_context = autotvm.utils.EmptyContext() with tophub_context: bld_mod = BuildModule() executor_config, runtime_mod, params = bld_mod.build( mod=ir_mod, target=target, params=params, executor=executor, mod_name=mod_name ) func_metadata = bld_mod.get_function_metadata() if executor == "aot": executor_factory = _executor_factory.AOTExecutorFactoryModule( ir_mod, target, runtime_mod, mod_name, params, func_metadata ) elif executor == "graph": executor_factory = _executor_factory.GraphExecutorFactoryModule( ir_mod, target, executor_config, runtime_mod, mod_name, params, func_metadata ) else: assert False, "Executor " + executor + " not supported" return executor_factory
def test_check_and_update_host_consist_0():
    target = None
    host = None
    target, host = Target.check_and_update_host_consist(target, host)
def _target() -> Target: return Target("nvidia/geforce-rtx-3080")
def tune_model( tvmc_model: TVMCModel, target: str, tuning_records: Optional[str] = None, prior_records: Optional[str] = None, enable_autoscheduler: bool = False, rpc_key: Optional[str] = None, hostname: Optional[str] = None, port: Optional[Union[int, str]] = 9090, trials: int = 10000, target_host: Optional[str] = None, tuner: str = "xgb", min_repeat_ms: Optional[int] = None, early_stopping: Optional[int] = None, desired_layout: Optional[str] = None, timeout: int = 10, repeat: int = 1, number: int = 10, parallel: int = 4, hardware_params: Optional[HardwareParams] = None, include_simple_tasks: bool = False, log_estimated_latency: bool = False, additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None, ): """Use tuning to automatically optimize the functions in a model. Parameters ---------- tvmc_model : TVMCModel The model to be optimized. target : str Compilation target as plain string, inline JSON or path to a JSON file. tuning_records: str, optional The path to a file that tuning results will be saved to. If not specified, a temporary file will be used. prior_records: str, optional A path to previous tuning results that will be used to hot-start the tuning cost model if provided. enable_autoscheduler : bool, optional When true, use autoscheduling rather than autotvm. This should produce faster kernels for compatible model-target pairs. rpc_key : str, optional The RPC tracker key of the target device. Required when rpc_tracker is provided. hostname : str, optional The IP address of an RPC tracker, used when benchmarking remotely. port : int or str, optional The port of the RPC tracker to connect to. Defaults to 9090. trials : int, optional The number of schedules to try out for the entire model. Note that the default value is chosen as a decent average for most models, but larger models may need more trials to reach a good result while smaller models will converge with fewer trials. tuner : str, optional The type of tuner to use when tuning with autotvm. Can be one of "ga", "gridsearch", "random", "xgb", "xgb_knob", and "xgb-rank". min_repeat_ms : int, optional Minimum time to run each trial. Defaults to 0 on x86 and 1000 on other targets. early_stopping : int, optional When specified, stop tuning after this number of trials if results aren't improving. desired_layout : str, optional Can be one of "NCHW" or "NHWC". When specified, compatible operations in the graph will have their layout set to this format. Tasks will then be tuned using this specified layout. timeout : int, optional, If a kernel trial lasts longer than this duration in seconds, it will be considered a failure. repeat : int, optional How many times each measurement should be repeated. number : int, optional The number of runs a single repeat is made of. parallel : int, optional The maximum number of parallel devices to use when tuning. hardware_params : auto_scheduler.HardwareParams, optional When using the autoscheduler, this object defines the configuration of the target hardware. include_simple_tasks : bool, optional Whether to extract simple operations or only computationally intensive ones when using the autoscheduler. log_estimated_latency : bool, optional If using the autoscheduler, write the estimated latency at each step of tuning to file. additional_target_options: Optional[Dict[str, Dict[str, Any]]] Additional target options in a dictionary to combine with initial Target arguments Returns ------- tuning_records : str The path to the produced tuning log file. 
""" target, extra_targets = target_from_cli(target, additional_target_options) target, target_host = Target.check_and_update_host_consist( target, target_host) # TODO(jwfromm) Remove this deepcopy once AlterOpLayout bug that mutates source # model is fixed. For now, creating a clone avoids the issue. mod = deepcopy(tvmc_model.mod) params = tvmc_model.params if tuning_records is None: tuning_records = tvmc_model.default_tuning_records_path() for codegen_from_cli in extra_targets: codegen = composite_target.get_codegen_by_target( codegen_from_cli["name"]) partition_function = codegen["pass_pipeline"] mod = partition_function(mod, params, **codegen_from_cli["opts"]) # min_repeat_ms should be: # a. the value provided by the user, if any, or # b. 0ms in case target is "cpu"; otherwise 1000ms if min_repeat_ms is None: min_repeat_ms = 0 if target.keys[0] == "cpu" else 1000 logger.info("Default --min-repeat-ms for this target is %s", min_repeat_ms) if rpc_key: if hostname is None or port is None: raise TVMCException( "You must provide a hostname and port to connect to a remote RPC device." ) if isinstance(port, str): port = int(port) logger.info("Tuning will be performed on device %s at %s:%d.", rpc_key, hostname, port) runner_ctor = auto_scheduler.RPCRunner if enable_autoscheduler else autotvm.RPCRunner runner = runner_ctor( key=rpc_key, host=hostname, port=port, number=number, repeat=repeat, n_parallel=parallel, timeout=timeout, min_repeat_ms=min_repeat_ms, ) else: logger.info("Starting localhost tuning.") runner_ctor = (auto_scheduler.LocalRPCMeasureContext if enable_autoscheduler else autotvm.LocalRunner) local_server = runner_ctor( number=number, repeat=repeat, timeout=timeout, min_repeat_ms=min_repeat_ms, ) # For autoscheduling on some devices, we need to maintain a LocalRPCMeasureContext object. if enable_autoscheduler: runner = local_server.runner else: runner = local_server if enable_autoscheduler: tasks, weights = autoscheduler_get_tuning_tasks( mod=mod, params=params, target=target, alter_layout=desired_layout, hardware_params=hardware_params, include_simple_tasks=include_simple_tasks, ) # Create the autoscheduler tuning options tuning_options = auto_scheduler.TuningOptions( num_measure_trials=trials, measure_callbacks=[auto_scheduler.RecordToFile(tuning_records)], runner=runner, early_stopping=early_stopping, ) logger.info("Autoscheduling with configuration: %s", tuning_options) # Schedule the tasks (i.e., produce a schedule for each task) schedule_tasks(tasks, weights, tuning_options, prior_records, log_estimated_latency) else: tasks = autotvm_get_tuning_tasks( mod=mod, params=params, target=target, alter_layout=desired_layout, ) # In autotvm, trials is specified per task. We can convert the per-model input # provided to per-task trials by dividing by the number of tasks. trials = int(trials / len(tasks)) logger.info("Autotuning with %d trials per task.", trials) tuning_options = { "tuner": tuner, "trials": trials, "early_stopping": early_stopping, "measure_option": autotvm.measure_option( builder=autotvm.LocalBuilder(build_func="default"), runner=runner), "tuning_records": prior_records, } logger.info("Autotuning with configuration: %s", tuning_options) tune_tasks(tasks, tuning_records, **tuning_options) return tuning_records
def _build_for_device(input_mod, target, target_host): """Build the lowered functions for a device with the given compilation target. Parameters ---------- input_mod : IRModule The schedule to be built. target : str or :any:`tvm.target.Target` The target and option of the compilation. target_host : str or :any:`tvm.target.Target` The host compilation target. Returns ------- fhost : IRModule The host IRModule. mdev : tvm.module A module that contains device code. """ target = Target(target) target_host = Target(target_host) device_type = ndarray.context(target.kind.name, 0).device_type mod_mixed = input_mod mod_mixed = tvm.tir.transform.Apply( lambda f: f.with_attr("target", target))(mod_mixed) opt_mixed = [tvm.tir.transform.VerifyMemory()] if len(mod_mixed.functions) == 1: opt_mixed += [ tvm.tir.transform.Apply( lambda f: f.with_attr("tir.is_entry_func", True)) ] if PassContext.current().config.get("tir.detect_global_barrier", False): opt_mixed += [tvm.tir.transform.ThreadSync("global")] opt_mixed += [ tvm.tir.transform.ThreadSync("shared"), tvm.tir.transform.ThreadSync("warp"), tvm.tir.transform.InferFragment(), tvm.tir.transform.LowerThreadAllreduce(), tvm.tir.transform.MakePackedAPI(), tvm.tir.transform.SplitHostDevice(), ] mod_mixed = tvm.transform.Sequential(opt_mixed)(mod_mixed) # device optimizations opt_device = tvm.transform.Sequential([ tvm.tir.transform.Filter( lambda f: "calling_conv" in f.attrs and f.attrs[ "calling_conv"].value == CallingConv.DEVICE_KERNEL_LAUNCH), tvm.tir.transform.LowerWarpMemory(), tvm.tir.transform.Simplify(), tvm.tir.transform.LowerDeviceStorageAccessInfo(), tvm.tir.transform.LowerCustomDatatypes(), tvm.tir.transform.LowerIntrin(), ]) mod_dev = opt_device(mod_mixed) # host optimizations opt_host = tvm.transform.Sequential([ tvm.tir.transform.Filter( lambda f: "calling_conv" not in f.attrs or f.attrs[ "calling_conv"].value != CallingConv.DEVICE_KERNEL_LAUNCH), tvm.tir.transform.Apply(lambda f: f.with_attr("target", target_host)), tvm.tir.transform.LowerTVMBuiltin(), tvm.tir.transform.LowerDeviceStorageAccessInfo(), tvm.tir.transform.LowerCustomDatatypes(), tvm.tir.transform.LowerIntrin(), tvm.tir.transform.CombineContextCall(), ]) mod_host = opt_host(mod_mixed) if device_type == ndarray.cpu(0).device_type and target_host == target: assert len(mod_dev.functions) == 0 if "gpu" in target.keys and len(mod_dev.functions) == 0: warnings.warn( "Specified target %s, but cannot find device code, did you do " "bind?" % target) rt_mod_dev = codegen.build_module( mod_dev, target) if len(mod_dev.functions) != 0 else None return mod_host, rt_mod_dev
def build( self, mod, target=None, target_host=None, executor=Executor("graph"), runtime=Runtime("cpp"), workspace_memory_pools=None, params=None, mod_name=None, ): """ Parameters ---------- mod : :py:class:`~tvm.IRModule` The IRModule to build. target : str, :any:`tvm.target.Target`, or dict of str(i.e. device/context name) to str/tvm.target.Target, optional For heterogeneous compilation, it is a dictionary indicating context to target mapping. For homogeneous compilation, it is a build target. target_host : str or :any:`tvm.target.Target`, optional Host compilation target, if target is device. When TVM compiles device specific program such as CUDA, we also need host(CPU) side code to interact with the driver to setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, otherwise a stackvm interpreter is used. executor : Optional[Executor] The executor configuration with which to build the model. Defaults to "graph" if no executor specified. runtime : Optional[Runtime] Runtime configuration to use when building the model. Defaults to "cpp" if no runtime specified. workspace_memory_pools : Optional[WorkspaceMemoryPools] The object that contains an Array of PoolInfo objects that hold properties of workspace pools that could be used by the inference. params : dict of str to NDArray Input parameters to the graph that do not change during inference time. Used for constant folding. mod_name: Optional[str] The module name we will build Returns ------- graph_json : str The json string that can be accepted by graph executor. mod : tvm.Module The module containing necessary libraries. params : dict The parameters of the final graph. """ if target_host is not None: warnings.warn( "target_host parameter is going to be deprecated. " "Please pass in tvm.target.Target(target, host=target_host) instead." ) target = build_target_by_device_type_map(target) target, target_host = Target.check_and_update_host_consist( target, target_host, target_is_dict_key=False ) # Setup the params. if params: self._set_params(params) # Build the IR module. If auto_scheduler is not enabled, # then use the TOPI-defined schedule. use_auto_scheduler = PassContext.current().config.get( "relay.backend.use_auto_scheduler", False ) # Turn off AutoTVM config not found warnings if auto_scheduler is enabled. old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler mod_name = mangle_module_name(mod_name) self._build(mod, target, target_host, executor, runtime, workspace_memory_pools, mod_name) autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent # Get artifacts mod = self.get_module() params = self.get_params() executor_config = self.get_graph_json() if str(executor) == "graph" else None return executor_config, mod, params
def build(inputs, args=None, target=None, target_host=None, name="default_function", binds=None): """Build a function with arguments as signature. Code will be generated for devices coupled with target information. Parameters ---------- inputs : tvm.te.Schedule, IRModule, or dict of target to IRModule The schedule to be built args : list of Buffer or Tensor or Var, optional The argument lists to the function. target : str or :any:`tvm.target.Target`, optional The target and option of the compilation. target_host : str or :any:`tvm.target.Target` optional Host compilation target, if target is device. When TVM compiles device specific program such as CUDA, we also need host(CPU) side code to interact with the driver setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, otherwise a stackvm intepreter is used. name : str, optional The name of result function. binds : dict, optional Dictionary that maps the binding of symbolic buffer to Tensor. By default, a new buffer is created for each tensor in the argument. Returns ------- ret : tvm.module A module that combines both host and device code. Examples ________ There are two typical example uses of this function depending on the type of the argument `inputs`: 1. it is an IRModule. .. code-block:: python n = 2 A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.te.create_schedule(C.op) m = tvm.lower(s, [A, B, C], name="test_add") rt_mod = tvm.build(m, target="llvm") 2. it is a dict of compilation target to IRModule. .. code-block:: python n = 2 A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s1 = tvm.te.create_schedule(C.op) with tvm.target.cuda() as cuda_tgt: s2 = topi.cuda.schedule_injective(cuda_tgt, [C]) m1 = tvm.lower(s1, [A, B, C], name="test_add1") m2 = tvm.lower(s2, [A, B, C], name="test_add2") rt_mod = tvm.build({"llvm": m1, "cuda": m2}, target_host="llvm") Note ---- See the note on :any:`tvm.target` on target string format. 
""" if isinstance(inputs, schedule.Schedule): if args is None: raise ValueError("args must be given for build from schedule") input_mod = lower(inputs, args, name=name, binds=binds) elif isinstance(inputs, (list, tuple, container.Array)): merged_mod = tvm.IRModule({}) for x in inputs: merged_mod.update(x) input_mod = merged_mod elif isinstance(inputs, tvm.IRModule): input_mod = inputs elif not isinstance(inputs, (dict, container.Map)): raise ValueError( f"Inputs must be Schedule, IRModule or dict of target to IRModule, " f"but got {type(inputs)}.") if not isinstance(inputs, (dict, container.Map)): target = Target.current() if target is None else target target = target if target else "llvm" target_input_mod = {target: input_mod} else: target_input_mod = inputs for tar, mod in target_input_mod.items(): if not isinstance(tar, (str, Target)): raise ValueError("The key of inputs must be str or " "Target when inputs is dict.") if not isinstance(mod, tvm.IRModule): raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.") if not target_host: for tar, _ in target_input_mod.items(): tar = Target(tar) device_type = ndarray.context(tar.kind.name, 0).device_type if device_type == ndarray.cpu(0).device_type: target_host = tar break if not target_host: target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm" mod_host_all = tvm.IRModule({}) device_modules = [] for tar, input_mod in target_input_mod.items(): mod_host, mdev = _build_for_device(input_mod, tar, target_host) mod_host_all.update(mod_host) device_modules.append(mdev) # Generate a unified host module. rt_mod_host = codegen.build_module(mod_host_all, target_host) # Import all modules. for mdev in device_modules: if mdev: rt_mod_host.import_module(mdev) if not isinstance(target_host, Target): target_host = Target(target_host) if (target_host.attrs.get("runtime", tvm.runtime.String("c++")) == "c" and target_host.attrs.get("system-lib", 0).value == 1): if target_host.kind.name == "c": create_csource_crt_metadata_module = tvm._ffi.get_global_func( "runtime.CreateCSourceCrtMetadataModule") return create_csource_crt_metadata_module([rt_mod_host], target_host) if target_host.kind.name == "llvm": create_llvm_crt_metadata_module = tvm._ffi.get_global_func( "runtime.CreateLLVMCrtMetadataModule") return create_llvm_crt_metadata_module([rt_mod_host], target_host) return rt_mod_host
def build( inputs: Union[schedule.Schedule, PrimFunc, IRModule, Mapping[str, IRModule]], args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None, target: Optional[Union[str, Target]] = None, target_host: Optional[Union[str, Target]] = None, name: Optional[str] = "default_function", binds: Optional[Mapping[tensor.Tensor, Buffer]] = None, ): """Build a function with arguments as signature. Code will be generated for devices coupled with target information. Parameters ---------- inputs : Union[tvm.te.schedule.Schedule, tvm.tir.PrimFunc, IRModule, Mapping[str, IRModule]] The input to be built args : Optional[List[Union[tvm.tir.Buffer, tensor.Tensor, Var]]] The argument lists to the function. target : Optional[Union[str, Target]] The target and option of the compilation. target_host : Optional[Union[str, Target]] Host compilation target, if target is device. When TVM compiles device specific program such as CUDA, we also need host(CPU) side code to interact with the driver setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, otherwise a stackvm interpreter is used. name : Optional[str] The name of result function. binds : Optional[Mapping[tensor.Tensor, tvm.tir.Buffer]] Dictionary that maps the binding of symbolic buffer to Tensor. By default, a new buffer is created for each tensor in the argument. Returns ------- ret : tvm.module A module that combines both host and device code. Examples ________ There are two typical example uses of this function depending on the type of the argument `inputs`: 1. it is an IRModule. .. code-block:: python n = 2 A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.te.create_schedule(C.op) m = tvm.lower(s, [A, B, C], name="test_add") rt_mod = tvm.build(m, target="llvm") 2. it is a dict of compilation target to IRModule. .. code-block:: python n = 2 A = te.placeholder((n,), name='A') B = te.placeholder((n,), name='B') C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s1 = tvm.te.create_schedule(C.op) with tvm.target.cuda() as cuda_tgt: s2 = topi.cuda.schedule_injective(cuda_tgt, [C]) m1 = tvm.lower(s1, [A, B, C], name="test_add1") m2 = tvm.lower(s2, [A, B, C], name="test_add2") rt_mod = tvm.build({"llvm": m1, "cuda": m2}) Note ---- See the note on :any:`tvm.target` on target string format. """ if isinstance(inputs, schedule.Schedule): if args is None: raise ValueError("args must be given for build from schedule") input_mod = lower(inputs, args, name=name, binds=binds) elif isinstance(inputs, (list, tuple, container.Array)): merged_mod = tvm.IRModule({}) for x in inputs: merged_mod.update(lower(x)) input_mod = merged_mod elif isinstance(inputs, (tvm.IRModule, PrimFunc)): input_mod = lower(inputs) elif not isinstance(inputs, (dict, container.Map)): raise ValueError( f"Inputs must be Schedule, IRModule or dict of target to IRModule, " f"but got {type(inputs)}.") if target_host is not None: warnings.warn( "target_host parameter is going to be deprecated. " "Please pass in tvm.target.Target(target, host=target_host) instead." 
) if not isinstance(inputs, (dict, container.Map)): target = Target.current() if target is None else target target = target if target else "llvm" target_input_mod = {target: input_mod} else: target_input_mod = inputs for tar, mod in target_input_mod.items(): if not isinstance(tar, (str, Target)): raise ValueError("The key of inputs must be str or " "Target when inputs is dict.") if not isinstance(mod, tvm.IRModule): raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.") target_input_mod, target_host = Target.check_and_update_host_consist( target_input_mod, target_host) if not target_host: for tar, mod in target_input_mod.items(): tar = Target(tar) device_type = ndarray.device(tar.kind.name, 0).device_type if device_type == ndarray.cpu(0).device_type: target_host = tar break if not target_host: target_host = "llvm" if tvm.runtime.enabled("llvm") else "stackvm" target_input_mod, target_host = Target.check_and_update_host_consist( target_input_mod, target_host) rt_mod_host = _driver_ffi.preprocess_module(target_input_mod, target_host) target_input_mod, target_host = Target.check_and_update_host_consist( target_input_mod, target_host) if not isinstance(target_host, Target): target_host = Target(target_host) if (target_host.attrs.get("runtime", tvm.runtime.String("c++")) == "c" and target_host.attrs.get("system-lib", 0) == 1): if target_host.kind.name == "c": create_csource_crt_metadata_module = tvm._ffi.get_global_func( "runtime.CreateCSourceCrtMetadataModule") to_return = create_csource_crt_metadata_module([rt_mod_host], target_host) elif target_host.kind.name == "llvm": create_llvm_crt_metadata_module = tvm._ffi.get_global_func( "runtime.CreateLLVMCrtMetadataModule") to_return = create_llvm_crt_metadata_module([rt_mod_host], target_host) else: to_return = rt_mod_host return OperatorModule.from_module(to_return, ir_module_by_target=target_input_mod, name=name)
def measure_candidates(database, builder, runner): """Send the candidates to builder and runner for distributed measurement, and save the results in a new json database. Parameters ---------- database : JSONDatabase The database for candidates to be measured. builder : Builder The builder for building the candidates. runner : Runner The runner for measuring the candidates. Returns ------- None """ candidates, runner_results, build_fail_indices, run_fail_indices = [], [], [], [] context = ms.TuneContext(target=Target(args.target)) tuning_records = database.get_all_tuning_records() for record in tuning_records: candidates.append(record.as_measure_candidate()) with ms.Profiler() as profiler: for idx in range(0, len(candidates), args.batch_size): batch_candidates = candidates[idx:idx + args.batch_size] context._set_measure_candidates(batch_candidates) # pylint: disable=protected-access with ms.Profiler.timeit("build"): context._send_to_builder(builder) # pylint: disable=protected-access with ms.Profiler.timeit("run"): context._send_to_runner(runner) # pylint: disable=protected-access batch_runner_results = context._join() # pylint: disable=protected-access runner_results.extend(batch_runner_results) for i, result in enumerate(context.builder_results): if result.error_msg is None: ms.utils.remove_build_dir(result.artifact_path) else: build_fail_indices.append(i + idx) context._clear_measure_state() # pylint: disable=protected-access model_name, workload_name = database.path_workload.split("/")[-2:] record_name = database.path_tuning_record.split("/")[-1] new_database = ms.database.JSONDatabase( path_workload=os.path.join(args.result_cache_dir, model_name, workload_name), path_tuning_record=os.path.join(args.result_cache_dir, model_name, record_name), ) workload = tuning_records[0].workload new_database.commit_workload(workload.mod) for i, (record, result) in enumerate(zip(tuning_records, runner_results)): if result.error_msg is None: new_database.commit_tuning_record( ms.database.TuningRecord( trace=record.trace, workload=workload, run_secs=[v.value for v in result.run_secs], target=Target(args.target), )) else: run_fail_indices.append(i) fail_indices_name = workload_name.replace("_workload.json", "_failed_indices.txt") with open(os.path.join(args.result_cache_dir, model_name, fail_indices_name), "w", encoding="utf8") as file: file.write(" ".join([str(n) for n in run_fail_indices])) print( f"Builder time: {profiler.get()['build']}, Runner time: {profiler.get()['run']}\n\ Failed number of builds: {len(build_fail_indices)},\ Failed number of runs: {len(run_fail_indices)}")
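# A hedged usage sketch for measure_candidates(); the database paths are
# placeholders, and the surrounding script's global `args` namespace (target,
# batch_size, result_cache_dir) is assumed to have been parsed before this call.
from tvm import meta_schedule as ms

def _measure_sketch():
    database = ms.database.JSONDatabase(
        path_workload="model/database_workload.json",
        path_tuning_record="model/database_tuning_record.json",
    )
    builder = ms.builder.LocalBuilder()
    runner = ms.runner.LocalRunner()
    measure_candidates(database, builder, runner)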