def is_auto_scheduler_enabled():
    """Return whether the auto-scheduler is enabled.

    Returns
    -------
    enabled : bool
        Whether the auto-scheduler is enabled
    """
    return PassContext.current().config.get(
        "relay.backend.use_auto_scheduler",
        False,
    ) or PassContext.current().config.get(
        "relay.backend.use_meta_schedule",
        False,
    )
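# Usage sketch (illustrative, not part of the original module): the flag is
# read from the innermost PassContext, so the helper must be called while
# that context is active.
from tvm.ir.transform import PassContext

assert not is_auto_scheduler_enabled()  # default config
with PassContext(config={"relay.backend.use_auto_scheduler": True}):
    assert is_auto_scheduler_enabled()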
def default_build(
    mod: IRModule, target: Target, _params: Optional[Dict[str, NDArray]]
) -> Module:
    """Default build function.

    Parameters
    ----------
    mod : IRModule
        The IRModule to be built.
    target : Target
        The target to be built.
    _params : Optional[Dict[str, NDArray]]
        The parameters to be used for the build. Must be None.

    Returns
    -------
    rt_mod : Module
        The built Module.
    """
    # pylint: disable=import-outside-toplevel
    from tvm.driver import build as tvm_build
    from tvm.ir.transform import PassContext

    # pylint: enable=import-outside-toplevel
    with PassContext(disabled_pass=["tir.CommonSubexprElimTIR"]):
        return tvm_build(mod, target=target)
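# A minimal usage sketch for default_build (illustrative): lower a trivial TE
# workload to an IRModule first, then build it for a local LLVM target, which
# is assumed to be enabled in this TVM build.
import tvm
from tvm import te

A = te.placeholder((128,), name="A")
B = te.compute((128,), lambda i: A[i] + 1.0, name="B")
ir_mod = tvm.lower(te.create_schedule(B.op), [A, B])
rt_mod = default_build(ir_mod, tvm.target.Target("llvm"), None)  # _params must be None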
def transform_module(self, mod, _):
    """Invokes the pass"""
    # TODO(@jroesch): Is there a way to do one shot initialization?
    # can we have def pass_init?
    mod.import_from_std("core.rly")
    mod = InferType()(mod)
    assert isinstance(self.targets, (dict, container.Map))
    if len(self.targets) > 1:
        pass_ctx = PassContext.current()
        if "relay.fallback_device_type" in pass_ctx.config:
            fallback_ctx = nd.context(pass_ctx.config["relay.fallback_device_type"])
        else:
            fallback_ctx = cpu(0)
        ca = context_analysis(mod, TVMContext(fallback_ctx.device_type, 0))
    else:
        if isinstance(self.targets, dict):
            dev = list(self.targets.keys())[0]
        else:
            dev, _ = self.targets.items()[0]
        ca = context_analysis(mod, nd.context(dev.value))

    # The following code can be used for debugging the module after
    # annotation.
    # print(mod.astext(show_meta_data=False, annotate=mk_analysis_annotator(ca)))

    gv_funcs = mod.functions
    for gv, f in gv_funcs.items():
        ea = ManifestAllocPass(self.target_host, ca)
        f = ea.visit(f)
        mod.update_func(gv, f)
    return mod
def build(self, mod, target=None, target_host=None, params=None):
    """
    Parameters
    ----------
    mod : :py:class:`~tvm.IRModule`
        The IRModule to build.

    target : str, :any:`tvm.target.Target`, or dict of str (i.e.
        device/context name) to str/tvm.target.Target, optional
        For heterogeneous compilation, it is a dictionary indicating context
        to target mapping. For homogeneous compilation, it is a build target.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host (CPU) side code to interact with the driver
        to set up the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time. Used for constant folding.

    Returns
    -------
    graph_json : str
        The json string that can be accepted by graph runtime.

    mod : tvm.Module
        The module containing necessary libraries.

    params : dict
        The parameters of the final graph.
    """
    target = _update_target(target)

    # Setup the params.
    if params:
        self._set_params(params)

    # Build the IR module. If auto_scheduler is not enabled,
    # then use the TOPI-defined schedule.
    use_auto_scheduler = PassContext.current().config.get(
        "relay.backend.use_auto_scheduler", False)

    # Turn off AutoTVM config-not-found warnings if auto_scheduler is enabled.
    old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
    autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler

    self._build(mod, target, target_host)
    autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

    # Get artifacts
    graph_json = self.get_json()
    mod = self.get_module()
    params = self.get_params()

    return graph_json, mod, params
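# Illustrative end-to-end call through the public relay.build entry point,
# which drives BuildModule.build internally; a minimal one-op network is
# assumed. Note: depending on the TVM version, relay.build may return a
# factory module rather than the (graph_json, lib, params) tuple unpacked here.
import tvm
from tvm import relay

x = relay.var("x", shape=(1, 4), dtype="float32")
net = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))
graph_json, lib, graph_params = relay.build(net, target="llvm")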
def build_tir_func(func):
    """Build a TIR PrimFunc into a runnable module."""
    func = func.with_attr("global_symbol", "main")
    pass_ctx = PassContext.current()
    if pass_ctx.config.get("tir.noalias", True):
        func = func.with_attr("tir.noalias", True)
    mod = tvm.IRModule({"main": func})
    func = tvm.build(mod)
    return func
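# Illustrative use of build_tir_func on a PrimFunc derived from a small TE
# computation; te.create_prim_func availability is assumed for this TVM version.
import numpy as np
import tvm
from tvm import te

A = te.placeholder((16,), name="A")
B = te.compute((16,), lambda i: A[i] + 1.0, name="B")
built = build_tir_func(te.create_prim_func([A, B]))

a = tvm.nd.array(np.arange(16, dtype="float32"))
b = tvm.nd.array(np.zeros(16, dtype="float32"))
built(a, b)  # b now holds a + 1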
def build_config(opt_level=2,
                 fallback_device=_nd.cpu(),
                 required_pass=None,
                 disabled_pass=None,
                 trace=None):
    """Configure the build behavior by setting config variables.

    Parameters
    ----------
    opt_level: int, optional
        Optimization level. The optimization pass name and level are as the
        following:

        .. code-block:: python

            OPT_PASS_LEVEL = {
                "SimplifyInference": 0,
                "OpFusion": 1,
                "FoldConstant": 2,
                "FoldScaleAxis": 3,
                "AlterOpLayout": 3,
                "CanonicalizeOps": 3,
                "CanonicalizeCast": 3,
                "EliminateCommonSubexpr": 3,
                "CombineParallelConv2D": 4,
                "CombineParallelDense": 4,
                "FastMath": 4
            }

    fallback_device : int, str, or tvmContext, optional
        The fallback device. It is also used as the default device for
        operators without specified device during heterogeneous execution.

    required_pass: set of str, optional
        Optimization passes that are required regardless of optimization level.

    disabled_pass: set of str, optional
        Optimization passes to be disabled during optimization.

    trace: Callable[[IRModule, PassInfo, bool], None]
        A tracing function for debugging or introspection.

    Returns
    -------
    pass_context: PassContext
        The pass context for optimizations.
    """
    return PassContext(opt_level, fallback_device, required_pass, disabled_pass, trace)
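# Usage sketch: build_config returns a PassContext, so it is typically used
# as a context manager around relay.build; `mod` is assumed to be a Relay
# IRModule defined elsewhere.
with build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
    graph_json, lib, params = relay.build(mod, target="llvm")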
def tune_relay_auto(
    mod: IRModule,
    target: Union[str, Target],
    config: TuneConfig,
    work_dir: str,
    backend: str = "graph",
    params: Optional[Dict[str, NDArray]] = None,
) -> Union[Module, vm.Executable]:
    """A wrapper of `tune_relay`, but providing a default setting for the config.

    Parameters
    ----------
    mod : IRModule
        The module to tune.
    target : Union[str, Target]
        The target to tune for.
    config : TuneConfig
        The search strategy config.
    work_dir : Optional[str]
        The working directory to save intermediate results.
    backend : str = "graph"
        The backend to use for relay compilation (graph / vm).
    params : Optional[Dict[str, tvm.runtime.NDArray]]
        The associated parameters of the program.

    Returns
    -------
    lib : Union[Module, tvm.runtime.vm.Executable]
        The built runtime module or vm Executable for the given relay workload.
    """
    target = default_config.target(target)
    extracted_tasks = extract_task_from_relay(mod, target, params)
    if config is None:
        config = TuneConfig(
            num_trials_per_iter=16,
            max_trials_global=16 * len(extracted_tasks),
        )
    database = tune_extracted_tasks(extracted_tasks, config, work_dir)
    relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend]
    with target, autotvm_silencer(), ApplyHistoryBest(database):
        with PassContext(
            opt_level=3,
            config={
                "relay.backend.use_meta_schedule": True,
                "relay.backend.use_meta_schedule_dispatch": target.kind.name != "cuda",
            },
        ):
            return relay_build(mod, target=target, params=params)
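# Hedged example: passing config=None lets the wrapper size the search from
# the number of extracted tasks. `mod` and `params` are assumed inputs from a
# frontend importer; the target string is illustrative.
lib = tune_relay_auto(
    mod=mod,
    target="llvm --num-cores=4",
    config=None,
    work_dir="./ms_work_dir",
    backend="graph",
    params=params,
)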
def __call__(self, measure_input, tmp_dir, **kwargs):
    instrument = PassInstrumentChecker()
    mocked_pass_checker = GPUVerifyPassMocked()
    with mocked_pass_checker:
        with PassContext(instruments=[instrument]):
            regular_result = super().__call__(measure_input, tmp_dir, **kwargs)

            # Check that the instrument has been run, meaning the context
            # was inherited by the builder.
            assert instrument.has_been_run

            # Also check that the GPU verification pass has been run
            # (which was not in the inherited ctx).
            assert mocked_pass_checker.has_been_run

            return regular_result
def tune_each_task(
    mod,
    target,
    config,
    runner,
    work_dir,
    params,
):
    extracted_tasks = ms.extract_task_from_relay(mod, target, params)
    database = ms.database.JSONDatabase(
        path_workload=os.path.join(work_dir, "default_database_workload.json"),
        path_tuning_record=os.path.join(work_dir, "default_database_tuning_record.json"),
    )
    for task in extracted_tasks:
        # pylint: disable=protected-access
        tune_context = ms.tune.Parse._tune_context(
            tune_context=None,
            mod=ms.tune.Parse._mod(task.dispatched[0]),
            target=target,
            config=config,
            task_name=task.task_name,
            space_generator=None,
            sch_rules=None,
            postprocs=None,
            mutator_probs=None,
            num_threads=os.cpu_count(),
        )
        task_scheduler = ms.tune.Parse._task_scheduler(
            None,
            [tune_context],
            task_weights=[1.0],
            builder=ms.tune.Parse._builder(None),
            runner=ms.tune.Parse._runner(runner),
            database=database,
            max_trials=config.max_trials_per_task,
            cost_model=ms.tune.Parse._cost_model(None),
            measure_callbacks=ms.tune.Parse._callbacks(None),
        )
        # pylint: enable=protected-access
        task_scheduler.tune()
    with target, ms.ApplyHistoryBest(database):
        with PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            return relay_build(mod, target=target, params=params)
def _generate_codegen_args(parser, codegen_name):
    codegen = get_codegen_by_target(codegen_name)
    pass_configs = PassContext.list_configs()

    if codegen["config_key"] is not None and codegen["config_key"] in pass_configs:
        target_group = parser.add_argument_group(f"target {codegen_name}")
        attrs = make_node(pass_configs[codegen["config_key"]]["type"])
        fields = attrs_api.AttrsListFieldInfo(attrs)
        for field in fields:
            for tvm_type, python_type in INTERNAL_TO_NATIVE_TYPE.items():
                if field.type_info.startswith(tvm_type):
                    target_option = field.name
                    target_group.add_argument(
                        f"--target-{codegen_name}-{target_option}",
                        type=python_type,
                        help=f"target {codegen_name} {target_option}{python_type}",
                    )
def _reconstruct_codegen_args(args, codegen_name):
    codegen = get_codegen_by_target(codegen_name)
    pass_configs = PassContext.list_configs()
    codegen_options = {}

    if codegen["config_key"] is not None and codegen["config_key"] in pass_configs:
        attrs = make_node(pass_configs[codegen["config_key"]]["type"])
        fields = attrs_api.AttrsListFieldInfo(attrs)
        for field in fields:
            for tvm_type in INTERNAL_TO_NATIVE_TYPE:
                if field.type_info.startswith(tvm_type):
                    target_option = field.name
                    var_name = (
                        f"target_{codegen_name.replace('-', '_')}_{target_option.replace('-', '_')}"
                    )
                    option_value = getattr(args, var_name)
                    if option_value is not None:
                        codegen_options[target_option] = option_value
    return codegen_options
def build_relay_with_tensorrt(
    mod: "IRModule",
    target: "Target",
    params: Dict[str, "NDArray"],
) -> "Module":
    """Build a Relay IRModule with TensorRT BYOC.

    Parameters
    ----------
    mod : IRModule
        The Relay IRModule to build.
    target : Target
        The target to build the module for.
    params : Dict[str, NDArray]
        The parameter dict to build the module with.

    Returns
    -------
    mod : runtime.Module
        The built module.
    """
    # pylint: disable=import-outside-toplevel
    from tvm.ir.transform import PassContext
    from tvm.relay.build_module import _build_module_no_factory as relay_build
    from tvm.relay.op.contrib import tensorrt
    from tvm.runtime import Module

    # pylint: enable=import-outside-toplevel
    mod, config = tensorrt.partition_for_tensorrt(mod, params)
    with PassContext(
        opt_level=3,
        config={"relay.ext.tensorrt.options": config},
    ):
        result = relay_build(mod, target=target, target_host=None, params=params)
    assert isinstance(result, Module)
    return result
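# Usage sketch: requires a TensorRT-enabled TVM build; `mod` and `params` are
# assumed to come from a frontend importer such as relay.frontend.from_onnx.
from tvm.target import Target

rt_mod = build_relay_with_tensorrt(mod, Target("cuda"), params)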
def form_irmodule(sch, args, name, binds):
    """According to the given schedule, form a function.

    Parameters
    ----------
    sch : tvm.te.schedule.Schedule
        The given schedule to form the raw body

    args : list of Buffer or Tensor or Var
        The argument lists to the function.

    name : str
        The name of result function.

    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
        The binds information

    Returns
    -------
    mod : IRModule
        The IRModule formed according to the given schedule.
    """
    # normalize schedule first
    pass_ctx = PassContext.current()
    sch = sch.normalize()
    bounds = schedule.InferBound(sch)
    stmt = schedule.ScheduleOps(sch, bounds)

    compact = schedule.VerifyCompactBuffer(stmt)
    binds, arg_list = get_binds(args, compact, binds)

    stmt = schedule.SchedulePostProcRewriteForTensorCore(stmt, sch, binds)
    func = schedule.SchedulePostProcToPrimFunc(arg_list, stmt, binds)

    func = func.with_attr("global_symbol", name)
    if pass_ctx.config.get("tir.noalias", True):
        func = func.with_attr("tir.noalias", True)
    return tvm.IRModule({name: func})
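# Usage sketch: forming an IRModule from a TE schedule, mirroring what the
# legacy Phase 0 of tvm.lower does; the workload is illustrative.
from tvm import te

A = te.placeholder((32,), name="A")
B = te.compute((32,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
ir_mod = form_irmodule(s, [A, B], name="main", binds=None)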
def partition_for_cutlass(mod, params=None):
    """Partition the input module into CUTLASS-supported subgraphs."""
    if params is not None:
        mod["main"] = bind_params_by_name(mod["main"], params)

    remove_bn_pass = Sequential(
        [
            transform.InferType(),
            transform.SimplifyInference(),
            transform.FoldConstant(),
            transform.FoldScaleAxis(),
        ]
    )
    with PassContext(opt_level=3):
        mod = remove_bn_pass(mod)

    cutlass_patterns = relay.op.contrib.get_pattern_table("cutlass")
    seq = Sequential(
        [
            transform.InferType(),
            transform.MergeComposite(cutlass_patterns),
            transform.AnnotateTarget(["cutlass"], include_non_call_ops=False),
            transform.PartitionGraph(bind_constants=False),
        ]
    )
    return seq(mod)
def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Do not support dilation for now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda",
        )
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
        judge_winograd_tensorcore, _, _ = judge_winograd(
            N,
            H,
            W,
            alpha,
            alpha,
            CI,
            CO,
            padding,
            stride_h,
            stride_w,
            dilation_h,
            dilation_w,
            data.dtype,
            kernel.dtype,
            pre_flag=True,
        )
        if (
            target.kind.name == "cuda"
            and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
            and judge_winograd_tensorcore
        ):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                name="conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_direct_without_weight_transform
                ),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_direct_without_weight_transform
                ),
                name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )
        if PassContext.current().config.get("relay.backend.use_auto_scheduler", False):
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform),
                naive_schedule,  # this implementation should never be picked by autotvm
                name="conv2d_nhwc_winograd_without_weight_transform",
                plevel=15,
            )
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
        )
    return strategy
def _build_for_device(input_mod, target, target_host):
    """Build the lowered functions for a device with the given compilation
    target.

    Parameters
    ----------
    input_mod : IRModule
        The schedule to be built.

    target : str or :any:`tvm.target.Target`
        The target and option of the compilation.

    target_host : str or :any:`tvm.target.Target`
        The host compilation target.

    Returns
    -------
    fhost : IRModule
        The host IRModule.

    mdev : tvm.module
        A module that contains device code.
    """
    target, target_host = Target.check_and_update_host_consist(target, target_host)
    device_type = ndarray.device(target.kind.name, 0).device_type

    mod_mixed = input_mod
    mod_mixed = tvm.tir.transform.Apply(lambda f: f.with_attr("target", target))(mod_mixed)

    opt_mixed = [tvm.tir.transform.VerifyMemory()]
    if len(mod_mixed.functions) == 1:
        opt_mixed += [tvm.tir.transform.Apply(lambda f: f.with_attr("tir.is_entry_func", True))]

    if PassContext.current().config.get("tir.detect_global_barrier", False):
        opt_mixed += [tvm.tir.transform.ThreadSync("global")]
    opt_mixed += [
        tvm.tir.transform.ThreadSync("shared"),
        tvm.tir.transform.ThreadSync("warp"),
        tvm.tir.transform.InferFragment(),
        tvm.tir.transform.LowerThreadAllreduce(),
        tvm.tir.transform.MakePackedAPI(),
        tvm.tir.transform.SplitHostDevice(),
    ]
    mod_mixed = tvm.transform.Sequential(opt_mixed)(mod_mixed)

    # device optimizations
    opt_device = tvm.transform.Sequential(
        [
            tvm.tir.transform.Filter(
                lambda f: "calling_conv" in f.attrs
                and f.attrs["calling_conv"].value == CallingConv.DEVICE_KERNEL_LAUNCH
            ),
            tvm.tir.transform.LowerWarpMemory(),
            tvm.tir.transform.Simplify(),
            tvm.tir.transform.LowerDeviceStorageAccessInfo(),
            tvm.tir.transform.LowerCustomDatatypes(),
            tvm.tir.transform.LowerIntrin(),
        ]
    )
    mod_dev = opt_device(mod_mixed)

    # host optimizations
    opt_host = tvm.transform.Sequential(
        [
            tvm.tir.transform.Filter(
                lambda f: "calling_conv" not in f.attrs
                or f.attrs["calling_conv"].value != CallingConv.DEVICE_KERNEL_LAUNCH
            ),
            tvm.tir.transform.Apply(lambda f: f.with_attr("target", target_host)),
            tvm.tir.transform.LowerTVMBuiltin(),
            tvm.tir.transform.LowerDeviceStorageAccessInfo(),
            tvm.tir.transform.LowerCustomDatatypes(),
            tvm.tir.transform.LowerIntrin(),
            tvm.tir.transform.CombineContextCall(),
        ]
    )
    mod_host = opt_host(mod_mixed)

    if device_type == ndarray.cpu(0).device_type and target_host == target:
        assert len(mod_dev.functions) == 0
    if "gpu" in target.keys and len(mod_dev.functions) == 0:
        warnings.warn(
            "Specified target %s, but cannot find device code, did you do bind?" % target
        )

    rt_mod_dev = codegen.build_module(mod_dev, target) if len(mod_dev.functions) != 0 else None
    return mod_host, rt_mod_dev
def lower(sch, args, name="main", binds=None, simple_mode=False):
    """Lowering step before build into target.

    Parameters
    ----------
    sch : tvm.te.schedule.Schedule
        The schedule to be built

    args : list of Buffer or Tensor or Var
        The argument lists to the function.

    name : str, optional
        The name of result function.

    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
        Dictionary that maps the Tensor to Buffer which specified the data layout
        requirement of the function. By default, a new compact buffer is created
        for each tensor in the argument.

    simple_mode : bool, optional
        Whether to only output a simple and compact statement; this will skip
        LoopPartition, api wrapper generation and Unrolling.

    Returns
    -------
    m : IRModule or Stmt
        The result IRModule if simple_mode=False;
        otherwise the Stmt before make api is returned.
    """
    # config setup
    pass_ctx = PassContext.current()
    instrument_bound_checkers = bool(pass_ctx.config.get("tir.instrument_bound_checkers", False))
    disable_vectorize = bool(pass_ctx.config.get("tir.disable_vectorize", False))
    add_lower_pass = pass_ctx.config.get("tir.add_lower_pass", [])

    lower_phase0 = [x[1] for x in add_lower_pass if x[0] == 0]
    lower_phase1 = [x[1] for x in add_lower_pass if x[0] == 1]
    lower_phase2 = [x[1] for x in add_lower_pass if x[0] == 2]
    lower_phase3 = [x[1] for x in add_lower_pass if x[0] > 2]

    # Phase 0
    if isinstance(sch, schedule.Schedule):
        mod = form_irmodule(sch, args, name, binds)
    else:
        mod = sch

    pass_list = lower_phase0

    # Phase 1
    pass_list += [
        tvm.tir.transform.InjectPrefetch(),
        tvm.tir.transform.StorageFlatten(64, instrument_bound_checkers),
        tvm.tir.transform.BF16Legalize(),
        tvm.tir.transform.NarrowDataType(32),
        tvm.tir.transform.Simplify(),
    ]
    pass_list += lower_phase1

    # Phase 2
    if not simple_mode:
        pass_list += [(tvm.tir.transform.LoopPartition())]

    pass_list += [
        tvm.tir.transform.VectorizeLoop(not disable_vectorize),
        tvm.tir.transform.InjectVirtualThread(),
        tvm.tir.transform.InjectDoubleBuffer(),
        tvm.tir.transform.StorageRewrite(),
        tvm.tir.transform.UnrollLoop(),
    ]
    pass_list += lower_phase2

    # Phase 3
    pass_list += [
        tvm.tir.transform.Simplify(),
        tvm.tir.transform.RemoveNoOp(),
    ]

    pass_list += [tvm.tir.transform.RewriteUnsafeSelect()]
    pass_list += [tvm.tir.transform.HoistIfThenElse()]
    pass_list += lower_phase3

    # Instrument BoundCheckers
    if instrument_bound_checkers:
        pass_list += [tvm.tir.transform.InstrumentBoundCheckers()]

    optimize = tvm.transform.Sequential(pass_list)
    mod = optimize(mod)
    return mod
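# Hedged sketch: injecting a custom phase-1 pass through the
# "tir.add_lower_pass" config that lower() reads from the current PassContext.
# The printing pass below is purely illustrative.
import tvm
from tvm import te


@tvm.tir.transform.prim_func_pass(opt_level=0)
def _print_pass(func, mod, ctx):
    print(func)  # inspect the PrimFunc between phase 1 and phase 2
    return func


A = te.placeholder((8,), name="A")
B = te.compute((8,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
with tvm.transform.PassContext(config={"tir.add_lower_pass": [(1, _print_pass)]}):
    ir_mod = lower(s, [A, B], name="main")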
def tune_relay(
    mod: IRModule,
    target: Union[str, Target],
    config: TuneConfig,
    work_dir: str,
    *,
    params: Optional[Dict[str, NDArray]] = None,
    builder: Optional[Builder] = None,
    runner: Optional[Runner] = None,
    database: Optional[Database] = None,
    cost_model: Optional[CostModel] = None,
    measure_callbacks: Optional[List[MeasureCallback]] = None,
    space: Optional[FnSpaceGenerator] = None,
    sch_rules: Optional[FnScheduleRule] = None,
    postprocs: Optional[FnPostproc] = None,
    mutator_probs: Optional[FnMutatorProb] = None,
    num_threads: Optional[int] = None,
) -> Module:
    """Tune a Relay IRModule with a given target.

    Parameters
    ----------
    mod : IRModule
        The module to tune.
    target : Union[str, Target]
        The target to tune for.
    config : TuneConfig
        The search strategy config.
    work_dir : Optional[str]
        The working directory to save intermediate results.
    params : Optional[Dict[str, tvm.runtime.NDArray]]
        The associated parameters of the program.
    builder : Optional[Builder]
        The builder to use.
    runner : Optional[Runner]
        The runner to use.
    database : Optional[Database]
        The database to use.
    cost_model : Optional[CostModel]
        The cost model to use.
    measure_callbacks : Optional[List[MeasureCallback]]
        The callbacks used during tuning.
    space : Optional[FnSpaceGenerator]
        The space generator to use.
    sch_rules : Optional[FnScheduleRule]
        The schedule rules to use.
    postprocs : Optional[FnPostproc]
        The postprocessors to use.
    mutator_probs : Optional[FnMutatorProb]
        The mutator probabilities to use.
    num_threads : Optional[int]
        The number of threads to use.

    Returns
    -------
    lib : Module
        The built runtime module for the given relay workload.
    """
    # pylint: disable=import-outside-toplevel
    from tvm.relay import build as relay_build

    from .relay_integration import extract_task_from_relay

    # pylint: enable=import-outside-toplevel
    target = Parse._target(target)  # pylint: disable=protected-access
    # parse the tuning contexts
    extracted_tasks = extract_task_from_relay(mod, target, params)
    database = tune_extracted_tasks(
        extracted_tasks,
        config,
        work_dir,
        builder=builder,
        runner=runner,
        database=database,
        cost_model=cost_model,
        measure_callbacks=measure_callbacks,
        space=space,
        sch_rules=sch_rules,
        postprocs=postprocs,
        mutator_probs=mutator_probs,
        num_threads=num_threads,
    )
    with target, autotvm_silencer(), ApplyHistoryBest(database):
        with PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            return relay_build(mod, target=target, params=params)
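# Hedged usage sketch; the TuneConfig fields shown follow this revision's
# meta schedule API and may differ in other versions. `mod` and `params` are
# assumed inputs, and the target tag is illustrative.
lib = tune_relay(
    mod=mod,
    target="nvidia/geforce-rtx-3070",
    config=TuneConfig(
        strategy="evolutionary",
        num_trials_per_iter=32,
        max_trials_per_task=128,
        max_trials_global=1024,
    ),
    work_dir="./ms_work_dir",
    params=params,
)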
def lower(
    inputs: Union[schedule.Schedule, PrimFunc, IRModule],
    args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None,
    name: str = "main",
    binds: Optional[Mapping[tensor.Tensor, Buffer]] = None,
    simple_mode: bool = False,
) -> IRModule:
    """Lowering step before build into target.

    Parameters
    ----------
    inputs : Union[schedule.Schedule, PrimFunc, IRModule]
        The TE schedule or TensorIR PrimFunc/IRModule to be built

    args : Optional[List[Union[Buffer, tensor.Tensor, Var]]]
        The argument lists to the function for a TE schedule.
        It should be None if we want to lower TensorIR.

    name : str
        The name of the result function.

    binds : Optional[Mapping[tensor.Tensor, Buffer]]
        Dictionary that maps the Tensor to Buffer which specified the data layout
        requirement of the function. By default, a new compact buffer is created
        for each tensor in the argument.

    simple_mode : bool
        Whether to only output a simple and compact statement; this will skip
        LoopPartition, api wrapper generation and Unrolling.

    Returns
    -------
    m : IRModule
        The result IRModule
    """
    # config setup
    pass_ctx = PassContext.current()
    instrument_bound_checkers = bool(pass_ctx.config.get("tir.instrument_bound_checkers", False))
    disable_vectorize = bool(pass_ctx.config.get("tir.disable_vectorize", False))
    add_lower_pass = pass_ctx.config.get("tir.add_lower_pass", [])

    lower_phase0 = [x[1] for x in add_lower_pass if x[0] == 0]
    lower_phase1 = [x[1] for x in add_lower_pass if x[0] == 1]
    lower_phase2 = [x[1] for x in add_lower_pass if x[0] == 2]
    lower_phase3 = [x[1] for x in add_lower_pass if x[0] > 2]

    # Phase 0
    pass_list = lower_phase0
    is_legacy_te_schedule: bool = False

    if isinstance(inputs, schedule.Schedule):
        if args is None:
            raise ValueError("args must be given for lowering from TE schedule")
        mod = form_irmodule(inputs, args, name, binds)
        is_legacy_te_schedule = True
    elif isinstance(inputs, PrimFunc):
        func = inputs.with_attr("global_symbol", name)
        if pass_ctx.config.get("tir.noalias", True):
            func = func.with_attr("tir.noalias", True)
        mod = tvm.IRModule({name: func})
    elif isinstance(inputs, IRModule):
        mod = inputs
    else:
        raise TypeError(
            f"tvm.lower expected te.Schedule, PrimFunc or IRModule, but got {type(inputs)}"
        )

    # Phase 1
    if is_legacy_te_schedule:
        pass_list += [
            tvm.tir.transform.InjectPrefetch(),
            tvm.tir.transform.StorageFlatten(64, instrument_bound_checkers),
        ]
    else:
        pass_list += [
            tvm.tir.transform.LowerInitBlock(),
            tvm.tir.transform.PlanAndUpdateBufferAllocationLocation(),
            tvm.tir.transform.ConvertBlocksToOpaque(),
            tvm.tir.transform.CompactBufferAllocation(),
            tvm.tir.transform.FlattenBuffer(),
        ]
    pass_list += [
        tvm.tir.transform.BF16Legalize(),
        tvm.tir.transform.NarrowDataType(32),
        tvm.tir.transform.Simplify(),
    ]
    pass_list += lower_phase1

    # Phase 2
    if not simple_mode:
        pass_list += [(tvm.tir.transform.LoopPartition())]

    pass_list += [
        tvm.tir.transform.VectorizeLoop(not disable_vectorize),
        tvm.tir.transform.InjectVirtualThread(),
        tvm.tir.transform.InjectDoubleBuffer(),
        tvm.tir.transform.StorageRewrite(),
        tvm.tir.transform.UnrollLoop(),
    ]
    pass_list += lower_phase2

    # Phase 3
    pass_list += [
        tvm.tir.transform.Simplify(),
        tvm.tir.transform.RemoveNoOp(),
    ]

    pass_list += [tvm.tir.transform.RewriteUnsafeSelect()]
    pass_list += [tvm.tir.transform.HoistIfThenElse()]
    pass_list += lower_phase3

    # Instrument BoundCheckers
    if instrument_bound_checkers:
        pass_list += [tvm.tir.transform.InstrumentBoundCheckers()]

    optimize = tvm.transform.Sequential(pass_list)
    mod = optimize(mod)
    return mod
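# Sketch of the PrimFunc input path: `args` must stay None when lowering a
# PrimFunc directly; te.create_prim_func is used here to obtain one
# (availability assumed for this TVM version).
import tvm
from tvm import te

A = te.placeholder((16,), name="A")
B = te.compute((16,), lambda i: A[i] * 2.0, name="B")
ir_mod = lower(te.create_prim_func([A, B]), name="main")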
def build(self, mod, target=None, target_host=None, params=None, executor="graph", mod_name=None):
    """
    Parameters
    ----------
    mod : :py:class:`~tvm.IRModule`
        The IRModule to build.

    target : str, :any:`tvm.target.Target`, or dict of str (i.e.
        device/context name) to str/tvm.target.Target, optional
        For heterogeneous compilation, it is a dictionary indicating context
        to target mapping. For homogeneous compilation, it is a build target.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host (CPU) side code to interact with the driver
        to set up the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time. Used for constant folding.

    executor : Optional[str]
        The type of executor to be used in order to run the model:

        - If "graph" is specified, then the graph_executor will be used
        - If "aot" is specified, then the aot_executor will be used

    mod_name : Optional[str]
        The module name we will build

    Returns
    -------
    graph_json : str
        The json string that can be accepted by graph executor.

    mod : tvm.Module
        The module containing necessary libraries.

    params : dict
        The parameters of the final graph.
    """
    target = build_target_by_device_type_map(target)
    target, target_host = Target.check_and_update_host_consist(
        target, target_host, target_is_dict_key=False
    )

    # Setup the params.
    if params:
        self._set_params(params)

    # Build the IR module. If auto_scheduler is not enabled,
    # then use the TOPI-defined schedule.
    use_auto_scheduler = PassContext.current().config.get(
        "relay.backend.use_auto_scheduler", False)

    # Turn off AutoTVM config-not-found warnings if auto_scheduler is enabled.
    old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
    autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler

    mod_name = mangle_module_name(mod_name)

    self._build(mod, target, target_host, executor, mod_name)
    autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

    # Get artifacts
    mod = self.get_module()
    params = self.get_params()
    executor_config = self.get_graph_json() if executor == "graph" else None

    return executor_config, mod, params
def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True):
    """Select the best implementation from the op strategy.

    If use_autotvm is True, it'll first try to find the best implementation
    based on AutoTVM profile results. If no AutoTVM profile result is found,
    it'll choose the implementation with highest plevel.

    If use_autotvm is False, it'll directly choose the implementation with
    highest plevel.

    Note that this function doesn't support op with symbolic input shapes.

    Parameters
    ----------
    op : tvm.ir.Op
        Relay operator.

    attrs : object
        The op attribute.

    inputs : List[tvm.te.Tensor]
        Input tensors to the op.

    out_type : relay.Type
        The output type.

    target : tvm.target.Target
        The target to compile the op.

    use_autotvm : bool
        Whether to query AutoTVM to pick the best implementation.

    Returns
    -------
    ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor])
        The best op implementation and the corresponding output tensors.
    """
    all_impls = get_valid_implementations(op, attrs, inputs, out_type, target)
    best_plevel_impl = max(all_impls, key=lambda x: x.plevel)

    # Disable autotvm if auto_scheduler is enabled.
    # (i.e., always return the implementation with the highest priority for auto-scheduler).
    if PassContext.current().config.get("relay.backend.use_auto_scheduler", False):
        use_autotvm = False

    # If not using autotvm, always return the implementation with the highest priority
    if not use_autotvm:
        logger.info(
            "Using %s for %s based on highest priority (%d)",
            best_plevel_impl.name,
            op.name,
            best_plevel_impl.plevel,
        )
        outs = best_plevel_impl.compute(attrs, inputs, out_type)
        return best_plevel_impl, outs

    # Otherwise, try autotvm templates
    outputs = {}
    workloads = {}
    best_autotvm_impl = None
    best_cfg = None
    dispatch_ctx = autotvm.task.DispatchContext.current
    old_silent = autotvm.GLOBAL_SCOPE.silent
    autotvm.GLOBAL_SCOPE.silent = True
    for impl in all_impls:
        outs = impl.compute(attrs, inputs, out_type)
        outputs[impl] = outs
        workload = autotvm.task.get_workload(outs)
        workloads[impl] = workload
        if workload is None:
            # Not an AutoTVM tunable implementation
            continue

        cfg = dispatch_ctx.query(target, workload)
        if cfg.is_fallback:
            # Skip fallback config
            continue

        logger.info("Implementation %s for %s has cost %.2e", impl.name, op.name, cfg.cost)
        if best_cfg is None or best_cfg.cost > cfg.cost:
            best_autotvm_impl = impl
            best_cfg = cfg
    autotvm.GLOBAL_SCOPE.silent = old_silent

    if best_autotvm_impl:
        # The best autotvm implementation definitely doesn't use fallback config
        logger.info(
            "Using %s for %s based on lowest cost (%.2e)",
            best_autotvm_impl.name,
            op.name,
            best_cfg.cost,
        )
        return best_autotvm_impl, outputs[best_autotvm_impl]

    # Use the implementation with highest plevel
    if workloads[best_plevel_impl] is not None:
        msg = (
            "Cannot find tuning records for:\n target=%s\n key=%s\n"
            "TVM will apply a default schedule which may negatively impact performance."
            % (target, workloads[best_plevel_impl])
        )
        if (
            not autotvm.env.GLOBAL_SCOPE.silent
            and msg not in autotvm.task.DispatchContext.warning_messages
        ):
            autotvm.task.DispatchContext.warning_messages.add(msg)
            global _first_warning
            if _first_warning:
                _first_warning = False
                info_msg = (
                    "One or more operators have not been tuned. Please tune your model "
                    "for better performance. Use DEBUG logging level to see more details."
                )
                autotvm_logger.warning(info_msg)
            autotvm_logger.debug(msg)

    logger.info(
        "Using %s for %s based on highest priority (%s)",
        best_plevel_impl.name,
        op.name,
        best_plevel_impl.plevel,
    )
    return best_plevel_impl, outputs[best_plevel_impl]
def build(
    self,
    mod,
    target=None,
    target_host=None,
    executor=Executor("graph"),
    runtime=Runtime("cpp"),
    workspace_memory_pools=None,
    params=None,
    mod_name=None,
):
    """
    Parameters
    ----------
    mod : :py:class:`~tvm.IRModule`
        The IRModule to build.

    target : str, :any:`tvm.target.Target`, or dict of str (i.e.
        device/context name) to str/tvm.target.Target, optional
        For heterogeneous compilation, it is a dictionary indicating context
        to target mapping. For homogeneous compilation, it is a build target.

    target_host : str or :any:`tvm.target.Target`, optional
        Host compilation target, if target is device.
        When TVM compiles device specific program such as CUDA,
        we also need host (CPU) side code to interact with the driver
        to set up the dimensions and parameters correctly.
        target_host is used to specify the host side codegen target.
        By default, llvm is used if it is enabled,
        otherwise a stackvm interpreter is used.

    executor : Optional[Executor]
        The executor configuration with which to build the model.
        Defaults to "graph" if no executor specified.

    runtime : Optional[Runtime]
        Runtime configuration to use when building the model.
        Defaults to "cpp" if no runtime specified.

    workspace_memory_pools : Optional[WorkspaceMemoryPools]
        The object that contains an Array of PoolInfo objects
        that hold properties of workspace pools that could be
        used by the inference.

    params : dict of str to NDArray
        Input parameters to the graph that do not change
        during inference time. Used for constant folding.

    mod_name : Optional[str]
        The module name we will build

    Returns
    -------
    graph_json : str
        The json string that can be accepted by graph executor.

    mod : tvm.Module
        The module containing necessary libraries.

    params : dict
        The parameters of the final graph.
    """
    if target_host is not None:
        warnings.warn(
            "target_host parameter is going to be deprecated. "
            "Please pass in tvm.target.Target(target, host=target_host) instead."
        )

    target = build_target_by_device_type_map(target)
    target, target_host = Target.check_and_update_host_consist(
        target, target_host, target_is_dict_key=False
    )

    # Setup the params.
    if params:
        self._set_params(params)

    # Build the IR module. If auto_scheduler is not enabled,
    # then use the TOPI-defined schedule.
    use_auto_scheduler = PassContext.current().config.get(
        "relay.backend.use_auto_scheduler", False)

    # Turn off AutoTVM config-not-found warnings if auto_scheduler is enabled.
    old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
    autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler or old_autotvm_silent

    mod_name = mangle_module_name(mod_name)

    self._build(mod, target, target_host, executor, runtime, workspace_memory_pools, mod_name)
    autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

    # Get artifacts
    mod = self.get_module()
    params = self.get_params()
    executor_config = self.get_graph_json() if str(executor) == "graph" else None

    return executor_config, mod, params
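# Illustrative call through the public relay.build wrapper, which forwards to
# this method; Executor and Runtime come from tvm.relay.backend. `mod` and
# `params` are assumed inputs.
from tvm import relay
from tvm.relay.backend import Executor, Runtime

lib = relay.build(
    mod,
    target="llvm",
    executor=Executor("graph"),
    runtime=Runtime("cpp"),
    params=params,
    mod_name="default",
)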
def partition_for_cutlass(mod, params=None):
    """Partition the input module into CUTLASS-supported subgraphs."""
    dense_pat = ("cutlass.dense", make_gemm_pattern(False, None), check_gemm)
    dense_bias_pat = ("cutlass.dense_bias", make_gemm_pattern(True, None), check_gemm)
    dense_bias_relu_pat = ("cutlass.dense_bias_relu", make_gemm_pattern(True, "relu"), check_gemm)
    dense_bias_gelu_fp16_pat = (
        "cutlass.dense_bias_gelu_fp16",
        make_gemm_pattern(True, "gelu"),
        check_gemm,
    )
    dense_bias_gelu_fp32_pat = (
        "cutlass.dense_bias_gelu_fp32",
        make_gemm_pattern(True, "gelu", out_dtype="float32"),
        check_gemm,
    )

    dense_patterns = [
        dense_bias_gelu_fp16_pat,
        dense_bias_gelu_fp32_pat,
        dense_bias_relu_pat,
        dense_bias_pat,
        dense_pat,
        ("cutlass.batch_matmul", make_batch_matmul_pattern(), check_batch_matmul),
    ]

    conv2d_patterns = [
        (
            "cutlass.conv2d_bias_hardswish",
            make_conv2d_pattern(with_bias=True, with_act="hardswish"),
            check_conv2d,
        ),
        (
            "cutlass.conv2d_bias_silu",
            make_conv2d_pattern(with_bias=True, with_act="silu"),
            check_conv2d,
        ),
        (
            "cutlass.conv2d_bias_relu",
            make_conv2d_pattern(with_bias=True, with_act="relu"),
            check_conv2d,
        ),
        (
            "cutlass.conv2d_bias_sigmoid",
            make_conv2d_pattern(with_bias=True, with_act="sigmoid"),
            check_conv2d,
        ),
        ("cutlass.conv2d_bias", make_conv2d_pattern(with_bias=True), check_conv2d),
        ("cutlass.conv2d", make_conv2d_pattern(), check_conv2d),
    ]

    residual_block_patterns = []
    for with_act, postfix in [("relu", "_relu"), (None, "")]:
        for name, pat, _ in conv2d_patterns[:-1]:
            for bin_op in ["add", "multiply"]:
                residual_block_patterns.append(
                    (
                        name + "_residual_" + bin_op + postfix,
                        make_residual_block_pattern(pat, bin_op, with_act=with_act),
                        partial(check_conv2d_residual, binary_op=bin_op),
                    )
                )

    cutlass_patterns = residual_block_patterns + dense_patterns + conv2d_patterns

    if params is not None:
        mod["main"] = bind_params_by_name(mod["main"], params)

    remove_bn_pass = Sequential(
        [
            transform.InferType(),
            transform.SimplifyInference(),
            transform.FoldConstant(),
            transform.FoldScaleAxis(),
        ]
    )
    with PassContext(opt_level=3):
        mod = remove_bn_pass(mod)

    seq = Sequential(
        [
            transform.InferType(),
            transform.MergeComposite(cutlass_patterns),
            transform.AnnotateTarget(["cutlass"], include_non_call_ops=False),
            transform.PartitionGraph(bind_constants=False),
        ]
    )

    return seq(mod)
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ("int8", "uint8") and kernel.dtype in ("int8", "uint8"):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda",
                )
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if (
                (2 < kh < 8 and 2 < kw < 8 and kh == kw)
                and (stride_h == 1 and stride_w == 1)
                and (dilation_h == 1 and dilation_w == 1)
            ):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5,
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda",
            )
            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            (
                judge_winograd_tensorcore,
                judge_winograd_autotvm,
                judge_winograd_auto_scheduler,
            ) = judge_winograd(
                N,
                H,
                W,
                KH,
                KW,
                CI,
                CO,
                padding,
                stride_h,
                stride_w,
                dilation_h,
                dilation_w,
                data.dtype,
                kernel.dtype,
                pre_flag=False,
            )
            if judge_winograd_autotvm:
                if (
                    target.kind.name == "cuda"
                    and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                    and judge_winograd_tensorcore
                ):
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5,
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5,
                    )
            if (
                target.kind.name == "cuda"
                and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                and (
                    (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
                    or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
                    or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
                )
            ):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore),
                    name="conv2d_nhwc_tensorcore.cuda",
                    plevel=20,
                )

            # register auto-scheduler implementations
            use_auto_scheduler = PassContext.current().config.get(
                "relay.backend.use_auto_scheduler", False
            )
            if use_auto_scheduler and judge_winograd_auto_scheduler:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
                    naive_schedule,  # this implementation should never be picked by autotvm
                    name="conv2d_nhwc.winograd",
                    plevel=15,
                )
        elif layout == "HWNC":
            assert kernel_layout in ["HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"]
            _, _, N, in_channels = get_const_tuple(data.shape)
            pre_computed = len(kernel.shape) == 6
            if pre_computed:
                _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(kernel.shape)
                out_channels = oc_chunk * oc_block_factor
            else:
                _, _, out_channels, _ = get_const_tuple(kernel.shape)

            tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"]
            if (
                (N % 16 == 0 and in_channels % 16 == 0 and out_channels % 16 == 0)
                or (N % 8 == 0 and in_channels % 16 == 0 and out_channels % 32 == 0)
                or (N % 32 == 0 and in_channels % 16 == 0 and out_channels % 8 == 0)
                and (data.dtype in tensorcore_dtypes and kernel.dtype in tensorcore_dtypes)
            ):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_hwnc_tensorcore),
                    name="conv2d_hwnc_tensorcore_direct.cuda",
                    plevel=20,
                )
            else:
                raise RuntimeError(
                    "Unsupported shape for conv2d HWNC. Need to satisfy tensor core schedule."
                )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda",
            )
        else:
            raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout))
        # add cudnn implementation
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(
                        topi.cuda.conv2d_cudnn, need_data_layout=True, has_groups=True
                    ),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )
        else:
            raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(
                        topi.cuda.conv2d_cudnn, need_data_layout=True, has_groups=True
                    ),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
                cudnn_impl = True

        if layout == "NCHW":
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda",
            )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda",
            )
        elif not cudnn_impl:
            raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
    return strategy
def partition_for_cutlass(mod, params=None):
    """Partition the input module into CUTLASS-supported subgraphs."""
    dense_pat = ("cutlass.dense", make_gemm_pattern(False, None), check_gemm)
    dense_bias_pat = ("cutlass.dense_bias", make_gemm_pattern(True, None), check_gemm)
    dense_bias_relu_pat = ("cutlass.dense_bias_relu", make_gemm_pattern(True, "relu"), check_gemm)
    dense_bias_gelu_fp16_pat = (
        "cutlass.dense_bias_gelu_fp16",
        make_gemm_pattern(True, "gelu"),
        check_gemm,
    )
    dense_bias_gelu_fp32_pat = (
        "cutlass.dense_bias_gelu_fp32",
        make_gemm_pattern(True, "gelu", out_dtype="float32"),
        check_gemm,
    )
    cutlass_patterns = [
        dense_bias_gelu_fp16_pat,
        dense_bias_gelu_fp32_pat,
        dense_bias_relu_pat,
        dense_bias_pat,
        dense_pat,
        ("cutlass.batch_matmul", make_batch_matmul_pattern(), check_batch_matmul),
        (
            "cutlass.conv2d_bias_hardswish",
            make_conv2d_pattern(with_bias=True, with_act="hardswish"),
            check_conv2d,
        ),
        (
            "cutlass.conv2d_bias_silu",
            make_conv2d_pattern(with_bias=True, with_act="silu"),
            check_conv2d,
        ),
        (
            "cutlass.conv2d_bias_relu",
            make_conv2d_pattern(with_bias=True, with_act="relu"),
            check_conv2d,
        ),
        (
            "cutlass.conv2d_bias_sigmoid",
            make_conv2d_pattern(with_bias=True, with_act="sigmoid"),
            check_conv2d,
        ),
        ("cutlass.conv2d_bias", make_conv2d_pattern(with_bias=True), check_conv2d),
        ("cutlass.conv2d", make_conv2d_pattern(), check_conv2d),
    ]

    if params is not None:
        mod["main"] = bind_params_by_name(mod["main"], params)

    remove_bn_pass = Sequential(
        [
            transform.InferType(),
            transform.SimplifyInference(),
            transform.FoldConstant(),
            transform.FoldScaleAxis(),
        ]
    )
    with PassContext(opt_level=3):
        mod = remove_bn_pass(mod)

    seq = Sequential(
        [
            transform.InferType(),
            transform.MergeComposite(cutlass_patterns),
            transform.AnnotateTarget(["cutlass"], include_non_call_ops=False),
            transform.PartitionGraph(bind_constants=False),
        ]
    )

    return seq(mod)
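# Usage sketch: partition, then compile; generating and tuning the CUTLASS
# kernels for the offloaded functions is a separate later step omitted here.
# `mod` and `params` are assumed to come from a frontend importer.
from tvm import relay

mod = partition_for_cutlass(mod, params)
with PassContext(opt_level=3):
    lib = relay.build(mod, target="cuda", params=params)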