Example #1
def is_auto_scheduler_enabled():
    """Return whether the auto-scheduler is enabled.

    Returns
    -------
    enabled : bool
        Whether the auto-scheduler is enabled.
    """
    return PassContext.current().config.get(
        "relay.backend.use_auto_scheduler",
        False,
    ) or PassContext.current().config.get(
        "relay.backend.use_meta_schedule",
        False,
    )
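As a quick check of how this helper reacts to the surrounding context, here is a minimal sketch (it assumes an importable `tvm` with both config keys registered, as in mainline TVM):

import tvm

# Outside any configured PassContext, neither key is set, so the helper returns False.
assert not is_auto_scheduler_enabled()

# Setting either key in an enclosing PassContext flips the result to True.
with tvm.transform.PassContext(config={"relay.backend.use_auto_scheduler": True}):
    assert is_auto_scheduler_enabled()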
Example #2
def default_build(mod: IRModule, target: Target,
                  _params: Optional[Dict[str, NDArray]]) -> Module:
    """Default build function.

    Parameters
    ----------
    mod : IRModule
        The IRModule to be built.
    target : Target
        The target to be built.
    _params : Optional[Dict[str, NDArray]]
        The parameters to be used for the build. Must be None.

    Returns
    -------
    rt_mod : Module
        The built Module.
    """
    # pylint: disable=import-outside-toplevel
    from tvm.driver import build as tvm_build
    from tvm.ir.transform import PassContext

    # pylint: enable=import-outside-toplevel
    with PassContext(disabled_pass=["tir.CommonSubexprElimTIR"]):
        return tvm_build(mod, target=target)
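A minimal usage sketch for the builder above (hedged: `mod` is assumed to be a TIR IRModule that tvm.build accepts, e.g. the output of tvm.lower; `_params` must be None per the docstring):

from tvm.target import Target

# `mod` is an assumed, pre-existing TIR IRModule (e.g. produced by tvm.lower).
# CommonSubexprElimTIR stays disabled inside default_build's own PassContext.
rt_mod = default_build(mod, Target("llvm"), None)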
Example #3
    def transform_module(self, mod, _):
        """Invokes the pass"""
        # TODO(@jroesch): Is there a way to do one shot initialization?
        # can we have def pass_init?
        mod.import_from_std("core.rly")
        mod = InferType()(mod)

        assert isinstance(self.targets, (dict, container.Map))
        if len(self.targets) > 1:
            pass_ctx = PassContext.current()
            if "relay.fallback_device_type" in pass_ctx.config:
                fallback_ctx = nd.context(
                    pass_ctx.config["relay.fallback_device_type"])
            else:
                fallback_ctx = cpu(0)
            ca = context_analysis(mod, TVMContext(fallback_ctx.device_type, 0))
        else:
            if isinstance(self.targets, dict):
                dev = list(self.targets.keys())[0]
            else:
                dev, _ = self.targets.items()[0]
            ca = context_analysis(mod, nd.context(dev.value))

        # The following code can be used for debugging the module after
        # annotation.
        # print(mod.astext(show_meta_data=False, annotate=mk_analysis_annotator(ca)))

        gv_funcs = mod.functions
        for gv, f in gv_funcs.items():
            ea = ManifestAllocPass(self.target_host, ca)
            f = ea.visit(f)
            mod.update_func(gv, f)
        return mod
Example #4
    def build(self, mod, target=None, target_host=None, params=None):
        """
        Parameters
        ----------
        mod : :py:class:`~tvm.IRModule`
            The IRModule to build.

        target : str, :any:`tvm.target.Target`, or dict of str(i.e.
        device/context name) to str/tvm.target.Target, optional
            For heterogeneous compilation, it is a dictionary indicating context
            to target mapping. For homogeneous compilation, it is a build target.

        target_host : str or :any:`tvm.target.Target`, optional
            Host compilation target, if target is device.
            When TVM compiles device specific program such as CUDA,
            we also need host(CPU) side code to interact with the driver
            to setup the dimensions and parameters correctly.
            target_host is used to specify the host side codegen target.
            By default, llvm is used if it is enabled,
            otherwise a stackvm interpreter is used.

        params : dict of str to NDArray
            Input parameters to the graph that do not change
            during inference time. Used for constant folding.

        Returns
        -------
        graph_json : str
            The json string that can be accepted by graph runtime.

        mod : tvm.Module
            The module containing necessary libraries.

        params : dict
            The parameters of the final graph.
        """
        target = _update_target(target)

        # Setup the params.
        if params:
            self._set_params(params)

        # Build the IR module. If auto_scheduler is not enabled,
        # then use the TOPI-defined schedule.
        use_auto_scheduler = PassContext.current().config.get(
            "relay.backend.use_auto_scheduler", False)

        # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
        old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
        autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler

        self._build(mod, target, target_host)
        autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

        # Get artifacts
        graph_json = self.get_json()
        mod = self.get_module()
        params = self.get_params()

        return graph_json, mod, params
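Callers typically reach this method through the relay build flow; a hedged sketch driving it directly (it assumes `mod` is an existing Relay IRModule and `weights` its parameter dict):

import tvm
from tvm.relay.build_module import BuildModule

# `mod` and `weights` are an assumed Relay IRModule and its parameter dict.
bld = BuildModule()
with tvm.transform.PassContext(opt_level=3):
    graph_json, lib, params = bld.build(mod, target="llvm", params=weights)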
Example #5
def build_tir_func(func):
    func = func.with_attr("global_symbol", "main")
    pass_ctx = PassContext.current()
    if pass_ctx.config.get("tir.noalias", True):
        func = func.with_attr("tir.noalias", True)
    mod = tvm.IRModule({"main": func})
    func = tvm.build(mod)
    return func
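A sketch showing how the `tir.noalias` lookup above reacts to the enclosing context (hedged: `prim_func` is an assumed, pre-existing tir.PrimFunc):

import tvm

# `prim_func` is an assumed, pre-existing tir.PrimFunc.
# With the config key set to False, the "tir.noalias" attribute is not attached.
with tvm.transform.PassContext(config={"tir.noalias": False}):
    rt_mod = build_tir_func(prim_func)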
Example #6
def build_config(opt_level=2,
                 fallback_device=_nd.cpu(),
                 required_pass=None,
                 disabled_pass=None,
                 trace=None):
    """Configure the build behavior by setting config variables.

    Parameters
    ----------
    opt_level: int, optional
        Optimization level. The optimization pass name and level are as the
        following:

        .. code-block:: python

            OPT_PASS_LEVEL = {
                "SimplifyInference": 0,
                "OpFusion": 1,
                "FoldConstant": 2,
                "FoldScaleAxis": 3,
                "AlterOpLayout": 3,
                "CanonicalizeOps": 3,
                "CanonicalizeCast": 3,
                "EliminateCommonSubexpr": 3,
                "CombineParallelConv2D": 4,
                "CombineParallelDense": 4,
                "FastMath": 4
            }

    fallback_device : int, str, or tvmContext, optional
        The fallback device. It is also used as the default device for
        operators without specified device during heterogeneous execution.

    required_pass: set of str, optional
        Optimization passes that are required regardless of optimization level.

    disabled_pass: set of str, optional
        Optimization passes to be disabled during optimization.

    trace: Callable[[IRModule, PassInfo, bool], None]
        A tracing function for debugging or introspection.

    Returns
    -------
    pass_context: PassContext
        The pass context for optimizations.
    """
    return PassContext(opt_level, fallback_device, required_pass,
                       disabled_pass, trace)
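Because the returned PassContext is a context manager, a typical (hedged) usage from that TVM era looks like the following; `mod` and `weights` stand in for an existing Relay module and its parameters:

from tvm import relay

# `mod` and `weights` are an assumed Relay module and its parameters.
with build_config(opt_level=3, disabled_pass=["AlterOpLayout"]):
    graph, lib, params = relay.build(mod, target="llvm", params=weights)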
Example #7
def tune_relay_auto(
    mod: IRModule,
    target: Union[str, Target],
    config: TuneConfig,
    work_dir: str,
    backend: str = "graph",
    params: Optional[Dict[str, NDArray]] = None,
) -> Union[Module, vm.Executable]:
    """A wrapper of `tune_relay` but provide a default setting for the config.

    Parameters
    ----------
    mod : IRModule
        The module to tune.
    target : Union[str, Target]
        The target to tune for.
    config : TuneConfig
        The search strategy config.
    params : Optional[Dict[str, tvm.runtime.NDArray]]
        The associated parameters of the program
    work_dir : Optional[str]
        The working directory to save intermediate results.
    backend : str = "graph"
        The backend to use for relay compilation (graph / vm).

    Returns
    -------
    lib : Union[Module, tvm.runtime.vm.Executable]
        The built runtime module or vm Executable for the given relay workload.
    """
    target = default_config.target(target)
    extracted_tasks = extract_task_from_relay(mod, target, params)
    if config is None:
        config = TuneConfig(
            num_trials_per_iter=16,
            max_trials_global=16 * len(extracted_tasks),
        )
    database = tune_extracted_tasks(extracted_tasks, config, work_dir)
    relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend]
    with target, autotvm_silencer(), ApplyHistoryBest(database):
        with PassContext(
            opt_level=3,
            config={
                "relay.backend.use_meta_schedule": True,
                "relay.backend.use_meta_schedule_dispatch": target.kind.name != "cuda",
            },
        ):
            return relay_build(mod, target=target, params=params)
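A usage sketch (hedged: the work directory is illustrative, and `relay_mod`/`weights` are assumed to already exist; passing config=None triggers the default TuneConfig branch above):

lib = tune_relay_auto(
    mod=relay_mod,            # `relay_mod` and `weights` are assumed to already exist
    target="llvm",
    config=None,              # falls back to the default TuneConfig above
    work_dir="./tuning_logs",
    params=weights,
)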
Example #8
        def __call__(self, measure_input, tmp_dir, **kwargs):
            instrument = PassInstrumentChecker()
            mocked_pass_checker = GPUVerifyPassMocked()
            with mocked_pass_checker:
                with PassContext(instruments=[instrument]):
                    regular_result = super().__call__(measure_input, tmp_dir,
                                                      **kwargs)

                    # Check instrument has been run, meaning context was inherited by builder
                    assert instrument.has_been_run

                    # But also check the gpu verification pass has been run
                    # (which was not in the inherited ctx)
                    assert mocked_pass_checker.has_been_run

                    return regular_result
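The PassInstrumentChecker referenced above is not shown here; a minimal sketch of such an instrument (names are illustrative) using TVM's pass_instrument decorator could look like:

from tvm.ir.instrument import pass_instrument

@pass_instrument
class PassInstrumentChecker:
    """Illustrative stand-in: records whether any pass ran under the PassContext."""

    def __init__(self):
        self.has_been_run = False

    def run_after_pass(self, mod, info):
        # Called after every pass executed inside the instrumented PassContext.
        self.has_been_run = True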
Example #9
def tune_each_task(
    mod,
    target,
    config,
    runner,
    work_dir,
    params,
):
    extracted_tasks = ms.extract_task_from_relay(mod, target, params)
    database = ms.database.JSONDatabase(
        path_workload=os.path.join(work_dir, "default_database_workload.json"),
        path_tuning_record=os.path.join(work_dir, "default_database_tuning_record.json"),
    )
    for task in extracted_tasks:
        # pylint: disable=protected-access
        tune_context = ms.tune.Parse._tune_context(
            tune_context=None,
            mod=ms.tune.Parse._mod(task.dispatched[0]),
            target=target,
            config=config,
            task_name=task.task_name,
            space_generator=None,
            sch_rules=None,
            postprocs=None,
            mutator_probs=None,
            num_threads=os.cpu_count(),
        )
        task_scheduler = ms.tune.Parse._task_scheduler(
            None,
            [tune_context],
            task_weights=[1.0],
            builder=ms.tune.Parse._builder(None),
            runner=ms.tune.Parse._runner(runner),
            database=database,
            max_trials=config.max_trials_per_task,
            cost_model=ms.tune.Parse._cost_model(None),
            measure_callbacks=ms.tune.Parse._callbacks(None),
        )
        # pylint: enable=protected-access
        task_scheduler.tune()
    with target, ms.ApplyHistoryBest(database):
        with PassContext(
            opt_level=3,
            config={"relay.backend.use_meta_schedule": True},
        ):
            return relay_build(mod, target=target, params=params)
Example #10
File: target.py  Project: wenxcs/tvm
def _generate_codegen_args(parser, codegen_name):
    codegen = get_codegen_by_target(codegen_name)
    pass_configs = PassContext.list_configs()

    if codegen["config_key"] is not None and codegen["config_key"] in pass_configs:
        target_group = parser.add_argument_group(f"target {codegen_name}")
        attrs = make_node(pass_configs[codegen["config_key"]]["type"])
        fields = attrs_api.AttrsListFieldInfo(attrs)
        for field in fields:
            for tvm_type, python_type in INTERNAL_TO_NATIVE_TYPE.items():
                if field.type_info.startswith(tvm_type):
                    target_option = field.name
                    target_group.add_argument(
                        f"--target-{codegen_name}-{target_option}",
                        type=python_type,
                        help=f"target {codegen_name} {target_option}{python_type}",
                    )
Example #11
File: target.py  Project: wenxcs/tvm
def _reconstruct_codegen_args(args, codegen_name):
    codegen = get_codegen_by_target(codegen_name)
    pass_configs = PassContext.list_configs()
    codegen_options = {}

    if codegen["config_key"] is not None and codegen["config_key"] in pass_configs:
        attrs = make_node(pass_configs[codegen["config_key"]]["type"])
        fields = attrs_api.AttrsListFieldInfo(attrs)
        for field in fields:
            for tvm_type in INTERNAL_TO_NATIVE_TYPE:
                if field.type_info.startswith(tvm_type):
                    target_option = field.name
                    var_name = (
                        f"target_{codegen_name.replace('-', '_')}_{target_option.replace('-', '_')}"
                    )
                    option_value = getattr(args, var_name)
                    if option_value is not None:
                        codegen_options[target_option] = option_value
    return codegen_options
Example #12
def build_relay_with_tensorrt(
    mod: "IRModule",
    target: "Target",
    params: Dict[str, "NDArray"],
) -> "Module":
    """Build a Relay IRModule with TensorRT BYOC

    Parameters
    ----------
    mod : IRModule
        The Relay IRModule to build.

    target : Target
        The target to build the module for.

    params : Dict[str, NDArray]
        The parameter dict to build the module with.

    Returns
    -------
    mod : runtime.Module
        The built module.
    """
    from tvm.ir.transform import PassContext
    from tvm.relay.build_module import _build_module_no_factory as relay_build
    from tvm.relay.op.contrib import tensorrt
    from tvm.runtime import Module

    mod, config = tensorrt.partition_for_tensorrt(mod, params)
    with PassContext(
            opt_level=3,
            config={"relay.ext.tensorrt.options": config},
    ):
        result = relay_build(mod,
                             target=target,
                             target_host=None,
                             params=params)
    assert isinstance(result, Module)
    return result
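A hedged invocation sketch (it assumes TVM was built with the TensorRT runtime and that `relay_mod` and `weights` already exist):

import tvm

# `relay_mod` and `weights` are assumed to already exist.
rt_mod = build_relay_with_tensorrt(relay_mod, tvm.target.Target("cuda"), weights)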
Example #13
def form_irmodule(sch, args, name, binds):
    """According to the given schedule, form a function.

    Parameters
    ----------
    sch : tvm.te.schedule.Schedule
        The given schedule to form the raw body

    args : list of Buffer or Tensor or Var
        The argument lists to the function.

    name : str
        The name of result function.

    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
        The binds information

    Returns
    -------
    The body formed according to the given schedule
    """
    # normalize schedule first
    pass_ctx = PassContext.current()
    sch = sch.normalize()
    bounds = schedule.InferBound(sch)
    stmt = schedule.ScheduleOps(sch, bounds)

    compact = schedule.VerifyCompactBuffer(stmt)
    binds, arg_list = get_binds(args, compact, binds)

    stmt = schedule.SchedulePostProcRewriteForTensorCore(stmt, sch, binds)
    func = schedule.SchedulePostProcToPrimFunc(arg_list, stmt, binds)

    func = func.with_attr("global_symbol", name)

    if pass_ctx.config.get("tir.noalias", True):
        func = func.with_attr("tir.noalias", True)
    return tvm.IRModule({name: func})
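A minimal sketch of driving it from a TE schedule, using a toy vector-add workload (binds=None lets get_binds create compact buffers):

import tvm
from tvm import te

# Toy workload: element-wise add of a constant.
A = te.placeholder((1024,), name="A")
B = te.compute((1024,), lambda i: A[i] + 1.0, name="B")
sch = te.create_schedule(B.op)

ir_mod = form_irmodule(sch, [A, B], "main", binds=None)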
Example #14
def partition_for_cutlass(mod, params=None):
    """Partition the input module into CUTLASS-supported subgraphs."""

    if params is not None:
        mod["main"] = bind_params_by_name(mod["main"], params)
        remove_bn_pass = Sequential([
            transform.InferType(),
            transform.SimplifyInference(),
            transform.FoldConstant(),
            transform.FoldScaleAxis(),
        ])
        with PassContext(opt_level=3):
            mod = remove_bn_pass(mod)

    cutlass_patterns = relay.op.contrib.get_pattern_table("cutlass")

    seq = Sequential([
        transform.InferType(),
        transform.MergeComposite(cutlass_patterns),
        transform.AnnotateTarget(["cutlass"], include_non_call_ops=False),
        transform.PartitionGraph(bind_constants=False),
    ])

    return seq(mod)
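Usage is a single call (hedged: `relay_mod` and `weights` are assumed, and the CUTLASS pattern table must have been registered by the contrib import):

# `relay_mod` and `weights` are assumed to already exist.
partitioned_mod = partition_for_cutlass(relay_mod, weights)
# Offloaded subgraphs now appear as separate functions annotated with
# Compiler="cutlass"; the main function calls into them.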
Example #15
def conv2d_winograd_without_weight_transfrom_strategy_cuda(
        attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Do not support dilate now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(
                topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(
                topi.cuda.
                schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda",
        )
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
        judge_winograd_tensorcore, _, _ = judge_winograd(
            N,
            H,
            W,
            alpha,
            alpha,
            CI,
            CO,
            padding,
            stride_h,
            stride_w,
            dilation_h,
            dilation_w,
            data.dtype,
            kernel.dtype,
            pre_flag=True,
        )
        if (target.kind.name == "cuda"
                and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                and judge_winograd_tensorcore):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.
                    conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.
                    schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                name=
                "conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.
                    conv2d_nhwc_winograd_direct_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.
                    schedule_conv2d_nhwc_winograd_direct_without_weight_transform
                ),
                name=
                "conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )

        if PassContext.current().config.get("relay.backend.use_auto_scheduler",
                                            False):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.nn.conv2d_winograd_nhwc_without_weight_transform),
                naive_schedule,  # this implementation should never be picked by autotvm
                name="conv2d_nhwc_winograd_without_weight_transform",
                plevel=15,
            )
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".
            format(layout))
    return strategy
Example #16
def _build_for_device(input_mod, target, target_host):
    """Build the lowered functions for a device with the given compilation
    target.

    Parameters
    ----------
    input_mod : IRModule
        The schedule to be built.

    target : str or :any:`tvm.target.Target`
        The target and option of the compilation.

    target_host : str or :any:`tvm.target.Target`
        The host compilation target.

    Returns
    -------
    fhost : IRModule
        The host IRModule.

    mdev : tvm.module
        A module that contains device code.
    """
    target, target_host = Target.check_and_update_host_consist(
        target, target_host)
    device_type = ndarray.device(target.kind.name, 0).device_type

    mod_mixed = input_mod
    mod_mixed = tvm.tir.transform.Apply(
        lambda f: f.with_attr("target", target))(mod_mixed)

    opt_mixed = [tvm.tir.transform.VerifyMemory()]
    if len(mod_mixed.functions) == 1:
        opt_mixed += [
            tvm.tir.transform.Apply(
                lambda f: f.with_attr("tir.is_entry_func", True))
        ]

    if PassContext.current().config.get("tir.detect_global_barrier", False):
        opt_mixed += [tvm.tir.transform.ThreadSync("global")]
    opt_mixed += [
        tvm.tir.transform.ThreadSync("shared"),
        tvm.tir.transform.ThreadSync("warp"),
        tvm.tir.transform.InferFragment(),
        tvm.tir.transform.LowerThreadAllreduce(),
        tvm.tir.transform.MakePackedAPI(),
        tvm.tir.transform.SplitHostDevice(),
    ]
    mod_mixed = tvm.transform.Sequential(opt_mixed)(mod_mixed)

    # device optimizations
    opt_device = tvm.transform.Sequential([
        tvm.tir.transform.Filter(
            lambda f: "calling_conv" in f.attrs and f.attrs[
                "calling_conv"].value == CallingConv.DEVICE_KERNEL_LAUNCH),
        tvm.tir.transform.LowerWarpMemory(),
        tvm.tir.transform.Simplify(),
        tvm.tir.transform.LowerDeviceStorageAccessInfo(),
        tvm.tir.transform.LowerCustomDatatypes(),
        tvm.tir.transform.LowerIntrin(),
    ])
    mod_dev = opt_device(mod_mixed)

    # host optimizations
    opt_host = tvm.transform.Sequential([
        tvm.tir.transform.Filter(
            lambda f: "calling_conv" not in f.attrs or f.attrs[
                "calling_conv"].value != CallingConv.DEVICE_KERNEL_LAUNCH),
        tvm.tir.transform.Apply(lambda f: f.with_attr("target", target_host)),
        tvm.tir.transform.LowerTVMBuiltin(),
        tvm.tir.transform.LowerDeviceStorageAccessInfo(),
        tvm.tir.transform.LowerCustomDatatypes(),
        tvm.tir.transform.LowerIntrin(),
        tvm.tir.transform.CombineContextCall(),
    ])
    mod_host = opt_host(mod_mixed)

    if device_type == ndarray.cpu(0).device_type and target_host == target:
        assert len(mod_dev.functions) == 0
    if "gpu" in target.keys and len(mod_dev.functions) == 0:
        warnings.warn(
            "Specified target %s, but cannot find device code, did you do "
            "bind?" % target)

    rt_mod_dev = codegen.build_module(
        mod_dev, target) if len(mod_dev.functions) != 0 else None
    return mod_host, rt_mod_dev
Example #17
def lower(sch, args, name="main", binds=None, simple_mode=False):
    """Lowering step before build into target.

    Parameters
    ----------
    sch : tvm.te.schedule.Schedule
        The schedule to be built

    args : list of Buffer or Tensor or Var
        The argument lists to the function.

    name : str, optional
        The name of result function.

    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
        Dictionary that maps the Tensor to Buffer which specified the data layout
        requirement of the function. By default, a new compact buffer is created
        for each tensor in the argument.

    simple_mode : bool, optional
        Whether to output only a simple and compact statement; this will skip
        LoopPartition, API wrapper generation and Unrolling.

    Returns
    -------
    m : IRModule or Stmt
       The result IRModule if simple_mode=False;
       otherwise the Stmt before the API wrapper is returned.
    """
    # config setup
    pass_ctx = PassContext.current()
    instrument_bound_checkers = bool(
        pass_ctx.config.get("tir.instrument_bound_checkers", False))
    disable_vectorize = bool(
        pass_ctx.config.get("tir.disable_vectorize", False))
    add_lower_pass = pass_ctx.config.get("tir.add_lower_pass", [])

    lower_phase0 = [x[1] for x in add_lower_pass if x[0] == 0]
    lower_phase1 = [x[1] for x in add_lower_pass if x[0] == 1]
    lower_phase2 = [x[1] for x in add_lower_pass if x[0] == 2]
    lower_phase3 = [x[1] for x in add_lower_pass if x[0] > 2]

    # Phase 0
    if isinstance(sch, schedule.Schedule):
        mod = form_irmodule(sch, args, name, binds)
    else:
        mod = sch

    pass_list = lower_phase0
    # Phase 1
    pass_list += [
        tvm.tir.transform.InjectPrefetch(),
        tvm.tir.transform.StorageFlatten(64, instrument_bound_checkers),
        tvm.tir.transform.BF16Legalize(),
        tvm.tir.transform.NarrowDataType(32),
        tvm.tir.transform.Simplify(),
    ]
    pass_list += lower_phase1

    # Phase 2
    if not simple_mode:
        pass_list += [(tvm.tir.transform.LoopPartition())]

    pass_list += [
        tvm.tir.transform.VectorizeLoop(not disable_vectorize),
        tvm.tir.transform.InjectVirtualThread(),
        tvm.tir.transform.InjectDoubleBuffer(),
        tvm.tir.transform.StorageRewrite(),
        tvm.tir.transform.UnrollLoop(),
    ]
    pass_list += lower_phase2

    # Phase 3
    pass_list += [
        tvm.tir.transform.Simplify(),
        tvm.tir.transform.RemoveNoOp(),
    ]

    pass_list += [tvm.tir.transform.RewriteUnsafeSelect()]
    pass_list += [tvm.tir.transform.HoistIfThenElse()]
    pass_list += lower_phase3

    # Instrument BoundCheckers
    if instrument_bound_checkers:
        pass_list += [tvm.tir.transform.InstrumentBoundCheckers()]

    optimize = tvm.transform.Sequential(pass_list)
    mod = optimize(mod)
    return mod
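The `tir.add_lower_pass` hook read at the top can be exercised with a custom phase-1 pass; a hedged sketch (assuming `sch` and `args` are a TE schedule and its tensor arguments, as in the form_irmodule sketch above):

import tvm

@tvm.tir.transform.prim_func_pass(opt_level=0)
def print_pass(func, mod, ctx):
    # Observe the PrimFunc after phase 1 and return it unchanged.
    print(func)
    return func

# `sch`/`args`: an assumed TE schedule and its tensor arguments.
with tvm.transform.PassContext(config={"tir.add_lower_pass": [(1, print_pass)]}):
    lowered_mod = lower(sch, args, name="main")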
Example #18
def tune_relay(
    mod: IRModule,
    target: Union[str, Target],
    config: TuneConfig,
    work_dir: str,
    *,
    params: Optional[Dict[str, NDArray]] = None,
    builder: Optional[Builder] = None,
    runner: Optional[Runner] = None,
    database: Optional[Database] = None,
    cost_model: Optional[CostModel] = None,
    measure_callbacks: Optional[List[MeasureCallback]] = None,
    space: Optional[FnSpaceGenerator] = None,
    sch_rules: Optional[FnScheduleRule] = None,
    postprocs: Optional[FnPostproc] = None,
    mutator_probs: Optional[FnMutatorProb] = None,
    num_threads: Optional[int] = None,
) -> Module:
    """Tune a TIR IRModule with a given target.

    Parameters
    ----------
    mod : IRModule
        The module to tune.
    target : Union[str, Target]
        The target to tune for.
    config : TuneConfig
        The search strategy config.
    params : Optional[Dict[str, tvm.runtime.NDArray]]
        The associated parameters of the program
    task_name : str
        The name of the task.
    work_dir : Optional[str]
        The working directory to save intermediate results.
    builder : Optional[Builder]
        The builder to use.
    runner : Optional[Runner]
        The runner to use.
    database : Optional[Database]
        The database to use.
    measure_callbacks : Optional[List[MeasureCallback]]
        The callbacks used during tuning.

    Returns
    -------
    lib : Module
        The built runtime module for the given relay workload.
    """
    # pylint: disable=import-outside-toplevel
    from tvm.relay import build as relay_build

    from .relay_integration import extract_task_from_relay

    # pylint: disable=protected-access, enable=import-outside-toplevel
    target = Parse._target(target)
    # pylint: enable=protected-access,
    # parse the tuning contexts
    extracted_tasks = extract_task_from_relay(mod, target, params)
    database = tune_extracted_tasks(
        extracted_tasks,
        config,
        work_dir,
        builder=builder,
        runner=runner,
        database=database,
        cost_model=cost_model,
        measure_callbacks=measure_callbacks,
        space=space,
        sch_rules=sch_rules,
        postprocs=postprocs,
        mutator_probs=mutator_probs,
        num_threads=num_threads,
    )
    with target, autotvm_silencer(), ApplyHistoryBest(database):
        with PassContext(
                opt_level=3,
                config={"relay.backend.use_meta_schedule": True},
        ):
            return relay_build(mod, target=target, params=params)
Example #19
def lower(
    inputs: Union[schedule.Schedule, PrimFunc, IRModule],
    args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None,
    name: str = "main",
    binds: Optional[Mapping[tensor.Tensor, Buffer]] = None,
    simple_mode: bool = False,
) -> IRModule:
    """Lowering step before build into target.

    Parameters
    ----------
    inputs : Union[schedule.Schedule, PrimFunc, IRModule]
        The TE schedule or TensorIR PrimFunc/IRModule to be built

    args : Optional[List[Union[Buffer, tensor.Tensor, Var]]]
        The argument lists to the function for TE schedule.
        It should be None if we want to lower TensorIR.

    name : str
        The name of result function.

    binds : Optional[Mapping[tensor.Tensor, Buffer]]
        Dictionary that maps the Tensor to Buffer which specified the data layout
        requirement of the function. By default, a new compact buffer is created
        for each tensor in the argument.

    simple_mode : bool
        Whether to output only a simple and compact statement; this will skip
        LoopPartition, API wrapper generation and Unrolling.

    Returns
    -------
    m : IRModule
       The result IRModule
    """
    # config setup
    pass_ctx = PassContext.current()
    instrument_bound_checkers = bool(
        pass_ctx.config.get("tir.instrument_bound_checkers", False))
    disable_vectorize = bool(
        pass_ctx.config.get("tir.disable_vectorize", False))
    add_lower_pass = pass_ctx.config.get("tir.add_lower_pass", [])

    lower_phase0 = [x[1] for x in add_lower_pass if x[0] == 0]
    lower_phase1 = [x[1] for x in add_lower_pass if x[0] == 1]
    lower_phase2 = [x[1] for x in add_lower_pass if x[0] == 2]
    lower_phase3 = [x[1] for x in add_lower_pass if x[0] > 2]

    # Phase 0
    pass_list = lower_phase0
    is_legacy_te_schedule: bool = False

    if isinstance(inputs, schedule.Schedule):
        if args is None:
            raise ValueError(
                "args must be given for lowering from TE schedule")
        mod = form_irmodule(inputs, args, name, binds)
        is_legacy_te_schedule = True
    elif isinstance(inputs, PrimFunc):
        func = inputs.with_attr("global_symbol", name)
        if pass_ctx.config.get("tir.noalias", True):
            func = func.with_attr("tir.noalias", True)
        mod = tvm.IRModule({name: func})
    elif isinstance(inputs, IRModule):
        mod = inputs
    else:
        raise TypeError(
            f"tvm.lower expected te.Schedule, PrimFunc or IRModule, but got {type(inputs)}"
        )

    # Phase 1
    if is_legacy_te_schedule:
        pass_list += [
            tvm.tir.transform.InjectPrefetch(),
            tvm.tir.transform.StorageFlatten(64, instrument_bound_checkers),
        ]
    else:
        pass_list += [
            tvm.tir.transform.LowerInitBlock(),
            tvm.tir.transform.PlanAndUpdateBufferAllocationLocation(),
            tvm.tir.transform.ConvertBlocksToOpaque(),
            tvm.tir.transform.CompactBufferAllocation(),
            tvm.tir.transform.FlattenBuffer(),
        ]
    pass_list += [
        tvm.tir.transform.BF16Legalize(),
        tvm.tir.transform.NarrowDataType(32),
        tvm.tir.transform.Simplify(),
    ]

    pass_list += lower_phase1

    # Phase 2
    if not simple_mode:
        pass_list += [(tvm.tir.transform.LoopPartition())]

    pass_list += [
        tvm.tir.transform.VectorizeLoop(not disable_vectorize),
        tvm.tir.transform.InjectVirtualThread(),
        tvm.tir.transform.InjectDoubleBuffer(),
        tvm.tir.transform.StorageRewrite(),
        tvm.tir.transform.UnrollLoop(),
    ]
    pass_list += lower_phase2

    # Phase 3
    pass_list += [
        tvm.tir.transform.Simplify(),
        tvm.tir.transform.RemoveNoOp(),
    ]

    pass_list += [tvm.tir.transform.RewriteUnsafeSelect()]
    pass_list += [tvm.tir.transform.HoistIfThenElse()]
    pass_list += lower_phase3

    # Instrument BoundCheckers
    if instrument_bound_checkers:
        pass_list += [tvm.tir.transform.InstrumentBoundCheckers()]

    optimize = tvm.transform.Sequential(pass_list)
    mod = optimize(mod)
    return mod
Example #20
    def build(self,
              mod,
              target=None,
              target_host=None,
              params=None,
              executor="graph",
              mod_name=None):
        """
        Parameters
        ----------
        mod : :py:class:`~tvm.IRModule`
            The IRModule to build.

        target : str, :any:`tvm.target.Target`, or dict of str(i.e.
        device/context name) to str/tvm.target.Target, optional
            For heterogeneous compilation, it is a dictionary indicating context
            to target mapping. For homogeneous compilation, it is a build target.

        target_host : str or :any:`tvm.target.Target`, optional
            Host compilation target, if target is device.
            When TVM compiles device specific program such as CUDA,
            we also need host(CPU) side code to interact with the driver
            to setup the dimensions and parameters correctly.
            target_host is used to specify the host side codegen target.
            By default, llvm is used if it is enabled,
            otherwise a stackvm interpreter is used.

        params : dict of str to NDArray
            Input parameters to the graph that do not change
            during inference time. Used for constant folding.

        executor: str[Optional]
            The type of executor to be used in order to run the model:
            - If "graph" is specified, then the graph_executor will be used
            - If "aot" is specified, then the aot_executor will be used

        mod_name: Optional[str]
            The module name we will build

        Returns
        -------
        graph_json : str
            The json string that can be accepted by graph executor.

        mod : tvm.Module
            The module containing necessary libraries.

        params : dict
            The parameters of the final graph.
        """
        target = build_target_by_device_type_map(target)
        target, target_host = Target.check_and_update_host_consist(
            target, target_host, target_is_dict_key=False)

        # Setup the params.
        if params:
            self._set_params(params)

        # Build the IR module. If auto_scheduler is not enabled,
        # then use the TOPI-defined schedule.
        use_auto_scheduler = PassContext.current().config.get(
            "relay.backend.use_auto_scheduler", False)

        # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
        old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
        autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler

        mod_name = mangle_module_name(mod_name)

        self._build(mod, target, target_host, executor, mod_name)
        autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

        # Get artifacts
        mod = self.get_module()
        params = self.get_params()
        executor_config = self.get_graph_json(
        ) if executor == "graph" else None

        return executor_config, mod, params
Example #21
def select_implementation(op,
                          attrs,
                          inputs,
                          out_type,
                          target,
                          use_autotvm=True):
    """Select the best implementation from the op strategy.

    If use_autotvm is True, it'll first try to find the best implementation
    based on AutoTVM profile results. If no AutoTVM profile result is found,
    it'll choose the implementation with highest plevel.

    If use_autotvm is False, it'll directly choose the implementation with
    highest plevel.

    Note that this function doesn't support ops with symbolic input shapes.

    Parameters
    ----------
    op : tvm.ir.Op
        Relay operator.

    attrs : object
        The op attribute.

    inputs : List[tvm.te.Tensor]
        Input tensors to the op.

    out_type : relay.Type
        The output type.

    target : tvm.target.Target
        The target to compile the op.

    use_autotvm : bool
        Whether to query AutoTVM to pick the best.

    Returns
    -------
    ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor])
        The best op implementation and the corresponding output tensors.
    """
    all_impls = get_valid_implementations(op, attrs, inputs, out_type, target)
    best_plevel_impl = max(all_impls, key=lambda x: x.plevel)

    # Disable autotvm if auto_scheduler is enabled.
    # (i.e., always return the implementation with the highest priority for auto-scheduler).
    if PassContext.current().config.get("relay.backend.use_auto_scheduler",
                                        False):
        use_autotvm = False

    # If not use autotvm, always return the implementation with the highest priority
    if not use_autotvm:
        logger.info(
            "Using %s for %s based on highest priority (%d)",
            best_plevel_impl.name,
            op.name,
            best_plevel_impl.plevel,
        )
        outs = best_plevel_impl.compute(attrs, inputs, out_type)
        return best_plevel_impl, outs

    # Otherwise, try autotvm templates
    outputs = {}
    workloads = {}
    best_autotvm_impl = None
    best_cfg = None
    dispatch_ctx = autotvm.task.DispatchContext.current
    old_silent = autotvm.GLOBAL_SCOPE.silent
    autotvm.GLOBAL_SCOPE.silent = True
    for impl in all_impls:
        outs = impl.compute(attrs, inputs, out_type)
        outputs[impl] = outs
        workload = autotvm.task.get_workload(outs)
        workloads[impl] = workload
        if workload is None:
            # Not an AutoTVM tunable implementation
            continue
        cfg = dispatch_ctx.query(target, workload)
        if cfg.is_fallback:
            # Skip fallback config
            continue
        logger.info("Implementation %s for %s has cost %.2e", impl.name,
                    op.name, cfg.cost)
        if best_cfg is None or best_cfg.cost > cfg.cost:
            best_autotvm_impl = impl
            best_cfg = cfg
    autotvm.GLOBAL_SCOPE.silent = old_silent

    if best_autotvm_impl:
        # The best autotvm implementation definitely doesn't use fallback config
        logger.info(
            "Using %s for %s based on lowest cost (%.2e)",
            best_autotvm_impl.name,
            op.name,
            best_cfg.cost,
        )
        return best_autotvm_impl, outputs[best_autotvm_impl]

    # Use the implementation with highest plevel
    if workloads[best_plevel_impl] is not None:
        msg = (
            "Cannot find tuning records for:\n    target=%s\n    key=%s\n"
            "TVM will apply a default schedule which may negatively impact performance."
            % (target, workloads[best_plevel_impl]))
        if (not autotvm.env.GLOBAL_SCOPE.silent
                and msg not in autotvm.task.DispatchContext.warning_messages):
            autotvm.task.DispatchContext.warning_messages.add(msg)
            global _first_warning
            if _first_warning:
                _first_warning = False
                info_msg = (
                    "One or more operators have not been tuned. Please tune your model "
                    "for better performance. Use DEBUG logging level to see more details."
                )
                autotvm_logger.warning(info_msg)
            autotvm_logger.debug(msg)

    logger.info(
        "Using %s for %s based on highest priority (%s)",
        best_plevel_impl.name,
        op.name,
        best_plevel_impl.plevel,
    )
    return best_plevel_impl, outputs[best_plevel_impl]
Example #22
    def build(
        self,
        mod,
        target=None,
        target_host=None,
        executor=Executor("graph"),
        runtime=Runtime("cpp"),
        workspace_memory_pools=None,
        params=None,
        mod_name=None,
    ):
        """
        Parameters
        ----------
        mod : :py:class:`~tvm.IRModule`
            The IRModule to build.

        target : str, :any:`tvm.target.Target`, or dict of str(i.e.
        device/context name) to str/tvm.target.Target, optional
            For heterogeneous compilation, it is a dictionary indicating context
            to target mapping. For homogeneous compilation, it is a build target.

        target_host : str or :any:`tvm.target.Target`, optional
            Host compilation target, if target is device.
            When TVM compiles device specific program such as CUDA,
            we also need host(CPU) side code to interact with the driver
            to setup the dimensions and parameters correctly.
            target_host is used to specify the host side codegen target.
            By default, llvm is used if it is enabled,
            otherwise a stackvm interpreter is used.

        executor : Optional[Executor]
            The executor configuration with which to build the model.
            Defaults to "graph" if no executor specified.

        runtime : Optional[Runtime]
            Runtime configuration to use when building the model.
            Defaults to "cpp" if no runtime specified.

        workspace_memory_pools : Optional[WorkspaceMemoryPools]
            The object that contains an Array of PoolInfo objects
            that hold properties of workspace pools that could be
            used by the inference.

        params : dict of str to NDArray
            Input parameters to the graph that do not change
            during inference time. Used for constant folding.

        mod_name: Optional[str]
            The module name we will build

        Returns
        -------
        graph_json : str
            The json string that can be accepted by graph executor.

        mod : tvm.Module
            The module containing necessary libraries.

        params : dict
            The parameters of the final graph.
        """
        if target_host is not None:
            warnings.warn(
                "target_host parameter is going to be deprecated. "
                "Please pass in tvm.target.Target(target, host=target_host) instead."
            )
        target = build_target_by_device_type_map(target)
        target, target_host = Target.check_and_update_host_consist(
            target, target_host, target_is_dict_key=False)

        # Setup the params.
        if params:
            self._set_params(params)

        # Build the IR module. If auto_scheduler is not enabled,
        # then use the TOPI-defined schedule.
        use_auto_scheduler = PassContext.current().config.get(
            "relay.backend.use_auto_scheduler", False)

        # Turn off AutoTVM config not found warnings if auto_scheduler is enabled.
        old_autotvm_silent = autotvm.GLOBAL_SCOPE.silent
        autotvm.GLOBAL_SCOPE.silent = use_auto_scheduler or old_autotvm_silent

        mod_name = mangle_module_name(mod_name)

        self._build(mod, target, target_host, executor, runtime,
                    workspace_memory_pools, mod_name)
        autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent

        # Get artifacts
        mod = self.get_module()
        params = self.get_params()
        executor_config = self.get_graph_json() if str(
            executor) == "graph" else None

        return executor_config, mod, params
Example #23
def partition_for_cutlass(mod, params=None):
    """Partition the input module into CUTLASS-supported subgraphs."""
    dense_pat = ("cutlass.dense", make_gemm_pattern(False, None), check_gemm)
    dense_bias_pat = ("cutlass.dense_bias", make_gemm_pattern(True, None),
                      check_gemm)
    dense_bias_relu_pat = ("cutlass.dense_bias_relu",
                           make_gemm_pattern(True, "relu"), check_gemm)
    dense_bias_gelu_fp16_pat = (
        "cutlass.dense_bias_gelu_fp16",
        make_gemm_pattern(True, "gelu"),
        check_gemm,
    )
    dense_bias_gelu_fp32_pat = (
        "cutlass.dense_bias_gelu_fp32",
        make_gemm_pattern(True, "gelu", out_dtype="float32"),
        check_gemm,
    )

    dense_patterns = [
        dense_bias_gelu_fp16_pat,
        dense_bias_gelu_fp32_pat,
        dense_bias_relu_pat,
        dense_bias_pat,
        dense_pat,
        ("cutlass.batch_matmul", make_batch_matmul_pattern(),
         check_batch_matmul),
    ]

    conv2d_patterns = [
        (
            "cutlass.conv2d_bias_hardswish",
            make_conv2d_pattern(with_bias=True, with_act="hardswish"),
            check_conv2d,
        ),
        (
            "cutlass.conv2d_bias_silu",
            make_conv2d_pattern(with_bias=True, with_act="silu"),
            check_conv2d,
        ),
        (
            "cutlass.conv2d_bias_relu",
            make_conv2d_pattern(with_bias=True, with_act="relu"),
            check_conv2d,
        ),
        (
            "cutlass.conv2d_bias_sigmoid",
            make_conv2d_pattern(with_bias=True, with_act="sigmoid"),
            check_conv2d,
        ),
        ("cutlass.conv2d_bias", make_conv2d_pattern(with_bias=True),
         check_conv2d),
        ("cutlass.conv2d", make_conv2d_pattern(), check_conv2d),
    ]

    residual_block_patterns = []

    for with_act, postfix in [("relu", "_relu"), (None, "")]:
        for name, pat, _ in conv2d_patterns[:-1]:
            for bin_op in ["add", "multiply"]:
                residual_block_patterns.append((
                    name + "_residual_" + bin_op + postfix,
                    make_residual_block_pattern(pat, bin_op,
                                                with_act=with_act),
                    partial(check_conv2d_residual, binary_op=bin_op),
                ))

    cutlass_patterns = residual_block_patterns + dense_patterns + conv2d_patterns

    if params is not None:
        mod["main"] = bind_params_by_name(mod["main"], params)
        remove_bn_pass = Sequential([
            transform.InferType(),
            transform.SimplifyInference(),
            transform.FoldConstant(),
            transform.FoldScaleAxis(),
        ])
        with PassContext(opt_level=3):
            mod = remove_bn_pass(mod)

    seq = Sequential([
        transform.InferType(),
        transform.MergeComposite(cutlass_patterns),
        transform.AnnotateTarget(["cutlass"], include_non_call_ops=False),
        transform.PartitionGraph(bind_constants=False),
    ])

    return seq(mod)
Example #24
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ("int8", "uint8") and kernel.dtype in ("int8",
                                                                    "uint8"):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda",
                )
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if ((2 < kh < 8 and 2 < kw < 8 and kh == kw)
                    and (stride_h == 1 and stride_w == 1)
                    and (dilation_h == 1 and dilation_w == 1)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5,
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda",
            )

            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            (
                judge_winograd_tensorcore,
                judge_winograd_autotvm,
                judge_winograd_auto_scheduler,
            ) = judge_winograd(
                N,
                H,
                W,
                KH,
                KW,
                CI,
                CO,
                padding,
                stride_h,
                stride_w,
                dilation_h,
                dilation_w,
                data.dtype,
                kernel.dtype,
                pre_flag=False,
            )
            if judge_winograd_autotvm:
                if (target.kind.name == "cuda"
                        and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                        and judge_winograd_tensorcore):
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore
                        ),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5,
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5,
                    )
            if (target.kind.name == "cuda"
                    and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                    and ((N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or
                         (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or
                         (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0))):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nhwc_tensorcore),
                    name="conv2d_nhwc_tensorcore.cuda",
                    plevel=20,
                )

            # register auto-scheduler implementations
            use_auto_scheduler = PassContext.current().config.get(
                "relay.backend.use_auto_scheduler", False)
            if use_auto_scheduler and judge_winograd_auto_scheduler:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
                    naive_schedule,  # this implementation should never be picked by autotvm
                    name="conv2d_nhwc.winograd",
                    plevel=15,
                )

        elif layout == "HWNC":
            assert kernel_layout in [
                "HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"
            ]
            _, _, N, in_channels = get_const_tuple(data.shape)
            pre_computed = len(kernel.shape) == 6
            if pre_computed:
                _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(
                    kernel.shape)
                out_channels = oc_chunk * oc_block_factor
            else:
                _, _, out_channels, _ = get_const_tuple(kernel.shape)

            tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"]
            if ((N % 16 == 0 and in_channels % 16 == 0
                 and out_channels % 16 == 0)
                    or (N % 8 == 0 and in_channels % 16 == 0
                        and out_channels % 32 == 0)
                    or (N % 32 == 0 and in_channels % 16 == 0
                        and out_channels % 8 == 0) and
                (data.dtype in tensorcore_dtypes
                 and kernel.dtype in tensorcore_dtypes)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_hwnc_tensorcore),
                    name="conv2d_hwnc_tensorcore_direct.cuda",
                    plevel=20,
                )
            else:
                raise RuntimeError("Unsupported shape for conv2d HWNC.\
                                    Need to satisfy tensor core schedule.")
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda",
            )
        else:
            raise RuntimeError(
                "Unsupported conv2d layout {} for CUDA".format(layout))
        # add cudnn implementation
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in [
                    "NCHW", "NHWC"
            ] and padding[0] == padding[2] and padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout,
                             groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )
        else:
            raise RuntimeError(
                "Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in [
                    "NCHW", "NHWC"
            ] and padding[0] == padding[2] and padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
                cudnn_impl = True

        if layout == "NCHW":
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw,
                                    has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda",
            )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda",
            )
        elif not cudnn_impl:
            raise RuntimeError(
                "Unsupported group_conv2d layout {}".format(layout))
    return strategy
Example #25
def partition_for_cutlass(mod, params=None):
    """Partition the input module into CUTLASS-supported subgraphs."""
    dense_pat = ("cutlass.dense", make_gemm_pattern(False, None), check_gemm)
    dense_bias_pat = ("cutlass.dense_bias", make_gemm_pattern(True, None),
                      check_gemm)
    dense_bias_relu_pat = ("cutlass.dense_bias_relu",
                           make_gemm_pattern(True, "relu"), check_gemm)
    dense_bias_gelu_fp16_pat = (
        "cutlass.dense_bias_gelu_fp16",
        make_gemm_pattern(True, "gelu"),
        check_gemm,
    )
    dense_bias_gelu_fp32_pat = (
        "cutlass.dense_bias_gelu_fp32",
        make_gemm_pattern(True, "gelu", out_dtype="float32"),
        check_gemm,
    )
    cutlass_patterns = [
        dense_bias_gelu_fp16_pat,
        dense_bias_gelu_fp32_pat,
        dense_bias_relu_pat,
        dense_bias_pat,
        dense_pat,
        ("cutlass.batch_matmul", make_batch_matmul_pattern(),
         check_batch_matmul),
        (
            "cutlass.conv2d_bias_hardswish",
            make_conv2d_pattern(with_bias=True, with_act="hardswish"),
            check_conv2d,
        ),
        (
            "cutlass.conv2d_bias_silu",
            make_conv2d_pattern(with_bias=True, with_act="silu"),
            check_conv2d,
        ),
        (
            "cutlass.conv2d_bias_relu",
            make_conv2d_pattern(with_bias=True, with_act="relu"),
            check_conv2d,
        ),
        (
            "cutlass.conv2d_bias_sigmoid",
            make_conv2d_pattern(with_bias=True, with_act="sigmoid"),
            check_conv2d,
        ),
        ("cutlass.conv2d_bias", make_conv2d_pattern(with_bias=True),
         check_conv2d),
        ("cutlass.conv2d", make_conv2d_pattern(), check_conv2d),
    ]

    if params is not None:
        mod["main"] = bind_params_by_name(mod["main"], params)
        remove_bn_pass = Sequential([
            transform.InferType(),
            transform.SimplifyInference(),
            transform.FoldConstant(),
            transform.FoldScaleAxis(),
        ])
        with PassContext(opt_level=3):
            mod = remove_bn_pass(mod)

    seq = Sequential([
        transform.InferType(),
        transform.MergeComposite(cutlass_patterns),
        transform.AnnotateTarget(["cutlass"], include_non_call_ops=False),
        transform.PartitionGraph(bind_constants=False),
    ])

    return seq(mod)