def batch_matmul_strategy_cpu(attrs, inputs, out_type, target):
    """batch_matmul x86 strategy"""
    strategy = _op.OpStrategy()
    mcpu = Target.current().mcpu
    need_auto_scheduler_layout = is_auto_scheduler_enabled()
    need_meta_schedule_layout = is_meta_schedule_enabled()
    if (
        not attrs.transpose_a
        and attrs.transpose_b
        and target_has_vnni(mcpu)
        and inputs[0].dtype == "uint8"
        and inputs[1].dtype == "int8"
        and inputs[1].shape[-2] % 16 == 0
        and inputs[1].shape[-1] % 4 == 0
    ):
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.x86.batch_matmul_vnni_compute, need_out_dtype=True),
            wrap_topi_schedule(topi.x86.schedule_batch_matmul_vnni),
            name="batch_matmul_vnni.x86",
            plevel=10,
        )
    elif is_dynamic(out_type) or need_auto_scheduler_layout or need_meta_schedule_layout:
        strategy.add_implementation(
            wrap_compute_batch_matmul(
                topi.nn.batch_matmul,
                need_out_dtype=True,
                need_auto_scheduler_layout=need_auto_scheduler_layout,
                need_meta_schedule_layout=need_meta_schedule_layout,
            ),
            wrap_topi_schedule(topi.generic.nn.schedule_batch_matmul),
            name="batch_matmul.generic",
            plevel=10,
        )
    else:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.x86.batch_matmul, need_out_dtype=True),
            wrap_topi_schedule(topi.x86.schedule_batch_matmul),
            name="batch_matmul.x86",
            plevel=10,
        )
    if "cblas" in target.libs:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.x86.batch_matmul_cblas),
            wrap_topi_schedule(topi.x86.schedule_batch_matmul_cblas),
            name="batch_matmul_cblas.x86",
            plevel=15,
        )
    if "mkl" in target.libs:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.x86.batch_matmul_mkl),
            wrap_topi_schedule(topi.x86.schedule_batch_matmul_mkl),
            name="batch_matmul_mkl.x86",
            plevel=15,
        )
    return strategy
def conv3d_strategy_cpu(attrs, inputs, out_type, target):
    """conv3d generic strategy"""
    strategy = _op.OpStrategy()
    layout = attrs.data_layout
    need_auto_scheduler_layout = is_auto_scheduler_enabled()
    need_meta_schedule_layout = is_meta_schedule_enabled()
    if need_auto_scheduler_layout or need_meta_schedule_layout:
        # Use auto-scheduler. We should provide clear compute definitions without
        # autotvm templates or packed layouts.
        if layout == "NCDHW":
            strategy.add_implementation(
                wrap_compute_conv3d(topi.nn.conv3d_ncdhw),
                naive_schedule,
                name="conv3d_ncdhw.x86",
            )
        elif layout == "NDHWC":
            strategy.add_implementation(
                wrap_compute_conv3d(
                    topi.nn.conv3d_ndhwc,
                    need_auto_scheduler_layout=need_auto_scheduler_layout,
                    need_meta_schedule_layout=need_meta_schedule_layout,
                ),
                naive_schedule,
                name="conv3d_ndhwc.x86",
            )
        else:
            raise ValueError("Layout {} is not supported yet".format(layout))
    else:
        # Use autotvm templates
        if layout == "NCDHW":
            strategy.add_implementation(
                wrap_compute_conv3d(topi.x86.conv3d_ncdhw),
                wrap_topi_schedule(topi.x86.schedule_conv3d_ncdhw),
                name="conv3d_ncdhw.x86",
            )
        elif layout == "NDHWC":
            strategy.add_implementation(
                wrap_compute_conv3d(topi.x86.conv3d_ndhwc),
                wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc),
                name="conv3d_ndhwc.x86",
            )
        else:
            raise ValueError("Layout {} is not supported yet".format(layout))
    return strategy
def conv2d_winograd_without_weight_transfrom_strategy_cpu(attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cpu strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    strides = attrs.get_int_tuple("strides")
    assert dilation == (1, 1), "Do not support dilation now"
    assert strides == (1, 1), "Do not support strides now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    need_auto_scheduler_layout = is_auto_scheduler_enabled()
    need_meta_schedule_layout = is_meta_schedule_enabled()
    if layout == "NHWC":
        if need_meta_schedule_layout:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.nn.conv2d_winograd_nhwc_without_weight_transform,
                    need_auto_scheduler_layout=False,
                    need_meta_schedule_layout=True,
                ),
                naive_schedule,
                name="ansor.winograd",
            )
        elif need_auto_scheduler_layout:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.nn.conv2d_winograd_nhwc_without_weight_transform,
                    need_auto_scheduler_layout=True,
                    need_meta_schedule_layout=False,
                ),
                naive_schedule,
                name="ansor.winograd",
            )
        else:
            raise RuntimeError("Neither AutoScheduler nor MetaSchedule is enabled")
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
        )
    return strategy
def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom mali strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    strides = attrs.get_int_tuple("strides")
    kernel = inputs[1]
    assert dilation == (1, 1), "Do not support dilation now"
    assert strides == (1, 1), "Do not support strides now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        assert len(kernel.shape) == 5, "Kernel must be packed into 5-dim"
        strategy.add_implementation(
            wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd),
            wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd),
            name="conv2d_nchw_winograd.mali",
        )
    elif layout == "NHWC":
        need_auto_scheduler_layout = is_auto_scheduler_enabled()
        need_meta_schedule_layout = is_meta_schedule_enabled()
        if need_auto_scheduler_layout or need_meta_schedule_layout:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.nn.conv2d_winograd_nhwc_without_weight_transform,
                    need_auto_scheduler_layout=need_auto_scheduler_layout,
                    need_meta_schedule_layout=need_meta_schedule_layout,
                ),
                naive_schedule,  # this implementation should never be picked by autotvm
                name="conv2d_nhwc_winograd_without_weight_transform",
                plevel=15,
            )
        else:
            raise RuntimeError(
                "Winograd conv2d NHWC is not enabled for mali without auto_scheduler."
            )
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
        )
    return strategy
def dense_strategy_mali(attrs, inputs, out_type, target):
    """dense mali strategy"""
    strategy = _op.OpStrategy()
    if is_auto_scheduler_enabled():
        strategy.add_implementation(
            wrap_compute_dense(topi.nn.dense, need_auto_scheduler_layout=True),
            naive_schedule,
            name="dense.mali",
        )
    elif is_meta_schedule_enabled():
        strategy.add_implementation(
            wrap_compute_dense(topi.nn.dense, need_meta_schedule_layout=True),
            naive_schedule,
            name="dense.mali",
        )
    else:
        strategy.add_implementation(
            wrap_compute_dense(topi.mali.dense),
            wrap_topi_schedule(topi.mali.schedule_dense),
            name="dense.mali",
        )
    return strategy
def conv2d_strategy_mali(attrs, inputs, out_type, target):
    """conv2d mali strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    stride_h, stride_w = attrs.get_int_tuple("strides")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be a positive value")
    if groups == 1:
        if layout == "NCHW":
            if kernel_layout == "OIHW":
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack),
                    wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack),
                    name="conv2d_nchw_spatial_pack.mali",
                )
                # check if winograd algorithm is applicable
                _, _, kh, kw = get_const_tuple(kernel.shape)
                if (
                    kh == 3
                    and kw == 3
                    and stride_h == 1
                    and stride_w == 1
                    and dilation_h == 1
                    and dilation_w == 1
                ):
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd),
                        wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd),
                        name="conv2d_nchw_winograd.mali",
                        plevel=5,
                    )
            elif re.match(r"OIHW\d*o", kernel_layout):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack),
                    wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack),
                    name="conv2d_nchw_spatial_pack.mali",
                )
            else:
                raise RuntimeError(
                    "Unsupported weight layout {} for conv2d NCHW".format(kernel_layout)
                )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            need_auto_scheduler_layout = is_auto_scheduler_enabled()
            need_meta_schedule_layout = is_meta_schedule_enabled()
            if need_auto_scheduler_layout or need_meta_schedule_layout:
                strategy.add_implementation(
                    wrap_compute_conv2d(
                        topi.nn.conv2d_nhwc,
                        need_auto_scheduler_layout=need_auto_scheduler_layout,
                        need_meta_schedule_layout=need_meta_schedule_layout,
                    ),
                    naive_schedule,
                    name="conv2d_nhwc.mali",
                )
                is_winograd_applicable = False
                if len(kernel.shape) == 4:
                    kernel_h, kernel_w, _, _ = get_const_tuple(kernel.shape)
                    is_winograd_applicable = (
                        "float" in data.dtype
                        and "float" in kernel.dtype
                        and kernel_h == 3
                        and kernel_w == 3
                        and stride_h == 1
                        and stride_w == 1
                        and dilation_h == 1
                        and dilation_w == 1
                    )
                if is_winograd_applicable:
                    if need_meta_schedule_layout:
                        strategy.add_implementation(
                            wrap_compute_conv2d(
                                topi.nn.conv2d_winograd_nhwc,
                                need_auto_scheduler_layout=False,
                                need_meta_schedule_layout=True,
                            ),
                            naive_schedule,  # this implementation should never be picked by autotvm
                            name="conv2d_nhwc.winograd",
                            plevel=15,
                        )
                    elif need_auto_scheduler_layout:
                        strategy.add_implementation(
                            wrap_compute_conv2d(
                                topi.nn.conv2d_winograd_nhwc,
                                need_auto_scheduler_layout=True,
                                need_meta_schedule_layout=False,
                            ),
                            naive_schedule,  # this implementation should never be picked by autotvm
                            name="conv2d_nhwc.winograd",
                            plevel=15,
                        )
                    else:
                        raise RuntimeError("Neither AutoScheduler nor MetaSchedule is enabled")
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.mali.conv2d_nhwc_spatial_pack),
                    wrap_topi_schedule(topi.mali.schedule_conv2d_nhwc_spatial_pack),
                    name="conv2d_nhwc_spatial_pack.mali",
                )
        else:
            raise RuntimeError("Unsupported conv2d layout {} for mali".format(layout))
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.mali.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.mali",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            if is_auto_scheduler_enabled():
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                    naive_schedule,
                    name="depthwise_conv2d_nhwc.mali",
                )
            elif is_meta_schedule_enabled():
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                    naive_schedule,
                    name="depthwise_conv2d_nhwc.mali",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.mali.depthwise_conv2d_nhwc),
                    wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nhwc),
                    name="depthwise_conv2d_nhwc.mali",
                )
        else:
            raise RuntimeError("Unsupported depthwise_conv2d layout {} for mali".format(layout))
    else:  # group_conv2d
        raise RuntimeError("group_conv2d is not supported for mali")
    return strategy
def dense_strategy_cpu(attrs, inputs, out_type, target):
    """dense x86 strategy"""
    strategy = _op.OpStrategy()
    same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
    dtype = inputs[0].dtype
    u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
    strategy.add_implementation(
        wrap_compute_dense(topi.x86.dense_nopack),
        wrap_topi_schedule(topi.x86.schedule_dense_nopack),
        name="dense_nopack.x86",
        plevel=5,
    )
    strategy.add_implementation(
        wrap_compute_dense(topi.x86.dense_pack),
        wrap_topi_schedule(topi.x86.schedule_dense_pack),
        name="dense_pack.x86",
        plevel=10,
    )
    need_auto_scheduler_layout = is_auto_scheduler_enabled()
    need_meta_schedule_layout = is_meta_schedule_enabled()
    if need_auto_scheduler_layout or need_meta_schedule_layout:
        strategy.add_implementation(
            wrap_compute_dense(
                topi.nn.dense,
                need_auto_scheduler_layout=need_auto_scheduler_layout,
                need_meta_schedule_layout=need_meta_schedule_layout,
            ),
            naive_schedule,
            name="dense.generic",
            plevel=11,
        )
    if "cblas" in target.libs:
        with SpecializedCondition(same_type and dtype in ["float32", "float64"]):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_cblas),
                wrap_topi_schedule(topi.x86.schedule_dense_cblas),
                name="dense_cblas.x86",
                plevel=13,
            )
    if "mkl" in target.libs:
        with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_mkl),
                wrap_topi_schedule(topi.x86.schedule_dense_mkl),
                name="dense_mkl.x86",
                plevel=14,
            )
    if "dnnl" in target.libs:
        with SpecializedCondition(same_type and dtype == "float32"):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_dnnl),
                wrap_topi_schedule(topi.x86.schedule_dense_dnnl),
                name="dense_dnnl.x86",
                plevel=15,
            )
    return strategy
def matmul_strategy_cpu(attrs, inputs, out_type, target):
    """matmul x86 strategy"""
    strategy = _op.OpStrategy()
    same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
    dtype = inputs[0].dtype
    u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
    if "cblas" in target.libs:
        length_before = len(strategy.specializations) if strategy.specializations else 0
        with SpecializedCondition(same_type and dtype in ["float32", "float64"]):
            strategy.add_implementation(
                wrap_compute_matmul(topi.x86.matmul_cblas),
                wrap_topi_schedule(topi.x86.schedule_matmul_cblas),
                name="matmul_cblas.x86",
                plevel=13,
            )
        length_after = len(strategy.specializations) if strategy.specializations else 0
        if length_before == length_after:
            logger.warning("Currently cblas only supports float32 or float64 data types. Skip.")
    if "mkl" in target.libs:
        length_before = len(strategy.specializations) if strategy.specializations else 0
        with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32):
            strategy.add_implementation(
                wrap_compute_matmul(topi.x86.matmul_mkl),
                wrap_topi_schedule(topi.x86.schedule_matmul_mkl),
                name="matmul_mkl.x86",
                plevel=14,
            )
        length_after = len(strategy.specializations) if strategy.specializations else 0
        if length_before == length_after:
            logger.warning(
                "Currently mkl only supports float32 or float64 data types, or uint8 and int8 "
                "inputs with int32 output. Skip."
            )
    if "dnnl" in target.libs:
        length_before = len(strategy.specializations) if strategy.specializations else 0
        with SpecializedCondition(same_type and dtype == "float32"):
            strategy.add_implementation(
                wrap_compute_matmul(topi.x86.matmul_dnnl),
                wrap_topi_schedule(topi.x86.schedule_matmul_dnnl),
                name="matmul_dnnl.x86",
                plevel=15,
            )
        length_after = len(strategy.specializations) if strategy.specializations else 0
        if length_before == length_after:
            logger.warning("Currently dnnl only supports the float32 data type. Skip.")
    need_auto_scheduler_layout = is_auto_scheduler_enabled()
    need_meta_schedule_layout = is_meta_schedule_enabled()
    if need_auto_scheduler_layout or need_meta_schedule_layout:
        strategy.add_implementation(
            wrap_compute_matmul(
                topi.nn.matmul,
                need_auto_scheduler_layout=need_auto_scheduler_layout,
                need_meta_schedule_layout=need_meta_schedule_layout,
            ),
            naive_schedule,
            name="matmul.generic",
            plevel=11,
        )
    else:
        # If no cblas/mkl/dnnl strategy was chosen
        if not strategy.specializations:
            logger.warning(
                "Matmul is not optimized for x86. "
                "Recommend to use cblas/mkl/dnnl for better performance."
            )
        strategy.add_implementation(
            wrap_compute_matmul(topi.nn.matmul),
            naive_schedule,
            name="matmul.generic",
        )
    return strategy
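

# The cblas/mkl/dnnl branches above detect whether a SpecializedCondition actually
# registered an implementation by comparing len(strategy.specializations) before and
# after. The helper below is a minimal debugging sketch built on the same idea; it is
# not part of the strategy code and assumes OpStrategy exposes `specializations` whose
# entries carry `implementations` with `name` and `plevel` attributes (attribute names
# are assumptions, not guaranteed by the functions above).
def _describe_strategy(strategy):
    """Hypothetical helper: list the implementation names and plevels gathered so far."""
    described = []
    for spec in strategy.specializations or []:
        for impl in spec.implementations:
            described.append("{} (plevel={})".format(impl.name, impl.plevel))
    return described
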
def conv2d_strategy_cpu(attrs, inputs, out_type, target):
    """conv2d x86 strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = get_const_tuple(attrs.strides)
    dilation_h, dilation_w = get_const_tuple(attrs.dilation)
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be a positive value")
    need_auto_scheduler_layout = is_auto_scheduler_enabled()
    need_meta_schedule_layout = is_meta_schedule_enabled()
    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.x86",
                )
            elif "dnnl" in target.libs:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.conv2d_nchw_dnnl),
                    wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_dnnl),
                    name="conv2d_nchw_dnnl.x86",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.conv2d_nchw),
                    wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
                    name="conv2d_nchw.x86",
                )
        elif _NCHWc_matcher.match(layout):  # check if layout is NCHWxc
            assert _OIHWio_matcher.match(kernel_layout)  # check if kernel is OIHWio
            return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target)
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            if (not need_auto_scheduler_layout) and (not need_meta_schedule_layout):
                logger.warning("conv2d NHWC layout is not optimized for x86 with autotvm.")
            if "dnnl" in target.libs:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.conv2d_nhwc_dnnl),
                    wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc_dnnl),
                    name="conv2d_nhwc_dnnl.x86",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(
                        topi.nn.conv2d_nhwc,
                        need_auto_scheduler_layout=need_auto_scheduler_layout,
                        need_meta_schedule_layout=need_meta_schedule_layout,
                    ),
                    wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc),
                    name="conv2d_nhwc.x86",
                )

            judge_winograd_auto_scheduler = False
            if len(kernel.shape) == 4:
                kernel_h, kernel_w, _, co = get_const_tuple(kernel.shape)
                judge_winograd_auto_scheduler = (
                    "float" in data.dtype
                    and "float" in kernel.dtype
                    and kernel_h == 3
                    and kernel_w == 3
                    and stride_h == 1
                    and stride_w == 1
                    and dilation_h == 1
                    and dilation_w == 1
                    and 64 < co < 512
                    # The last condition on co is based on our profiling of resnet workloads
                    # on skylake avx512 machines. We found winograd is faster than direct
                    # only when co is within this range
                )

            # register auto-scheduler implementations
            if (
                need_auto_scheduler_layout or need_meta_schedule_layout
            ) and judge_winograd_auto_scheduler:
                strategy.add_implementation(
                    wrap_compute_conv2d(
                        topi.nn.conv2d_winograd_nhwc,
                        need_auto_scheduler_layout=need_auto_scheduler_layout,
                        need_meta_schedule_layout=need_meta_schedule_layout,
                    ),
                    naive_schedule,  # this implementation should never be picked by autotvm
                    name="conv2d_nhwc.winograd",
                    plevel=15,
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            if (not need_auto_scheduler_layout) and (not need_meta_schedule_layout):
                logger.warning("conv2d HWCN layout is not optimized for x86 with autotvm.")
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.conv2d_hwcn),
                wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn),
                name="conv2d_hwcn.generic",
            )
        else:
            raise RuntimeError("Unsupported conv2d layout {} for x86".format(layout))
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            channel_multiplier = get_const_tuple(inputs[1].shape)[1]
            if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw),
                    wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw),
                    name="depthwise_conv2d_nchw.x86",
                )
            else:
                logger.warning(
                    "For x86 target, depthwise_conv2d with channel "
                    "multiplier greater than 1 is not optimized"
                )
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw),
                    wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw),
                    name="depthwise_conv2d_nchw.generic",
                )
        elif _NCHWc_matcher.match(layout):  # check if layout is NCHWxc
            assert _OIHWio_matcher.match(kernel_layout)  # check if kernel is OIHWio
            return depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target)
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            if (not need_auto_scheduler_layout) and (not need_meta_schedule_layout):
                logger.warning(
                    "depthwise_conv2d NHWC layout is not optimized for x86 with autotvm."
                )
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.generic",
            )
        else:
            raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.x86.group_conv2d_nchw, has_groups=True),
                wrap_topi_schedule(topi.x86.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.x86",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            if (not need_auto_scheduler_layout) and (not need_meta_schedule_layout):
                logger.warning("group_conv2d is not optimized for x86 with autotvm.")
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
                name="group_conv2d_nhwc.generic",
            )
        else:
            raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
    return strategy