def dense_strategy_cpu(attrs, inputs, out_type, target):
    """dense x86 strategy"""
    strategy = _op.OpStrategy()
    m, _ = inputs[0].shape
    same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
    dtype = inputs[0].dtype
    u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
    strategy.add_implementation(
        wrap_compute_dense(topi.x86.dense_nopack),
        wrap_topi_schedule(topi.x86.schedule_dense_nopack),
        name="dense_nopack.x86",
        plevel=10,
    )
    if is_auto_scheduler_enabled():
        strategy.add_implementation(
            wrap_compute_dense(topi.nn.dense, need_auto_scheduler_layout=True),
            naive_schedule,
            name="dense.generic",
            plevel=11,
        )
    if "cblas" in target.libs:
        with SpecializedCondition(same_type and dtype in ["float32", "float64"]):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_cblas),
                wrap_topi_schedule(topi.x86.schedule_dense_cblas),
                name="dense_cblas.x86",
                plevel=13,
            )
    if "mkl" in target.libs:
        with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_mkl),
                wrap_topi_schedule(topi.x86.schedule_dense_mkl),
                name="dense_mkl.x86",
                plevel=14,
            )
    if "mkldnn" in target.libs:
        with SpecializedCondition(same_type and dtype == "float32"):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_mkldnn),
                wrap_topi_schedule(topi.x86.schedule_dense_mkldnn),
                name="dense_mkldnn.x86",
                plevel=15,
            )
    with SpecializedCondition(m >= 16):
        # this implementation may not be well-optimized, so use plevel=5 for now.
        strategy.add_implementation(
            wrap_compute_dense(topi.x86.dense_pack),
            wrap_topi_schedule(topi.x86.schedule_dense_pack),
            name="dense_pack.x86",
            plevel=5,
        )
    return strategy
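# Hedged sketch (assumptions: import path and decorator form as in recent TVM
# releases, not taken from the snippets in this file). A strategy function like
# dense_strategy_cpu is attached to a target via a registration decorator, and
# among the applicable implementations the one with the highest plevel wins,
# with SpecializedCondition gating shape- and dtype-specific choices.
from tvm.relay.op.strategy.generic import dense_strategy


@dense_strategy.register("cpu")
def dense_strategy_cpu(attrs, inputs, out_type, target):
    # body as in the version above
    ...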
def dense_strategy_cuda(attrs, inputs, out_type, target):
    """dense cuda strategy"""
    strategy = _op.OpStrategy()
    if out_type.dtype == "int8":
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_int8),
            wrap_topi_schedule(topi.cuda.schedule_dense_int8),
            name="dense_int8.cuda")
    else:
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_small_batch),
            wrap_topi_schedule(topi.cuda.schedule_dense_small_batch),
            name="dense_small_batch.cuda")
        b = inputs[0].shape[0]
        with SpecializedCondition(b >= 32):
            strategy.add_implementation(
                wrap_compute_dense(topi.cuda.dense_large_batch),
                wrap_topi_schedule(topi.cuda.schedule_dense_large_batch),
                name="dense_large_batch.cuda",
                plevel=15)
    if target.target_name == "cuda" and "cublas" in target.libs:
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_cublas),
            wrap_topi_schedule(topi.cuda.schedule_dense_cublas),
            name="dense_cublas.cuda",
            plevel=20)
    return strategy
def dense_strategy_cuda(attrs, inputs, out_type, target):
    """dense cuda strategy"""
    strategy = _op.OpStrategy()
    data, weights = inputs
    b, i = get_const_tuple(data.shape)
    o, _ = get_const_tuple(weights.shape)
    if (
        target.kind.name in ["cuda", "vulkan", "rocm"]
        and data.dtype == "int8"
        and weights.dtype == "int8"
        and out_type.dtype == "int32"
    ):
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_int8),
            wrap_topi_schedule(topi.cuda.schedule_dense_int8),
            name="dense_int8.cuda",
        )
    else:
        strategy.add_implementation(
            wrap_compute_dense(topi.gpu.dense_small_batch),
            wrap_topi_schedule(topi.gpu.schedule_dense_small_batch),
            name="dense_small_batch.gpu",
        )
        with SpecializedCondition(b >= 32):
            strategy.add_implementation(
                wrap_compute_dense(topi.gpu.dense_large_batch),
                wrap_topi_schedule(topi.gpu.schedule_dense_large_batch),
                name="dense_large_batch.gpu",
                plevel=5,
            )
        if target.kind.name == "cuda":
            if nvcc.have_tensorcore(target=target):
                if (
                    (
                        data.dtype in ["float16", "int8", "uint8"]
                        and (
                            (i % 16 == 0 and b % 16 == 0 and o % 16 == 0)
                            or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0)
                            or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)
                        )
                    )
                    or (data.dtype in ["int4", "uint4"] and i % 32 == 0 and b % 8 == 0 and o % 8 == 0)
                    or (data.dtype in ["int1", "uint1"] and i % 128 == 0 and b % 8 == 0 and o % 8 == 0)
                ):
                    strategy.add_implementation(
                        wrap_compute_dense(topi.cuda.dense_tensorcore),
                        wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore),
                        name="dense_tensorcore.cuda",
                        plevel=20,
                    )
    if target.kind.name == "cuda" and "cublas" in target.libs:
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_cublas),
            wrap_topi_schedule(topi.cuda.schedule_dense_cublas),
            name="dense_cublas.cuda",
            plevel=25,
        )
    return strategy
def dense_strategy_cuda(attrs, inputs, out_type, target):
    """dense cuda strategy"""
    strategy = _op.OpStrategy()
    data, weights = inputs
    b, i = get_const_tuple(data.shape)
    o, _ = get_const_tuple(weights.shape)
    if out_type.dtype == "int8":
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_int8),
            wrap_topi_schedule(topi.cuda.schedule_dense_int8),
            name="dense_int8.cuda",
        )
    else:
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_small_batch),
            wrap_topi_schedule(topi.cuda.schedule_dense_small_batch),
            name="dense_small_batch.cuda",
        )
        strategy.add_auto_scheduler(
            wrap_compute_dense(topi.nn.dense),
            name="dense",
        )
        with SpecializedCondition(b >= 32):
            strategy.add_implementation(
                wrap_compute_dense(topi.cuda.dense_large_batch),
                wrap_topi_schedule(topi.cuda.schedule_dense_large_batch),
                name="dense_large_batch.cuda",
                plevel=5,
            )
        if target.kind.name == "cuda":
            if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
                if (
                    (i % 16 == 0 and b % 16 == 0 and o % 16 == 0)
                    or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0)
                    or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)
                ):
                    strategy.add_implementation(
                        wrap_compute_dense(topi.cuda.dense_tensorcore),
                        wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore),
                        name="dense_tensorcore.cuda",
                        plevel=20,
                    )
    if target.kind.name == "cuda" and "cublas" in target.libs:
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_cublas),
            wrap_topi_schedule(topi.cuda.schedule_dense_cublas),
            name="dense_cublas.cuda",
            plevel=25,
        )
    return strategy
def dense_strategy_cpu(attrs, inputs, out_type, target):
    """dense x86 strategy"""
    strategy = _op.OpStrategy()
    m, _ = inputs[0].shape
    strategy.add_implementation(wrap_compute_dense(topi.x86.dense_nopack),
                                wrap_topi_schedule(topi.x86.schedule_dense_nopack),
                                name="dense_nopack.x86",
                                plevel=10)
    if "cblas" in target.libs:
        strategy.add_implementation(wrap_compute_dense(topi.x86.dense_cblas),
                                    wrap_topi_schedule(topi.x86.schedule_dense_cblas),
                                    name="dense_cblas.x86",
                                    plevel=15)
    with SpecializedCondition(m >= 16):
        # this implementation may not be well-optimized, so use plevel=5 for now.
        strategy.add_implementation(wrap_compute_dense(topi.x86.dense_pack),
                                    wrap_topi_schedule(topi.x86.schedule_dense_pack),
                                    name="dense_pack.x86",
                                    plevel=5)
    return strategy
def scatter_cuda(attrs, inputs, out_type, target):
    """scatter cuda strategy"""
    strategy = _op.OpStrategy()
    strategy.add_implementation(
        wrap_compute_scatter(topi.cuda.scatter),
        wrap_topi_schedule(topi.cuda.schedule_scatter),
        name="scatter.cuda",
        plevel=10,
    )
    rank = len(inputs[0].shape)
    with SpecializedCondition(rank == 1):
        if can_use_thrust(target, "tvm.contrib.thrust.stable_sort_by_key"):
            strategy.add_implementation(
                wrap_compute_scatter(topi.cuda.scatter_via_sort),
                wrap_topi_schedule(topi.cuda.schedule_scatter_via_sort),
                name="scatter_via_sort.cuda",
                plevel=9,  # use the sequential version by default
            )
    return strategy
def matmul_strategy_cpu(attrs, inputs, out_type, target):
    """matmul x86 strategy"""
    strategy = _op.OpStrategy()

    same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
    dtype = inputs[0].dtype
    u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
    if "cblas" in target.libs:
        length_before = len(strategy.specializations) if strategy.specializations else 0
        with SpecializedCondition(same_type and dtype in ["float32", "float64"]):
            strategy.add_implementation(
                wrap_compute_matmul(topi.x86.matmul_cblas),
                wrap_topi_schedule(topi.x86.schedule_matmul_cblas),
                name="matmul_cblas.x86",
                plevel=13,
            )
        length_after = len(strategy.specializations) if strategy.specializations else 0
        if length_before == length_after:
            logger.warning(
                "Currently cblas only supports the float32 and float64 data types. Skip."
            )
    if "mkl" in target.libs:
        length_before = len(strategy.specializations) if strategy.specializations else 0
        with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32):
            strategy.add_implementation(
                wrap_compute_matmul(topi.x86.matmul_mkl),
                wrap_topi_schedule(topi.x86.schedule_matmul_mkl),
                name="matmul_mkl.x86",
                plevel=14,
            )
        length_after = len(strategy.specializations) if strategy.specializations else 0
        if length_before == length_after:
            logger.warning(
                "Currently mkl only supports float32, float64, or uint8 and int8 inputs "
                "with int32 output. Skip."
            )
    if "mkldnn" in target.libs:
        length_before = len(strategy.specializations) if strategy.specializations else 0
        with SpecializedCondition(same_type and dtype == "float32"):
            strategy.add_implementation(
                wrap_compute_matmul(topi.x86.matmul_mkldnn),
                wrap_topi_schedule(topi.x86.schedule_matmul_mkldnn),
                name="matmul_mkldnn.x86",
                plevel=15,
            )
        length_after = len(strategy.specializations) if strategy.specializations else 0
        if length_before == length_after:
            logger.warning("Currently mkldnn only supports the float32 data type. Skip.")

    if is_auto_scheduler_enabled():
        strategy.add_implementation(
            wrap_compute_matmul(topi.nn.matmul, need_auto_scheduler_layout=True),
            naive_schedule,
            name="matmul.generic",
            plevel=11,
        )
    else:
        # If no cblas/mkl/mkldnn strategy was chosen
        if not strategy.specializations:
            logger.warning(
                "Matmul is not optimized for x86. "
                "Recommend to use cblas/mkl/mkldnn for better performance."
            )
        strategy.add_implementation(
            wrap_compute_matmul(topi.nn.matmul),
            naive_schedule,
            name="matmul.generic",
        )
    return strategy
def dense_strategy_cpu(attrs, inputs, out_type, target):
    """dense x86 strategy"""
    strategy = _op.OpStrategy()
    same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
    dtype = inputs[0].dtype
    u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
    strategy.add_implementation(
        wrap_compute_dense(topi.x86.dense_nopack),
        wrap_topi_schedule(topi.x86.schedule_dense_nopack),
        name="dense_nopack.x86",
        plevel=5,
    )
    strategy.add_implementation(
        wrap_compute_dense(topi.x86.dense_pack),
        wrap_topi_schedule(topi.x86.schedule_dense_pack),
        name="dense_pack.x86",
        plevel=10,
    )
    need_auto_scheduler_layout = is_auto_scheduler_enabled()
    need_meta_schedule_layout = is_meta_schedule_enabled()
    if need_auto_scheduler_layout or need_meta_schedule_layout:
        strategy.add_implementation(
            wrap_compute_dense(
                topi.nn.dense,
                need_auto_scheduler_layout=need_auto_scheduler_layout,
                need_meta_schedule_layout=need_meta_schedule_layout,
            ),
            naive_schedule,
            name="dense.generic",
            plevel=11,
        )
    if "cblas" in target.libs:
        with SpecializedCondition(same_type and dtype in ["float32", "float64"]):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_cblas),
                wrap_topi_schedule(topi.x86.schedule_dense_cblas),
                name="dense_cblas.x86",
                plevel=13,
            )
    if "mkl" in target.libs:
        with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_mkl),
                wrap_topi_schedule(topi.x86.schedule_dense_mkl),
                name="dense_mkl.x86",
                plevel=14,
            )
    if "dnnl" in target.libs:
        with SpecializedCondition(same_type and dtype == "float32"):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_dnnl),
                wrap_topi_schedule(topi.x86.schedule_dense_dnnl),
                name="dense_dnnl.x86",
                plevel=15,
            )
    return strategy
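# Hedged usage sketch (assumed TVM Python API, not part of the strategy code
# above): build a small Relay module containing nn.dense so that the x86 dense
# strategy is consulted during compilation. Compiling for a plain "llvm" target
# exercises dense_nopack/dense_pack; a target such as "llvm -libs=cblas" (when
# cblas is available) is what makes the cblas-guarded implementation applicable.
import tvm
from tvm import relay

data = relay.var("data", shape=(32, 64), dtype="float32")
weight = relay.var("weight", shape=(16, 64), dtype="float32")
dense = relay.nn.dense(data, weight)
mod = tvm.IRModule.from_expr(relay.Function([data, weight], dense))

with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target="llvm")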