def gpu_schedule_Mean(outs):
    """
    gpu schedule function for mean.

    Args:
        outs (tvm.tensor.Tensor): outputs of compute.

    Returns:
        sch (schedule.Schedule): The created schedule.
    """
    out = outs[0] if isinstance(outs, list) else outs

    device = "cuda"
    with tvm.target.create(device):
        sch = tvm.create_schedule(out.op)
        # The output is either the divide itself or a trailing squeeze of it;
        # always schedule the divide stage.
        if out.op.name == "T_divide":
            tensor_c = out
        else:  # squeeze
            tensor_c = out.op.input_tensors[0]

        # Compute the producer stage at an inner axis of the divide, then map the
        # outer axis to CUDA blocks and threads.
        tensor_b = tensor_c.op.input_tensors[0]
        if len(tensor_c.op.axis) >= 2:
            sch[tensor_b].compute_at(sch[tensor_c], tensor_c.op.axis[1])
        else:
            sch[tensor_b].compute_at(sch[tensor_c], tensor_c.op.axis[0])

        bx, tx = sch[tensor_c].split(tensor_c.op.axis[0], factor=DEFAULT_GPU_THREAD)
        sch[tensor_c].bind(bx, tvm.thread_axis("blockIdx.x"))
        sch[tensor_c].bind(tx, tvm.thread_axis("threadIdx.x"))
    return sch
def my_dsl(dtype, kernel_name, attrs):
    m = tvm.var("M")
    n = tvm.var("N")
    A = tvm.placeholder((m,), name="A", dtype=dtype)
    B = tvm.placeholder((m,), name="B", dtype=dtype)
    if insn == "add":
        C = topi.add(A, B)
    elif insn == "sub":
        C = topi.subtract(A, B)
    elif insn == "mul":
        C = topi.multiply(A, B)
    elif insn == "div":
        C = topi.divide(A, B)
    elif insn == "max":
        C = topi.maximum(A, B)
    elif insn == "min":
        C = topi.minimum(A, B)
    elif insn == "abs":
        C = tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C')
    elif insn == "exp":
        C = topi.exp(A)
    elif insn == "log":
        C = topi.log(A)
    elif insn == "sqrt":
        C = topi.sqrt(A)
    elif insn == "adds":
        C = A + tvm.const(2, dtype)
    elif insn == "muls":
        C = A * tvm.const(2, dtype)
    # C = tvm.compute((m, ), lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule([C.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        if insnType == "binary":
            mod = akg.build(s, [A, B, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
        else:
            mod = akg.build(s, [A, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
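# A hedged usage sketch for my_dsl: `insn` and `insnType` are module-level switches that
# the helper reads (they are not parameters), so the example sets them before the call.
# The kernel name and the empty attrs dict below are illustrative, not part of the original file.
def _my_dsl_example():
    global insn, insnType
    insn = "add"         # emit topi.add(A, B)
    insnType = "binary"  # binary ops pass both A and B to akg.build
    return my_dsl("float16", "dsl_add_example", {})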
def gpu_schedule_MeanGrad(outs):
    """gpu schedule MeanGrad."""
    out = outs[0] if isinstance(outs, list) else outs

    device = "cuda"
    with tvm.target.create(device):
        sch = tvm.create_schedule(out.op)
        tensor_c = out
        tensor_b = tensor_c.op.input_tensors[0]
        if len(tensor_c.op.axis) >= 2:
            sch[tensor_b].compute_at(sch[tensor_c], tensor_c.op.axis[1])
        else:
            sch[tensor_b].compute_at(sch[tensor_c], tensor_c.op.axis[0])

        bx, tx = sch[tensor_c].split(tensor_c.op.axis[0], factor=DEFAULT_GPU_THREAD)
        sch[tensor_c].bind(bx, tvm.thread_axis("blockIdx.x"))
        sch[tensor_c].bind(tx, tvm.thread_axis("threadIdx.x"))
    return sch
def default_schedule(outs):
    """
    default schedule function.

    Args:
        outs (Union[tvm.tensor.Tensor, list[tvm.tensor.Tensor]]): outputs of compute.

    Returns:
        sch (schedule.Schedule): The created schedule.
    """
    if not isinstance(outs, tvm.tensor.Tensor) and not isinstance(outs, list):
        raise ValueError("outs should be list of akg.tvm.tensor.Tensor or akg.tvm.tensor.Tensor")
    device = 'cuda'
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        raise SystemError("Skip because %s is not enabled" % device)
    outs_list = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    with tvm.target.create(device):
        sch = tvm.create_schedule(outs_list[0].op)
        outputs_tensor = Queue()
        outputs_tensor.put(outs_list[0])
        op_list = []
        while not outputs_tensor.empty():
            out = outputs_tensor.get()
            if out.op not in op_list and isinstance(out.op, tvm.tensor.ComputeOp):
                op_list.append(out.op)
                for input_tensor in out.op.input_tensors:
                    outputs_tensor.put(input_tensor)

        for op in op_list:
            stage = sch[op.output(0)]
            bx, tx = stage.split(op.axis[0], factor=DEFAULT_GPU_THREAD)
            stage.bind(bx, tvm.thread_axis("blockIdx.x"))
            stage.bind(tx, tvm.thread_axis("threadIdx.x"))
    return sch
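# A minimal sketch of driving default_schedule, assuming the module-level dependencies this
# file relies on (`tvm`, `topi`, `from queue import Queue`, DEFAULT_GPU_THREAD) and a
# CUDA-enabled TVM build; the tensor names and shapes are illustrative only.
def _default_schedule_example():
    a = tvm.placeholder((4096,), name="a", dtype="float32")
    b = tvm.placeholder((4096,), name="b", dtype="float32")
    c = topi.add(a, b)           # a single elementwise ComputeOp
    sch = default_schedule([c])  # each ComputeOp is split and bound to blockIdx.x / threadIdx.x
    return tvm.build(sch, [a, b, c], "cuda")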