Example #1
def correction_mul(x, batch_std, running_std, y, channel, kernel_name="correction_mul"):
    """CorrectionMul op"""
    shape = x.get("shape")
    data_format = x.get("format")
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    check_list = ["float16", "float32"]
    inp_dtype = x.get("dtype").lower()
    if inp_dtype not in check_list:
        raise RuntimeError("Dtype of input only supports float16, float32")

    # shape = util.shape_refine(shape)
    x_t = tvm.placeholder(shape, name="x", dtype=inp_dtype)
    shape_c = [1] * len(shape)
    shape_c[channel] = batch_std.get("ori_shape")[0]
    if data_format == "NC1HWC0" and channel == 1:
        shape_c = batch_std.get("shape")
    batch_std_t = tvm.placeholder(shape_c, name="batch_std", dtype=inp_dtype)
    running_std_t = tvm.placeholder(shape_c, name="running_std", dtype=inp_dtype)
    res = correction_mul_compute(x_t, batch_std_t, running_std_t, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [x_t, batch_std_t, running_std_t, res]}

    te.lang.cce.cce_build_code(sch, config)
Example #2
def fake_learned_scale_quant_perchannel_grad_d_reduce(
        dout_alpha,
        dalpha,
        channel_axis,
        kernel_name="fake_learned_scale_quant_perchannel_grad_d_reduce"):
    """FakeLearnedScaleQuantPerChannelGradDReduce"""

    dout_alpha_shape = dout_alpha.get("shape")
    dout_alpha_dtype = dout_alpha.get("dtype")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(dout_alpha_shape)
    util.check_tensor_shape_size(dout_alpha_shape)

    check_list = ["float32", 'float16']
    dout_alpha_dtype = dout_alpha_dtype.lower()
    util.check_dtype_rule(dout_alpha_dtype, check_list)

    dout_alpha_data = tvm.placeholder(dout_alpha_shape,
                                      name="dout_alpha",
                                      dtype=dout_alpha_dtype)
    res = fake_learned_scale_quant_perchannel_grad_d_reduce_compute(
        dout_alpha_data, dout_alpha, channel_axis, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [dout_alpha_data, res]
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example #3
def clip_boxes_d(boxes_input, boxes_output, img_size, kernel_name="clip_boxes"):
    """
    the External interface function
    input:
      boxes_input: an dict, include shape, and dtype of input
      boxes_output: an dict, include shape, and dtype of output
      img_w: width of the image
      img_h: height of the image
      kernel_name: the kernel name
    return:
      the tik container
    """

    if len(img_size) != CONFIG_TWO:
        raise RuntimeError("img_size should be [img_h, img_w]!")

    img_h, img_w = img_size
    check_clip_boxes_input_dict(boxes_input, boxes_output)
    check_clip_boxes_input_attr(img_w, img_h)

    if len(kernel_name) > util.MAX_KERNEL_NAEM_LEN:
        raise RuntimeError("kernel_name len must be less than 200!")
    util.check_kernel_name(kernel_name)

    tik_instance = clip_boxes_d_compute(boxes_input, img_w, img_h, kernel_name=kernel_name)
    return tik_instance
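
A hypothetical invocation sketch of the interface above; it assumes the Ascend TIK toolchain and the helpers referenced here (clip_boxes_d_compute and the check functions) are importable, and the shapes, dtypes and kernel name are illustrative only:

# illustrative only: boxes are (N, 4) float16, img_size is [img_h, img_w]
boxes_input = {"shape": (1024, 4), "dtype": "float16"}
boxes_output = {"shape": (1024, 4), "dtype": "float16"}
tik_instance = clip_boxes_d(boxes_input, boxes_output,
                            img_size=(768, 1280),
                            kernel_name="clip_boxes_demo")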
Example #4
def fill_v2_d(y, value, shape, kernel_name="fill_v2_d"):
    """
    interface of fill_v2_d
    :param y: output
    :param value: value to fill the shape, float32
    :param shape: list int, output shape
    :param kernel_name: fill_v2_d
    :return:
    """
    # check kernel name
    util.check_kernel_name(kernel_name)
    # shape to list
    shape = te.lang.cce.util.shape_to_list(shape)
    util.check_shape_rule(shape)

    # pseudo input, won't be used.
    data_x = tvm.placeholder(shape, dtype="float32", name="data_x")

    # do compute
    res = fill_v2_compute(data_x, value, shape, y, kernel_name)

    # new schedule
    schedule = [tvm.create_schedule(res.op)]
    elewise_sch = te.lang.cce.te_schedule.cce_schedule.ElewiseSchedule()
    elewise_sch._get_emit_insn_map = types.MethodType(_get_emit_insn_map, elewise_sch)
    elewise_sch._do_buffer_tile = types.MethodType(_do_buffer_tile, elewise_sch)
    elewise_sch.do_schedule([res], schedule, [])
    schedule = schedule[0]
    schedule.cce_special = {"tensor_list": (), "orign_out_tensor": [res], "real_out_tensor": [res]}

    # build operator
    config = {"name": kernel_name,
              "tensor_list": (data_x, res)}
    te.lang.cce.cce_build_code(schedule, config)
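
A minimal call sketch, assuming the TE/TBE build environment and fill_v2_compute are available; the output dict, shape and kernel name below are illustrative:

# illustrative only: build a kernel that fills a (2, 3) float32 tensor with 1.5
y = {"shape": (2, 3), "dtype": "float32"}
fill_v2_d(y, value=1.5, shape=[2, 3], kernel_name="fill_v2_d_demo")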
Example #5
def minmax_update_perchannel(x,
                             min_val,
                             max_val,
                             min_up,
                             max_up,
                             ema,
                             ema_decay,
                             channel_axis,
                             kernel_name="minmax_update_perchannel"):
    """MinMaxUpdatePerChannel op"""
    x_shape = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1], channel_axis_ need change to 1.
    if channel_axis == 0 and x_shape[0] != min_shape[0] and x_shape[
            1] == min_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape[channel_axis_])
    util.check_shape_rule(max_shape, 1, 1, x_shape[channel_axis_])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    if channel_axis_ == 0:
        shape_c = min_val.get("ori_shape")
    else:
        shape_c = [min_val.get("shape")[1], min_val.get("shape")[-1]]
    input_data = tvm.placeholder(x.get("shape"), name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res_list = minmax_update_perchannel_compute(input_data, min_data, max_data,
                                                ema, ema_decay, channel_axis_)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example #6
def fake_quant_per_layer(x,
                         min_val,
                         max_val,
                         y,
                         symmetric,
                         narrow_range,
                         num_bits,
                         kernel_name="fake_quant_per_layer"):
    """FakeQuantPerLayer"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    quant_min = 0
    quant_max = 2**num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_per_layer_compute(input_data, min_data, max_data, y,
                                       quant_min, quant_max, symmetric,
                                       kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, min_data, max_data, res]
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example #7
def custom_Concat(shapes, dtype, axis, kernel_name="concat", need_build=False, need_print=False):
    """
    concatenate the input tensors along the given axis

    Parameters
    ----------
    shapes : input shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, support uint8, int8, int32, float16, float32

    axis : concat axis

    kernel_name : cce kernel name, default value is "concat"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """

    
    util.check_kernel_name(kernel_name)

    for i in range(len(shapes)):
        util.check_shape_rule(shapes[i])

    sum_dim = 0
    for shape in shapes:
        sum_dim += functools_reduce(lambda x, y: x*y, shape)

    if sum_dim > 2**31-1:
        raise RuntimeError("shape exceed 32bit limitation")

    check_list = ["uint8", "int8", "float16", "float32", "int32"]
    if not (dtype.lower() in check_list):
        raise RuntimeError(
            "concat_cce only support %s while dtype is %s" % (",".join(check_list), dtype))

    inp_dtype = dtype.lower()
    data = []
    for i in range(len(shapes)):
        shape = shapes[i]
        data.append(tvm.placeholder(shape, name="data_%d" % i, dtype=inp_dtype))

    with tvm.target.cce():
        res = te.lang.cce.concat(data, axis)
        sch = generic.auto_schedule(res)

    data.append(res)

    config = {"print_ir": need_print,
              "need_build": need_build,
              "name": kernel_name,
              "tensor_list": data}

    te.lang.cce.cce_build_code(sch, config)
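
A minimal call sketch, assuming the TE/TBE environment; the shapes and kernel name are illustrative, and concatenation along axis 1 turns (2, 3) and (2, 5) into (2, 8):

# illustrative only: concatenate two float16 tensors along axis 1
custom_Concat([(2, 3), (2, 5)], "float16", axis=1,
              kernel_name="concat_demo", need_build=True, need_print=False)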
Example #8
def sqrt(input_x, output_y, kernel_name="sqrt"):
    """
    calculating data

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "sqrt"

    Returns
    -------
    None
    """

    """
    TODO:
    Please refer to the TE DSL Manual, And code here with TE DSL.
    """

    """
    TODO:
    operator check
    """

    """
    TODO:
    operator compute, invoke sqrt_compute
    """
    print("=================当你看到这句话时,说明我这个自定义sqrt算子被执行了============================")
    shape = input_x.get("shape")
    dtype = input_x.get("dtype")
    input_dtype = dtype.lower()

    util.check_shape_rule(shape)
    util.check_tensor_shape_size(shape)
    util.check_kernel_name(kernel_name)

    data_input = tvm.placeholder(shape, name="data_input", dtype=input_dtype)
    res = sqrt_compute(data_input, output_y, kernel_name)

    """
    TODO:
    auto schedule
    """
    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    """
    TODO:
    operator build
    """
    config = {"name": kernel_name,
              "tensor_list": [data_input, res]}

    te.lang.cce.cce_build_code(schedule, config)
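
A minimal call sketch of the dict-style interface, assuming the TBE/TE environment and sqrt_compute are available; shapes, dtypes and kernel name are illustrative:

# illustrative only: build an element-wise sqrt kernel for a (16, 16) float16 tensor
input_x = {"shape": (16, 16), "dtype": "float16"}
output_y = {"shape": (16, 16), "dtype": "float16"}
sqrt(input_x, output_y, kernel_name="sqrt_demo")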
Example #9
def custom_equal(shape_x, shape_y, dtype, kernel_name="cce_tf_equal", need_build=False,
                 need_print=False):
    """
    do element-wise equal operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input x

    shape_y : shape of input y

    dtype : source data type, support float16, float32, int32, int8, uint8, bool

    kernel_name : cce kernel name, default value is "cce_tf_equal"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["float16", "float32", "int32", "int8", "uint8", "bool"]

    dtype = dtype.lower()
    if not (dtype in check_list):
        raise RuntimeError(
            "tf_equal_cce only support %s while dtype is %s" % (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)

    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    x_tmp = te.lang.cce.broadcast(x, shape_max)
    y_tmp = te.lang.cce.broadcast(y, shape_max)

    res = tvm.compute(shape_max, lambda *i: x_tmp(*i) == y_tmp(*i), name='res')

    sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [x, y, res], simple_mode=True))

    if need_build:
        with build_config:
            tvm.build(sch, [x, y, res], "cce", name=kernel_name)
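
For reference, a small NumPy sketch (independent of the CCE code above) of element-wise equal with the same broadcasting behaviour that util.produce_shapes sets up:

import numpy as np

x = np.array([[1, 2, 3]], dtype=np.int32)      # shape (1, 3)
y = np.array([[1], [2]], dtype=np.int32)       # shape (2, 1)
# both operands broadcast to (2, 3) before the element-wise compare
print(np.equal(x, y))
# [[ True False False]
#  [False  True False]]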
Example #10
def decode_cornerpoints_target_bg(keypoints_prediction,
                                  anchors,
                                  keypoints_decoded,
                                  kernel_name="decode_cornerpoints_target_bg"):
    """
    The params check function of decode_cornerpoints_target_bg

    Parameters:
    ----------
    Returns : All transformed params.
    ----------
    """
    tik_instance = tik.Tik(tik.Dprofile(), True)

    util.check_kernel_name(kernel_name)

    check_decode_cornerpoints_target_bg_params(keypoints_prediction, anchors,
                                               keypoints_decoded)
    init_shape = InitShape(keypoints_prediction, anchors, keypoints_decoded)

    total_handling_times, last_handling_n = tiling_func(init_shape.shape_x[0])

    init_first_tensor = InitFirstTensor(tik_instance, init_shape)

    with tik_instance.for_range(0, total_handling_times -
                                CONFIG_ONE) as current_handling_times:
        n_x = SINGLE_N_MAX

        init_number = InitNumber(n_x)

        with tik_instance.new_stmt_scope():
            init_second_tensor = InitsecondTensor(tik_instance, init_shape,
                                                  init_number)

            init_third_tensor = InitThirdTensor(tik_instance, init_shape,
                                                init_number)

            calculate_process(tik_instance, init_number, init_first_tensor,
                              init_second_tensor, init_third_tensor,
                              current_handling_times)

    n_x = last_handling_n
    init_number = InitNumber(n_x)

    with tik_instance.new_stmt_scope():
        init_second_tensor = InitsecondTensor(tik_instance, init_shape,
                                              init_number)

        init_third_tensor = InitThirdTensor(tik_instance, init_shape,
                                            init_number)

        calculate_process(tik_instance, init_number, init_first_tensor,
                          init_second_tensor, init_third_tensor,
                          total_handling_times - CONFIG_ONE)

    tik_instance.BuildCCE(
        kernel_name=kernel_name,
        inputs=[init_first_tensor.data_x, init_first_tensor.data_y],
        outputs=[init_first_tensor.data_z])
Example #11
def custom_subtract(shape_x,
                    shape_y,
                    dtype,
                    kernel_name="cce_subtract",
                    need_build=True,
                    need_print=True):
    """
    do element-wise subtract operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input data1

    shape_y : shape of input data2

    dtype : source data type, support float16,float32,int32

    kernel_name : cce kernel name, default value is "cce_subtract"

    need_build : if need to build CCEC kernel, default value is True

    need_print : if need to print the ir, default value is True

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    dtype = dtype.lower()
    if not (dtype in check_list):
        raise RuntimeError(
            "tf_subtract_cce only support %s while dtype is %s" %
            (",".join(check_list), dtype))
    print("######## shape")
    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    data1 = tvm.placeholder(shape_x, dtype=dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=dtype, name="data2")

    with tvm.target.cce():
        data1_tmp1 = te.lang.cce.broadcast(data1, shape_max)
        data2_tmp1 = te.lang.cce.broadcast(data2, shape_max)
        res = te.lang.cce.vsub(data1_tmp1, data2_tmp1)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": [data1, data2, res]
    }
    te.lang.cce.cce_build_code(sch, config)
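
A NumPy reference of the broadcast-then-subtract semantics implemented above (illustrative, not part of the operator code):

import numpy as np

a = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)   # shape (1, 3)
b = np.array([[10.0], [20.0]], dtype=np.float32)    # shape (2, 1)
# both inputs broadcast to the common shape (2, 3) before the subtract
print(a - b)
# [[ -9.  -8.  -7.]
#  [-19. -18. -17.]]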
Example #12
    def check_param(self):
        """
        check the parameters
        :param var_out:
        :return:
        """
        var_out_shape = self.var_out.get("shape")
        var_out_dtype = self.var_out.get("dtype").lower()
        if var_out_dtype == "bool":
            var_out_dtype = "int8"
        util.check_kernel_name(self.kernel_name)
        util.check_shape_rule(self.var_shape)
        util.check_shape_rule(self.indices_shape)
        util.check_shape_rule(self.updates_shape)
        util.check_shape_rule(var_out_shape)

        util.check_tensor_shape_size(self.var_shape)
        util.check_tensor_shape_size(self.indices_shape)
        util.check_tensor_shape_size(self.updates_shape)
        util.check_tensor_shape_size(var_out_shape)

        check_list_var = ("float16", "float32", "int32", "int8", "uint8")
        check_list_indices = "int32"
        util.check_dtype_rule(self.var_dtype, check_list_var)
        util.check_dtype_rule(self.indices_dtype, check_list_indices)
        util.check_dtype_rule(self.updates_dtype, check_list_var)
        util.check_dtype_rule(var_out_dtype, check_list_var)

        if var_out_shape != self.var_shape:
            raise RuntimeError(
                "var_out's shape must be the same as var's shape")

        if (self.updates_dtype != self.var_dtype
                or var_out_dtype != self.var_dtype):
            raise RuntimeError(
                "updates's datatype and var_out's datatype must be the"
                " same as var's datatype")

        if self.nd_flag:
            if len(self.indices_shape) < 2:
                raise RuntimeError(
                    "the length of indices_shape must be at least 2")
            k = self.indices_shape[-1]
            updates_len = len(self.indices_shape) - 1 + len(self.var_shape) - k
            if k > len(self.var_shape):
                raise RuntimeError(
                    "indices_shape[-1] can not be larger than var's rank")
            if len(self.updates_shape) != updates_len:
                raise RuntimeError("the length of updates must be len(indices_"
                                   "shape)-1+len(var_shape)-indices_shape[-1]")
            updates_true_shape = self.indices_shape[:-1] + self.var_shape[k:]
        else:
            updates_true_shape = (self.var_shape[:self.axis] +
                                  self.indices_shape +
                                  self.var_shape[self.axis + 1:])

        if self.updates_shape != updates_true_shape:
            raise RuntimeError("updates's shape is illegal")
Example #13
def fake_learned_scale_quant_perlayer(
        input_x,
        alpha,
        quant_max,
        out,
        neg_trunc,
        kernel_name="fake_learned_scale_quant_perlayer"):
    """FakeLearnedScaleQuantPerLayer"""
    input_shape = input_x.get("shape")
    input_dtype = input_x.get("dtype")
    alpha_shape = alpha.get("ori_shape")
    alpha_dtype = alpha.get("dtype")
    quant_max_shape = quant_max.get("ori_shape")
    quant_max_dtype = quant_max.get("dtype")

    alpha_shape = util.scalar2tensor_one(alpha_shape)
    quant_max_shape = util.scalar2tensor_one(quant_max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(alpha_shape, 1, 1, 1)
    util.check_shape_rule(quant_max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(alpha_shape)
    util.check_tensor_shape_size(quant_max_shape)

    check_list = ["float32", "float16"]
    input_dtype = input_dtype.lower()
    alpha_dtype = alpha_dtype.lower()
    quant_max_dtype = quant_max_dtype.lower()
    util.check_dtype_rule(input_dtype, check_list)
    util.check_dtype_rule(alpha_dtype, check_list)
    util.check_dtype_rule(quant_max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )

    input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
    alpha_data = tvm.placeholder(alpha_shape,
                                 name="alpha_data",
                                 dtype=alpha_dtype)
    quant_max_data = tvm.placeholder(quant_max_shape,
                                     name="quant_max_data",
                                     dtype=quant_max_dtype)
    res = fake_learned_scale_quant_perlayer_compute(input_data, alpha_data,
                                                    quant_max_data, neg_trunc,
                                                    kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, alpha_data, quant_max_data, res]
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list,
        "bool_storage_as_1bit": False
    }

    te.lang.cce.cce_build_code(sch, config)
Example #14
def fake_quant_perchannel(x, min_val, max_val, y,
                          symmetric, narrow_range, num_bits, channel_axis,
                          kernel_name="fake_quant_perchannel"):
    """FakeQuantPerChannel"""
    x_shape = x.get("shape")
    x_shape_ = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1], channel_axis_ need change to 1.
    if channel_axis == 0 and x_shape_[0] != min_shape[0] and x_shape_[1] == min_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape_[channel_axis_])
    util.check_shape_rule(max_shape, 1, 1, x_shape_[channel_axis_])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    quant_min = 0
    quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    shape_c = [1] * len(x_shape)
    shape_c[channel_axis_] = min_val.get("ori_shape")[0]
    if x_format == "NC1HWC0" and channel_axis_ == 1:
        shape_c = min_val.get("shape")
    input_data = tvm.placeholder(x_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res = fake_quant_perchannel_compute(input_data, min_data, max_data, y,
                                        quant_min, quant_max, symmetric, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(sch, config)
Example #15
def leaky_relu_grad(g, x, y, negative_slope=0, kernel_name="leaky_relu_grad"):
    """
    calculate the backpropagation of leaky_relu operation
    y = gradients(x>0) or negative_slope*gradients(x<=0).
    support dtype:float16,float32

    Parameters
    ----------
    g : dict
        the backpropagated gradients to the corresponding leaky_relu operation
    x : dict
        the x passed as output of leaky_relu operation
    y : dict
        the output of leaky_relu back propagation
    negative_slope : float or int
        allow non-zero slope for negative inputs to speed up optimization
    kernel_name : str
        kernel name, default value is "leaky_relu_grad"

    Returns
    -------
    None
    """

    shape_g = g.get("shape")
    shape_x = x.get("shape")
    dtype_g = g.get("dtype").lower()
    dtype_x = x.get("dtype").lower()

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_g)
    util.check_shape_rule(shape_x)
    util.check_tensor_shape_size(shape_g)
    util.check_tensor_shape_size(shape_x)

    shape_list = util.produce_shapes(shape_g, shape_x)
    util.check_tensor_shape_size(shape_list[2])

    # check input tensor data_type
    check_list = ["float16", "float32"]
    util.check_dtype_rule(dtype_g, check_list)
    util.check_dtype_rule(dtype_x, check_list)
    util.compare_tensor_dict_key(g, x, "dtype")

    shape_g, shape_x = refine_shapes_for_broadcast(shape_list[0],
                                                   shape_list[1])
    data_g = tvm.placeholder(shape_g, name="data_g", dtype=dtype_g)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_g)
    res = leaky_relu_grad_compute(data_g, data_x, y, negative_slope,
                                  kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_g, data_x, res]}

    te.lang.cce.cce_build_code(schedule, config)
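
A NumPy sketch of the documented gradient rule, y = g where x > 0 and negative_slope * g where x <= 0 (illustrative only):

import numpy as np

g = np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32)
x = np.array([-2.0, -0.5, 0.0, 3.0], dtype=np.float32)
negative_slope = 0.1
# pass the gradient through where x > 0, scale it by the slope elsewhere
print(np.where(x > 0, g, negative_slope * g))   # [0.1 0.1 0.1 1. ]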
Example #16
def custom_l2_loss(shape,
                   dtype,
                   kernel_name="cce_tf_l2_loss",
                   need_build=False,
                   need_print=False):
    """
    Computes half the L2 norm of a tensor without the sqrt:
    output = sum(t ** 2) / 2

    Parameters
    ----------
    shape : shape of data

    dtype : source data type, only support float16, float32

    kernel_name : cce kernel name, default value is "cce_tf_l2_loss"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    util.check_reduce_shape_rule(shape)
    check_list = ["float16", "float32"]
    if not dtype.lower() in check_list:
        raise RuntimeError("tf_l2_loss_cce only support %s while dtype is %s" %
                           (",".join(check_list), dtype))

    shape, axis = util.simplify_axis_shape(shape, range(len(shape)))

    inp_dtype = dtype.lower()
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    coeff_sqrt = tvm.const(1.0 / (2**(0.5)), dtype=inp_dtype)

    data_mul = te.lang.cce.vmuls(data_input, coeff_sqrt)
    data_sqr = te.lang.cce.vmul(data_mul, data_mul)
    res = te.lang.cce.sum(data_sqr, axis)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": [data_input, res]
    }
    te.lang.cce.cce_build_code(sch, config)
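
A NumPy reference of the value this kernel computes, output = sum(t ** 2) / 2; note the kernel folds the division by 2 into the square by pre-multiplying with 1/sqrt(2):

import numpy as np

t = np.array([1.0, 2.0, 3.0], dtype=np.float32)
# half the squared L2 norm: (1 + 4 + 9) / 2 = 7.0
print(np.sum(t ** 2) / 2)
# equivalent to the kernel's formulation: sum((t / sqrt(2)) ** 2)
print(np.sum((t / np.sqrt(2)) ** 2))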
Example #17
def custom_sign(shape,
                dtype,
                kernel_name="cce_custom_sign",
                need_build=False,
                need_print=False):
    """
                                  x*32768
    algrithm: sign = round(-------------------------)
                            2 ** (-15) + |x*32768|

    calculating data type is float16

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype,
            only support float16, float32, int32

    kernel_name : cce kernel name, default value is "cce_custom_sign"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    if not dtype.lower() in check_list:
        raise RuntimeError(
            "custom_sign_cce only support %s while dtype is %s" %
            (",".join(check_list), dtype))

    shape = util.shape_refine(shape)
    inp_dtype = dtype.lower()
    data = tvm.placeholder(shape, name="data", dtype=inp_dtype)
    with tvm.target.cce():
        res = custom_sign_compute([data], shape, dtype, kernel_name,
                                  need_build, need_print)

        sch = generic.auto_schedule(res)

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": [data, res]
    }
    te.lang.cce.cce_build_code(sch, config)
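
A NumPy sketch of the rounding formula from the docstring, sign = round(x*32768 / (2**(-15) + |x*32768|)) (illustrative only):

import numpy as np

x = np.array([-0.5, 0.0, 2.0], dtype=np.float32)
scaled = x * 32768.0
# the tiny 2**(-15) term keeps the denominator non-zero at x == 0
print(np.round(scaled / (2.0 ** -15 + np.abs(scaled))))   # [-1.  0.  1.]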
Example #18
def correction_mul_grad(dout, x, batch_std, running_std, dx, mul_dx, channel, kernel_name="correction_mul_grad"):
    """CorrectionMulGrad op"""
    shape_dout = dout.get("shape")
    shape_x = x.get("shape")

    dtype_dout = dout.get("dtype")
    dtype_x = x.get("dtype")
    dtype_batch_std = batch_std.get("dtype")
    dtype_running_std = running_std.get("dtype")

    inp_dtype_dout = dtype_dout.lower()
    inp_dtype_x = dtype_x.lower()
    inp_dtype_batch_std = dtype_batch_std.lower()
    inp_dtype_running_std = dtype_running_std.lower()

    util.check_dtype_rule(inp_dtype_dout, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_x, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_batch_std, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_running_std, ("float16", "float32"))
    util.compare_tensor_dict_key(dout, x, "dtype")
    util.compare_tensor_dict_key(dout, x, "shape")
    util.compare_tensor_dict_key(dx, x, "shape")
    util.compare_tensor_dict_key(batch_std, running_std, "shape")
    util.compare_tensor_dict_key(dx, mul_dx, "shape")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)

    data_format = dout.get("format")
    ori_format = dout.get("ori_format")
    if data_format.upper() not in ("NC1HWC0", "NCHW"):
        raise RuntimeError("Unsupported data format {}".format(data_format))
    if data_format.upper() == "NCHW" and ori_format != "NCHW":
        raise RuntimeError("data_format(NCHW) must be the same as ori_format")

    shape_c = [1] * len(shape_x)
    shape_c[channel] = batch_std.get("ori_shape")[0]
    if data_format == "NC1HWC0" and channel == 1:
        shape_c = batch_std.get("shape")

    dout_t = tvm.placeholder(shape_dout, name="dout", dtype=inp_dtype_dout)
    x_t = tvm.placeholder(shape_x, name="x", dtype=inp_dtype_x)
    batch_std_t = tvm.placeholder(shape_c, name="batch_std", dtype=inp_dtype_batch_std)
    running_std_t = tvm.placeholder(shape_c, name="running_std", dtype=inp_dtype_running_std)
    res_list = correction_mul_grad_compute(dout_t, x_t, batch_std_t, running_std_t, channel, data_format, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [dout_t, x_t, batch_std_t, running_std_t] + res_list
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(sch, config)
Example #19
def custom_logical_not(shape,
                       dtype,
                       kernel_name="cce_tf_logical_not",
                       need_build=False,
                       need_print=False):
    """
    logical not for the input tensor

    Parameters
    ----------
    shape : input shape of data

    dtype : the data type, support bool

    kernel_name : cce kernel name, default value is "cce_logical_not"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    check_list = ["bool"]
    if dtype.lower() not in check_list:
        raise RuntimeError(
            "logical_not_cce only supports %s while dtype is %s" %
            (",".join(check_list), dtype))

    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    data = tvm.placeholder(shape, name="data", dtype=inp_dtype)

    with tvm.target.cce():

        result = tvm.compute(
            shape,
            # `data[i] is True` was a Python identity check that never holds
            # for a tvm expression; compare the tensor value explicitly instead
            lambda *i: tvm.select(data(*i) == tvm.const(True, inp_dtype),
                                  tvm.const(False, inp_dtype),
                                  tvm.const(True, inp_dtype)),
            name="result")

        schedule = tvm.create_schedule(result.op)

        if need_print:
            with build_config:
                print(tvm.lower(schedule, [data, result], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(schedule, [data, result], "cce", name=kernel_name)
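
For reference, the element-wise semantics implemented above, in NumPy (illustrative only):

import numpy as np

x = np.array([True, False, True])
print(np.logical_not(x))   # [False  True False]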
Example #20
def minmax_update_perlayer(x,
                           min_val,
                           max_val,
                           min_up,
                           max_up,
                           ema,
                           ema_decay,
                           kernel_name="minmax_update_perlayer"):
    """MinMaxUpdatePerLayer op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = minmax_update_perlayer_compute(input_data, min_data, max_data,
                                              ema, ema_decay)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example #21
def hwcn_2_fractal_z_c04(src,
                         dst,
                         src_format,
                         dst_format,
                         kernel_name="hwcn_2_fractal_z_c04"):
    """
    algorithm: hwcn_2_fractal_z_c04

    Parameters
    ----------
    src: dict
        dict with keys(shape, dtype) of src
    dst: dict
        dict with keys(shape, dtype) of dst
    src_format: str
        data format of src
    dst_format: str
        data format of dst
    kernel_name: str
        kernel name, default value is "hwcn_2_fractal_z_c04"

    Returns
    -------
    tik_instance: tik_instance
    """
    src_shape = src.get("shape")
    src_dtype = src.get("dtype").lower()
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(src_shape)
    util.check_tensor_shape_size(src_shape)
    check_list = ("float16")
    util.check_dtype_rule(src_dtype, check_list)
    if len(src_shape) != 4:
        raise RuntimeError("hwcn_2_fractal_z_c04 only support 4D "
                           "while src shape is %s" % ", ".join(src_shape))

    if src_shape[2] > 4:
        raise RuntimeError("hwcn_2_fractal_z_c04 only support C <= 4 "
                           "while src shape is %s" % ", ".join(src_shape))

    if src_format.upper() != "HWCN":
        raise RuntimeError("hwcn_2_fractal_z_c04 only support %s "
                           "while src format is %s" % ("HWCN", src_format))

    if dst_format.upper() != "FRACTAL_Z_C04":
        raise RuntimeError("hwcn_2_fractal_z_c04 only support %s "
                           "while dst format is %s" %
                           ("FRACTAL_Z_C04", dst_format))

    src_shape = list(src_shape)

    hwcn_2_fractal_z_c04_template = HWCN2FRACTALZC04Compute(
        src_shape, src_dtype, kernel_name)
    return hwcn_2_fractal_z_c04_template.get_tik_instance()
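
A hypothetical call sketch, assuming the Ascend TIK toolchain; src must be 4-D HWCN float16 with C <= 4, and the dst dict and shapes below are illustrative placeholders only:

# illustrative only: convert an HWCN float16 tensor with C = 4
src = {"shape": (3, 3, 4, 16), "dtype": "float16"}
dst = {"shape": (9, 1, 16, 16), "dtype": "float16"}   # placeholder FRACTAL_Z_C04 shape
tik_instance = hwcn_2_fractal_z_c04(src, dst, "HWCN", "FRACTAL_Z_C04",
                                    kernel_name="hwcn_2_fractal_z_c04_demo")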
Example #22
def addcdiv(x1, x2, x3, y=None, alpha=1.0, kernel_name="addcdiv"):

    check_list = ("float16", "float32")

    shape_x1 = x1.get("shape")
    dtype_x1 = x1.get("dtype").lower()

    shape_x2 = x2.get("shape")
    dtype_x2 = x2.get("dtype").lower()

    shape_x3 = x3.get("shape")
    dtype_x3 = x3.get("dtype").lower()

    util.check_shape_rule(shape_x1)    # check the shape: the number of dimensions must be between 1 and 8
    util.check_shape_size(shape_x1, SHAPE_SIZE_LIMIT)    # check the size of the first input shape
    util.check_dtype_rule(dtype_x1, check_list)    # check the input data type

    util.check_shape_rule(shape_x2)
    util.check_shape_size(shape_x2, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(dtype_x2, check_list)

    util.check_shape_rule(shape_x3)
    util.check_shape_size(shape_x3, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(dtype_x3, check_list)

    if dtype_x1 != dtype_x2 or dtype_x1 != dtype_x3:
        raise RuntimeError("the type of x1, x2, x3 must be the same!")

    util.check_kernel_name(kernel_name)    # check the kernel_name

    # take the larger value of each dimension of shape_x1, shape_x2 and shape_x3 as shape_max
    shape_x2, shape_x3, shape_max = broadcast_shapes(shape_x2, shape_x3)
    util.check_tensor_shape_size(shape_max)     # check shape_max
    shape_x1, _, shape_max = broadcast_shapes(shape_x1, shape_max)
    util.check_tensor_shape_size(shape_max)     # check shape_max
    shape_x2, _, _ = broadcast_shapes(shape_x2, shape_max)    # broadcast shape_x2 to shape_max
    shape_x3, _, _ = broadcast_shapes(shape_x3, shape_max)    # broadcast shape_x3 to shape_max

    data_x1 = tvm.placeholder(shape_x1, name="data_x1", dtype=dtype_x1)
    data_x2 = tvm.placeholder(shape_x2, name="data_x2", dtype=dtype_x2)
    data_x3 = tvm.placeholder(shape_x3, name="data_x3", dtype=dtype_x3)

    res = addcdiv_compute(data_x1, data_x2, data_x3, shape_max, alpha, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_x1, data_x2, data_x3, res]}

    te.lang.cce.cce_build_code(schedule, config)
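
The listing does not spell out the arithmetic done by addcdiv_compute; assuming the conventional addcdiv semantics (as in torch.addcdiv), x1 + alpha * x2 / x3, a NumPy sketch would be:

import numpy as np

x1 = np.array([1.0, 2.0], dtype=np.float32)
x2 = np.array([4.0, 9.0], dtype=np.float32)
x3 = np.array([2.0, 3.0], dtype=np.float32)
alpha = 0.5
# assumed semantics: x1 + alpha * x2 / x3
print(x1 + alpha * x2 / x3)   # [2.  3.5]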
Example #23
def _shape_and_dtype_check(x, y_grad, target, weight, total_weight, reduction,
                           kernel_name):
    x_shape = x.get("shape")
    x_dtype = x.get("dtype").lower()
    y_grad_shape = y_grad.get("shape")
    y_grad_dtype = y_grad.get("dtype").lower()
    target_shape = target.get("shape")
    target_dtype = target.get("dtype").lower()
    total_weight_shape = total_weight.get("shape")
    total_weight_dtype = total_weight.get("dtype").lower()
    weight_shape = weight.get("shape")
    weight_dtype = weight.get("dtype").lower()
    util.check_tensor_shape_size(weight_shape)
    util.check_shape_rule(weight_shape)

    util.check_shape_rule(x_shape)
    util.check_shape_rule(y_grad_shape)
    util.check_shape_rule(target_shape)
    util.check_tensor_shape_size(y_grad_shape)
    util.check_tensor_shape_size(target_shape)

    util.check_kernel_name(kernel_name)
    util.check_dtype_rule(x_dtype, "float32")
    util.check_dtype_rule(y_grad_dtype, "float32")
    util.check_dtype_rule(target_dtype, "int32")
    util.check_dtype_rule(weight_dtype, "float32")
    util.check_dtype_rule(total_weight_dtype, "float32")

    if reduction in ("mean", "sum") and y_grad_shape[0] != 1:
        raise RuntimeError("The shape of y_grad must be (1,),"
                           " while reduction is mean or sum. ")
    if len(x_shape) == 1 and y_grad_shape[0] != 1:
        raise RuntimeError("The shape of y_grad must be (1,),"
                           " while input x is 1D. ")
    if len(x_shape) > DIM2:
        raise RuntimeError("The dimension of x should be equal to"
                           " or less than two.")
    if len(x_shape) == DIM2 and x_shape[0] != target_shape[0]:
        raise RuntimeError("The first dimension of x and"
                           " target should be equal")
    if x_shape[-1] != weight_shape[0]:
        raise RuntimeError("The last dimension of x and the first dimension"
                           " of weight should be equal")
    if len(y_grad_shape) != 1:
        raise RuntimeError("The dimension of y_grad should be 1D.")
    if len(weight_shape) != 1:
        raise RuntimeError("The dimension of weight should be 1D.")
    if len(target_shape) != 1:
        raise RuntimeError("The dimension of target should be 1D.")
    if total_weight_shape[0] != 1:
        raise RuntimeError("The shape of total_weight must be (1,)")
Example #24
def batchnorm_fold2(x,
                    beta,
                    gamma,
                    batch_std,
                    batch_mean,
                    running_std,
                    y,
                    kernel_name="batchnorm_fold2"):
    """_BatchNormFold2 op"""
    shape = x.get("shape")
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    check_list = ["float16", "float32"]
    inp_dtype = x.get("dtype").lower()
    if inp_dtype not in check_list:
        raise RuntimeError("Dtype of input only supports float16, float32")
    data_format = x.get("format")
    ori_format = x.get("ori_format")
    if data_format.upper() not in ("NC1HWC0", "NCHW"):
        raise RuntimeError("Unsupported data format {}".format(data_format))
    if data_format.upper() == "NCHW" and ori_format != "NCHW":
        raise RuntimeError("data_format(NCHW) must be the same as ori_format")
    shape_c = gamma.get("shape")
    if gamma.get("format").upper() == "NCHW":
        shape_c = 1, gamma.get("shape")[0], 1, 1
    x_t = tvm.placeholder(shape, name="x", dtype=inp_dtype)
    beta_t = tvm.placeholder(shape_c, name="beta", dtype=inp_dtype)
    gamma_t = tvm.placeholder(shape_c, name="gamma", dtype=inp_dtype)
    batch_std_t = tvm.placeholder(shape_c, name="batch_std", dtype=inp_dtype)
    batch_mean_t = tvm.placeholder(shape_c, name="batch_mean", dtype=inp_dtype)
    running_std_t = tvm.placeholder(shape_c,
                                    name="running_std",
                                    dtype=inp_dtype)

    res = batchnorm_fold2_compute(x_t, beta_t, gamma_t, batch_std_t,
                                  batch_mean_t, running_std_t, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [x_t, beta_t, gamma_t, batch_std_t, batch_mean_t,
                              running_std_t, res]}

    te.lang.cce.cce_build_code(sch, config)
Example #25
def decode_cornerpoints_target_wrt_center_v1(
        keypoints_prediction,
        anchors,
        keypoints_decoded,
        kernel_name="cce_decode_cornerpoints_target_wrt_center_v1_float16"):
    """
    The params check function of decode_wheels_target

    Parameters:
    ----------
    Returns : All transformed params.
    ----------
    """
    check_decode_cornerpoints_target_wrt_center_v1_shape_params(
        keypoints_prediction,
        anchors,
        keypoints_decoded)
    util.check_kernel_name(kernel_name)
    shape_x = keypoints_prediction.get("shape")

    tik_instance = tik.Tik(tik.Dprofile(), True)
    core_num = tik.Dprofile().get_aicore_num()
    tiling = Tiling(shape_x[0], core_num)
    # gm_tensor init
    gm_tensor = InitTensor(tik_instance, shape_x, [shape_x[0], FOUR], 'float16')
    if tiling.factor > 0:
        thread_num = TWO if tiling.factor != ONE else ONE
        with tik_instance.for_range(0, core_num, block_num=core_num) as current_core:
            with tik_instance.for_range(0, tiling.factor, thread_num=thread_num) as current_factor:
                shape = InitShape(SINGLE_N_MAX)
                current_data_x = EIGHT * SINGLE_N_MAX * (current_core + core_num * current_factor)
                current_data_y = FOUR * SINGLE_N_MAX * (current_core + core_num * current_factor)
                calculate_process(tik_instance, gm_tensor, shape, current_data_x, current_data_y)
    if tiling.last_core > 0:
        thread_num = TWO if tiling.last_core != ONE else ONE
        with tik_instance.for_range(0, tiling.last_core, thread_num=thread_num) as current_core:
            shape = InitShape(SINGLE_N_MAX)
            current_data_x = EIGHT * SINGLE_N_MAX * (core_num * tiling.factor + current_core)
            current_data_y = FOUR * SINGLE_N_MAX * (core_num * tiling.factor + current_core)
            calculate_process(tik_instance, gm_tensor, shape, current_data_x, current_data_y)
    if tiling.last_n > 0:
        shape = InitShape(tiling.last_n)
        current_data_x = EIGHT * SINGLE_N_MAX * (core_num * tiling.factor + tiling.last_core)
        current_data_y = FOUR * SINGLE_N_MAX * (core_num * tiling.factor + tiling.last_core)
        calculate_process(tik_instance, gm_tensor, shape, current_data_x, current_data_y)

    # build_cce
    tik_instance.BuildCCE(
        kernel_name=kernel_name,
        inputs=[gm_tensor.data_x, gm_tensor.data_y],
        outputs=[gm_tensor.data_z])
Example #26
def custom_Tile(shape, dtype, tiles, axis=1, kernel_name="cce_caffe_tile_layer",
                         need_build=False, need_print=False):
    """Operation and Schedule for tilelayer, construct an array by axis and tiles.

    Parameters
    ----------
    shape: shape of Tensor

    dtype: the data type. only support float16, float32, int32, int8, uint8

    tiles: the number of copies (tiles) of the tensor to output

    axis: the index of the axis to tile

    kernel_name: cce kernel name, default value is "cce_caffe_tile_layer"

    need_build: if need to build CCEC kernel, default value is False

    need_print: if need to print the ir, default value is False

    Returns
    -------
        None
    """
    check_list = ["float16", "float32", "int32", "int8", "uint8"]
    if not (dtype.lower() in check_list):
        raise RuntimeError(
            "caffe_tile_layer only support %s while dtype is %s" % (",".join(check_list), dtype))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    if type(axis) != int:
        raise RuntimeError("type of axis value should be int")
    if axis >= len(shape) or axis < -len(shape):
        raise RuntimeError(
            "input axis is out of range, axis value can be from %d to %d" % (
                -len(shape), len(shape) - 1))

    if type(tiles) != int:
        raise RuntimeError("type of tiles must be int.")
    if tiles < 0:
        raise RuntimeError("Number of tiles must be non-negative.")

    multiples = [1]*len(shape)

    multiples[axis] = tiles

    tf_tile.tf_tile_cce(shape, dtype, multiples, kernel_name=kernel_name, need_build=need_build,
                        need_print=need_print)
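
A NumPy illustration of what the multiples vector built above does, repeating the tensor tiles times along axis (illustrative only):

import numpy as np

x = np.arange(6, dtype=np.float32).reshape(2, 3)
axis, tiles = 1, 2
multiples = [1] * x.ndim
multiples[axis] = tiles
print(np.tile(x, multiples).shape)   # (2, 6)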
Example #27
def check_param(x, grad, argmax, y, ksize, strides, padding, dtype, dilation,
                ceil_mode, kernel_name):
    """
    check whether the parameters are valid; raise an error if any is invalid
    Parameters
    ----------
    x: dict,shape and datatype
    grad: dict,shape and datatype
    argmax: dict,shape and datatype
    y: dict,shape and datatype
    ksize: kernel or window size, minimum length is 4,
          just like [1, poolingWindowH, poolingWindowW, 1]
    strides: stride, minimum length is 4,
          just like [1, poolingStrideH, poolingStrideW, 1]
    padding: pad mode
    Returns
    -------
    None
    """
    y_shape = x.get("shape")
    y_dtype = x.get("dtype").lower()
    y_dtype_arg = y.get("dtype").lower()
    input_gard_shape = grad.get("shape")
    grad_dtype = grad.get("dtype").lower()
    argmax_shape = argmax.get("shape")
    argmax_dtype = argmax.get("dtype").lower()
    util.check_shape_rule(y_shape)
    util.check_shape_rule(input_gard_shape)
    util.check_shape_rule(argmax_shape)
    util.check_kernel_name(kernel_name)
    check_shape_5hd(y_shape)
    check_shape_5hd(input_gard_shape)
    util.check_tensor_shape_size(input_gard_shape)
    util.check_tensor_shape_size(argmax_shape)
    util.check_tensor_shape_size(y_shape)
    util.check_dtype_rule(grad_dtype, ("float16", "float32", "int32"))
    util.check_dtype_rule(argmax_dtype, ("uint16",))
    util.check_dtype_rule(y_dtype, ("float16", "float32", "int32"))

    if y_dtype != grad_dtype or y_dtype_arg != y_dtype:
        raise RuntimeError("The dtype of tensor must be same")

    if dtype != DT_INT32 and dtype != DT_INT64:
        raise RuntimeError(
            "The dtype of input max indice must be int32 or int64")

    check_output_dim_with_ksize_stride(padding, input_gard_shape, y_shape,
                                       ksize, strides, dilation, ceil_mode)
Example #28
def batchnorm_fold_grad(d_batch_mean, d_batch_std, x, batch_mean, batch_std, dx,
                        epsilon=1e-5, is_training=True, freeze_bn=0, kernel_name="batchnorm_fold_grad"):
    """batchnorm_fold_grad op """
    util.check_kernel_name(kernel_name)
    for iv in (d_batch_mean, d_batch_std, x, batch_mean, batch_std):
        util.check_shape_rule(iv.get("shape"))
        util.check_tensor_shape_size(iv.get("shape"))
    check_tuple = ("float16", "float32")
    for iv in (d_batch_mean, d_batch_std, x, batch_mean, batch_std):
        util.check_dtype_rule(iv.get("dtype").lower(), check_tuple)

    shape_x = x.get("shape")
    dtype_x = x.get("dtype")
    format_data = x.get("format").upper()
    if format_data not in ("NCHW", "NC1HWC0"):
        raise RuntimeError("Format of input only support 4D and 5HD")

    shape_mean = d_batch_mean.get("shape")
    dtype_mean = d_batch_mean.get("dtype").lower()
    if format_data == "NC1HWC0":
        if len(shape_x) != 5:
            raise RuntimeError("batchnorm_fold only support shape 5D "
                               "when input format is NC1HWC0")
        shape_mean = (1, shape_x[1], 1, 1, shape_x[4])
    elif format_data == "NCHW":
        if len(shape_x) < 2 or len(shape_x) > 4:
            raise RuntimeError("batchnorm_fold only support shape 2D to 4D")
        if shape_x[1] != shape_mean[0]:
            raise RuntimeError("data_format is NCHW, shape_bias must "
                               "be equal to the second axis of shape_x")
        shape_mean = (1, shape_x[1],)
        for _ in range(2, len(shape_x)):
            shape_mean = shape_mean + (1,)

    d_batch_mean = tvm.placeholder(shape_mean, name="d_batch_mean", dtype=dtype_mean)
    d_batch_std = tvm.placeholder(shape_mean, name="d_batch_std", dtype=dtype_mean)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_x.lower())
    batch_mean = tvm.placeholder(shape_mean, name="batch_mean", dtype=dtype_mean)
    batch_std = tvm.placeholder(shape_mean, name="batch_std", dtype=dtype_mean)

    res = _batchnorm_fold_grad_compute(d_batch_mean, d_batch_std, data_x, batch_mean, batch_std)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [d_batch_mean, d_batch_std, data_x, batch_mean, batch_std] + res
    config = {"name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
Example #29
def segment_max_d(x, y, segment_ids, kernel_name="segment_max_d"):
    """
    Operation and Schedule for segment_max


    Parameters
    ----------
    x : dict
        shape and dtype of input
    y: dict
        shape and dtype of output
    segment_ids : list
        should be the size of the first dimension
    kernel_name: str
        kernel name, default value is "segment_max_d"

    Returns
    -------
        None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    if dtype.lower() not in check_list:
        raise RuntimeError("segment_max only support float16, float32, int32")

    # when shape[0] > FIRST_DIM_SIZE_THRESHOLD, the default stack space
    # may not be enough, so prompt the user
    if shape[0] > FIRST_DIM_SIZE_THRESHOLD:
        print("Default stack space may not be enough. "
              "You should increase the stack space.")

    dtype = dtype.lower()

    _check_segment_ids(shape, segment_ids)

    input_data = tvm.placeholder(shape, name="input_data", dtype=dtype)
    with tvm.target.cce():
        res = segment_max_d_compute(input_data, y, segment_ids, kernel_name)
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [input_data, res]}
    te.lang.cce.cce_build_code(sch, config)
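
A rough NumPy illustration of the documented semantics, where segment_ids carries one id per row of the first dimension and each output row is the maximum over its segment (illustrative only):

import numpy as np

x = np.array([[1.0, 2.0], [3.0, 0.0], [5.0, 4.0]], dtype=np.float32)
segment_ids = [0, 0, 1]
out = np.full((max(segment_ids) + 1, x.shape[1]), -np.inf, dtype=x.dtype)
for row, seg in enumerate(segment_ids):
    out[seg] = np.maximum(out[seg], x[row])
print(out)   # [[3. 2.] [5. 4.]]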
Example #30
def _check_parameters(src, dst, src_format, dst_format, kernel_name):
    """
    check the parameters including src_shape, dst_shape,
    src_format, dst_format, dtype and kernel_name

    """
    src_shape = src.get("shape")
    dst_shape = dst.get("shape")
    dtype = src.get("dtype")
    dtype_dst = dst.get("dtype")

    if src_format.lower() != "ndhwc":
        raise RuntimeError("src_format must be NDHWC !")

    if dst_format.lower() != "ndc1hwc0":
        raise RuntimeError("dst_format must be NDC1HWC0!")

    util.check_kernel_name(kernel_name)
    check_list = ("float16", )
    util.check_dtype_rule(dtype, check_list)
    if dtype != dtype_dst:
        raise RuntimeError("dtype of src and dst are different !")

    util.check_shape_rule(src_shape, 5, 5)
    util.check_shape_rule(dst_shape, 6, 6)
    util.check_tensor_shape_size(src_shape)
    util.check_tensor_shape_size(dst_shape)

    if dst_shape[5] != 16:
        raise RuntimeError(
            "the last dimension of dst_shape is not 16, c0 must be 16 !")

    if dst_shape[0] != src_shape[0]\
            or (dst_shape[1] != src_shape[1]
                and dst_shape[1] != src_shape[1] + 2)\
            or dst_shape[3] != src_shape[2] or dst_shape[4] != src_shape[3]:
        raise RuntimeError("the shape of src and dst not match, "
                           "the 1st,2nd,4th,5th dimension of dst_shape and "
                           "the 1st,2nd,3rd,4th dimension of src_shape "
                           "must be the same !")
    c_dst = src_shape[4]

    c_1 = dst_shape[2]
    c_0 = dst_shape[5]
    if not ((c_dst <= c_1 * c_0) and (c_dst > (c_1 - 1) * c_0)):
        raise RuntimeError("c must be less than or equal to c1*c0, "
                           "and greater than (c1 - 1)*c0!")