Example #1
def _get_data_gm(shapes, dtype):
    """
    get placeholders of data_dy, data_x, data_variance, data_mean and data_gamma

    Parameters
    ----------
    shapes: dict
        {"shape_dy": shape_dy, "shape_x": shape_x, "shape_var": shape_variance,
         "shape_mean": shape_mean, "shape_gamma": shape_gamma}
    dtype: str
        the data type

    Returns
    -------
    data_gm: tuple
        (data_dy, data_x, data_variance, data_mean, data_gamma)
    """
    data_dy = tvm.placeholder(shapes.get("shape_dy"),
                              name="data_dy", dtype=dtype)
    data_x = tvm.placeholder(shapes.get("shape_x"),
                             name="data_x", dtype=dtype)
    data_variance = tvm.placeholder(shapes.get("shape_var"),
                                    name="data_variance", dtype=dtype)
    data_mean = tvm.placeholder(shapes.get("shape_mean"),
                                name="data_mean", dtype=dtype)
    data_gamma = tvm.placeholder(shapes.get("shape_gamma"),
                                 name="data_gamma", dtype=dtype)

    data_gm = (data_dy, data_x, data_variance, data_mean, data_gamma)

    return data_gm
Example #2
def correction_mul(x, batch_std, running_std, y, channel, kernel_name="correction_mul"):
    """CorrectionMul op"""
    shape = x.get("shape")
    data_format = x.get("format")
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    check_list = ["float16", "float32"]
    inp_dtype = x.get("dtype").lower()
    if inp_dtype not in check_list:
        raise RuntimeError("Dtype of input only support float16, float32")

    # shape = util.shape_refine(shape)
    x_t = tvm.placeholder(shape, name="x", dtype=inp_dtype)
    shape_c = [1] * len(shape)
    shape_c[channel] = batch_std.get("ori_shape")[0]
    if data_format == "NC1HWC0" and channel == 1:
        shape_c = batch_std.get("shape")
    batch_std_t = tvm.placeholder(shape_c, name="batch_std", dtype=inp_dtype)
    running_std_t = tvm.placeholder(shape_c, name="running_std", dtype=inp_dtype)
    res = correction_mul_compute(x_t, batch_std_t, running_std_t, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [x_t, batch_std_t, running_std_t, res]}

    te.lang.cce.cce_build_code(sch, config)
Example #3
def optional_weight(tensor_list, predict_shape, dtype_list, weight,
                    pos_weight):
    weight_data = None
    pos_weight_data = None
    if weight is not None:
        weight_shape = weight.get("shape")
        weight_dtype = weight.get("dtype").lower()
        op_utils.check_dtype(weight_dtype, dtype_list)
        _broadcast_shape_check(weight_shape, predict_shape)

        weight_shape = tuple(
            [1] *
            (len(predict_shape) - len(weight_shape))) + tuple(weight_shape)
        weight_data = tvm.placeholder(weight_shape,
                                      weight_dtype,
                                      name="weight_data")
        tensor_list.append(weight_data)

    if pos_weight is not None:
        pos_weight_shape = pos_weight.get("shape")
        pos_weight_dtype = pos_weight.get("dtype").lower()

        op_utils.check_dtype(pos_weight_dtype, dtype_list)
        _broadcast_shape_check(pos_weight_shape, predict_shape)

        pos_weight_shape = tuple([1] *
                                 (len(predict_shape) - len(pos_weight_shape))
                                 ) + tuple(pos_weight_shape)
        pos_weight_data = tvm.placeholder(pos_weight_shape,
                                          pos_weight_dtype,
                                          name="pos_weight_data")
        tensor_list.append(pos_weight_data)

    return weight_data, pos_weight_data
Example #4
def squared_difference(x1, x2, y, kernel_name="squared_difference"):
    """
    algorithm: squared_difference

    calculating data's tf_squared_difference, y = (x1 - x2) * (x1 - x2)

    Parameters
    ----------
    x1 : dict
        shape and dtype of x1 input, only support float16, float32, int32
    x2 : dict
        shape and dtype of x2 input, only support float16, float32, int32
    y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is squared_difference

    Returns
    -------
    None
    """
    shape_x = x1.get("shape")
    shape_y = x2.get("shape")
    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    check_list = ["float16", "float32", "int32"]
    dtype = x1.get("dtype").lower()

    if dtype not in check_list:
        raise RuntimeError(
            "tf_squared_difference_cce only support float16, float32, int32")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                   shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")

    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(shape_x, dtype=dtype, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=dtype, name="data_y")

    with tvm.target.cce():
        shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                       shape_y,
                                                       param_name_input1="x1",
                                                       param_name_input2="x2")
        data_x_tmp = te.lang.cce.broadcast(data_x, shape_max)
        data_y_tmp = te.lang.cce.broadcast(data_y, shape_max)
        data_sub = te.lang.cce.vsub(data_x_tmp, data_y_tmp)
        res = te.lang.cce.vmul(data_sub, data_sub)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_x, data_y, res]
    }

    te.lang.cce.cce_build_code(sch, config)
Example #5
def diag_part_d(x, assist, y, kernel_name="diag_part_d"):
    """
    Returns the batched diagonal part of a batched tensor

    Parameters
    ----------
    x: dict
        dict of x, include keys(shape and dtype)
    assist: dict
        dict of the helper matrix; its diagonal values are 1 and all other values are 0
    y: dict
        dict of output
    kernel_name: str
        cce kernel name, default value is "diag_part_d"

    Returns
    -------
    None
    """
    shape_x = x.get("shape")
    dtype_x = x.get("dtype")
    shape_assist = assist.get("shape")
    dtype_assist = assist.get("dtype")
    shape_y = y.get("shape")

    check_shape(shape_x, param_name="x")
    check_shape(shape_assist, param_name="assist")

    if len(shape_x) not in (2, 4, 6, 8):
        raise RuntimeError("Input tensors of rank 2,4,6,8 are supported!")
    if list(shape_x) != list(shape_assist):
        raise RuntimeError("the shape of data must be equal!")
    len_shape_out = len(shape_x) // VALUE_TWO
    for i in range(len_shape_out):
        if shape_x[i] != shape_x[i + len_shape_out]:
            raise RuntimeError("the shape of input is not supported!")
    if list(shape_x) != list(shape_y + shape_y):
        raise RuntimeError("the shape of output is not supported!")

    check_list = ("float16", "float32", "int32")
    dtype_x = dtype_x.lower()
    check_dtype(dtype_x, check_list, param_name="x")
    dtype_assist = dtype_assist.lower()
    check_dtype(dtype_assist, check_list, param_name="assist")
    if dtype_assist != dtype_x:
        raise RuntimeError("the dtype of data must be equal!")

    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_x)
    data_assist = tvm.placeholder(shape_assist,
                                  name="data_assist",
                                  dtype=dtype_assist)

    res = diag_part_d_compute(data_x, data_assist, y, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x, data_assist, res]}
    te.lang.cce.cce_build_code(sch, config)
Example #6
def minmax_update_perchannel(x,
                             min_val,
                             max_val,
                             min_up,
                             max_up,
                             ema,
                             ema_decay,
                             channel_axis,
                             kernel_name="minmax_update_perchannel"):
    """MinMaxUpdatePerChannel op"""
    x_shape = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1], channel_axis_ needs to change to 1.
    if channel_axis == 0 and x_shape[0] != min_shape[0] and x_shape[
            1] == min_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape[channel_axis_])
    util.check_shape_rule(max_shape, 1, 1, x_shape[channel_axis_])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    if channel_axis_ == 0:
        shape_c = min_val.get("ori_shape")
    else:
        shape_c = [min_val.get("shape")[1], min_val.get("shape")[-1]]
    input_data = tvm.placeholder(x.get("shape"), name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res_list = minmax_update_perchannel_compute(input_data, min_data, max_data,
                                                ema, ema_decay, channel_axis_)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example #7
    def _conv3dbp_input_achieve_with_tvm():
        dedy = tvm.placeholder(shape_dedy,
                               name="dedy",
                               dtype=out_backprop_dtype)
        shape_filter_ncdhw = [
            filter_batch, filter_channel, filter_depth, filter_h, filter_w
        ]

        filters = tvm.placeholder(shape_filter_frac,
                                  name="filter",
                                  dtype=filter_dtype)

        dedx = te.lang.cce.conv3d_backprop_input_compute(
            filters=filters,
            out_backprop=dedy,
            filter_sizes=shape_filter_ncdhw,
            input_sizes=input_sizes,
            strides=strides,
            padding=pads,
            dilations=dilations,
            res_dtype=res_dtype,
            kernel_name=kernel_name)
        tensor_list = [filters, dedy, dedx]

        with tvm.target.cce():
            sch = generic.auto_schedule(dedx)

        config = {"name": kernel_name, "tensor_list": tensor_list}
        te.lang.cce.cce_build_code(sch, config)
Example #8
def fake_quant_per_layer(x,
                         min_val,
                         max_val,
                         y,
                         symmetric,
                         narrow_range,
                         num_bits,
                         kernel_name="fake_quant_per_layer"):
    """FakeQuantPerLayer"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    quant_min = 0
    quant_max = 2**num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_per_layer_compute(input_data, min_data, max_data, y,
                                       quant_min, quant_max, symmetric,
                                       kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, min_data, max_data, res]
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
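The per-layer compute function itself is not shown above. As a rough, hedged illustration of what fake quantization with the quant_min/quant_max bounds computed above commonly does, here is a minimal NumPy sketch; the name fake_quant_reference and the scale/rounding scheme are assumptions, not the operator's actual implementation:

import numpy as np

def fake_quant_reference(x, min_val, max_val, quant_min, quant_max):
    # assumed scheme: map [min_val, max_val] onto the integer grid, round, map back
    scale = (max_val - min_val) / (quant_max - quant_min)
    q = np.clip(np.round((x - min_val) / scale) + quant_min, quant_min, quant_max)
    return (q - quant_min) * scale + min_val

# usage with num_bits=8, narrow_range=False: quant_min=0, quant_max=255
x = np.array([-1.5, 0.0, 0.7, 3.0], dtype=np.float32)
print(fake_quant_reference(x, min_val=-1.0, max_val=1.0, quant_min=0, quant_max=255))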
Example #9
def bn_infer_grad(grads, scale, batch_variance,
                  x_backprop, epsilon=0.0001,
                  kernel_name="bn_infer_grad"):
    """
    algorithm: fused_batch_norm_grad_v2
    bn_infer_grad.

    Parameters
    ----------
    grads: dict
        dict of grads, A 5D Tensor for input grads.
    scale: dict
        dict of scale, A 5D Tensor for input scale.
    batch_variance: dict
        dict of batch_variance, A 5D Tensor for input batch_variance.
    x_backprop: dict
        dict of x_backprop, A 5D Tensor for output x_backprop.
    epsilon: float
        A small float number added to the variance of x. Defaults to `0.0001`.
    kernel_name: str
        kernel name, default value is "bn_infer_grad"

    Returns
    -------
    None
    """

    shape_grads = grads.get("shape")
    shape_scale = scale.get("shape")
    shape_batch_variance = batch_variance.get("shape")

    input_grads_dtype = grads.get("dtype").lower()
    input_scale_dtype = scale.get("dtype").lower()
    batch_variance_dtype = batch_variance.get("dtype").lower()

    check_dtype(input_grads_dtype, ("float32", "float16"), param_name="grads")
    check_dtype(input_scale_dtype, ("float32",), param_name="scale")
    check_dtype(batch_variance_dtype, ("float32",), param_name="batch_variance")

    _check_shape(shape_grads, shape_batch_variance)
    util.compare_tensor_dict_key(scale, batch_variance, "shape")

    grads_input = tvm.placeholder(shape_grads, name="grads_input",
                                  dtype=input_grads_dtype)
    scale_input = tvm.placeholder(shape_scale, name="x_input",
                                  dtype=input_scale_dtype)
    batch_variance_input = tvm.placeholder(shape_batch_variance,
                                           name="batch_variance_input",
                                           dtype=batch_variance_dtype)

    res = bn_infer_grad_compute(grads_input, scale_input,
                                batch_variance_input,
                                x_backprop, epsilon,
                                kernel_name=kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    tensor_list = [grads_input, scale_input, batch_variance_input, res]
    config = {"name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
Example #10
def logical_or(x1, x2, y, kernel_name="logical_or"):
    """
    algorithm : logical_or
    calculating the value of x1 OR x2 element-wise

    Parameters
    ----------
    x1 : the dict of x1,
         include shape and dtype,
         dtype support int8, the value only support 0, 1

    x2 : the dict of x2,
         include shape and dtype,
         dtype support int8, the value only support 0, 1

    y : the dict of y, include shape and dtype

    kernel_name : string, cce kernel name, default value is "logical_or"

    Returns
    -------
    None
    """

    shape_x1 = x1.get("shape")
    shape_x2 = x2.get("shape")
    dtype_x1 = x1.get("dtype")
    dtype_x2 = x2.get("dtype")
    if dtype_x1 == "bool" or dtype_x2 == "bool":
        dtype_x1 = "int8"
        dtype_x2 = "int8"

    check_shape(shape_x1, param_name="x1")
    check_shape(shape_x2, param_name="x2")

    check_tuple = ("int8", )
    check_dtype(dtype_x1, check_tuple, param_name="x1")
    check_dtype(dtype_x2, check_tuple, param_name="x2")

    shape_x1, shape_x2, shape_max = broadcast_shapes(shape_x1,
                                                     shape_x2,
                                                     param_name_input1="x1",
                                                     param_name_input2="x2")
    dtype = dtype_x1.lower()
    data_x1 = tvm.placeholder(shape_x1, name="data_x1", dtype=dtype)
    data_x2 = tvm.placeholder(shape_x2, name="data_x2", dtype=dtype)

    res = logical_or_compute(data_x1, data_x2, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "need_build": False,
        "name": kernel_name,
        "tensor_list": (data_x1, data_x2, res)
    }
    te.lang.cce.cce_build_code(schedule, config)
Example #11
def custom_subtract(shape_x,
                    shape_y,
                    dtype,
                    kernel_name="cce_subtract",
                    need_build=True,
                    need_print=True):
    """
    do element-wise subtract operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input data1

    shape_y : shape of input data2

    dtype : source data type, support float16,float32,int32

    kernel_name : cce kernel name, default value is "cce_subtract"

    need_build : if need to build CCEC kernel, default value is True

    need_print : if need to print the ir, default value is True

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_subtract_cce only support %s while dtype is %s" %
            (",".join(check_list), dtype))
    print("######## shape")
    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    data1 = tvm.placeholder(shape_x, dtype=dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=dtype, name="data2")

    with tvm.target.cce():
        data1_tmp1 = te.lang.cce.broadcast(data1, shape_max)
        data2_tmp1 = te.lang.cce.broadcast(data2, shape_max)
        res = te.lang.cce.vsub(data1_tmp1, data2_tmp1)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": [data1, data2, res]
    }
    te.lang.cce.cce_build_code(sch, config)
Example #12
def mul(x, y, output, kernel_name="mul"):
    """
    do element-wise mul operation between two input tensors

    Parameters:
    ----------
    x : dict.
        shape, dtype of input x
    y : dict.
        shape, dtype of input y
    output : dict.
        shape, dtype of output
    kernel_name : str.
        cce kernel name, default value is "mul"

    Returns
    -------
    None
    """
    # format_pattern = 1  Nz and vector
    # format_pattern = 2  vector and Nz
    # format_pattern = 0  Nz scalar  Nz Nz  ND ND
    format_pattern = _mul_check_format(x, y)
    shape_x, shape_y = _infer_shape(format_pattern, x, y)

    shape_x = util.scalar2tensor_one(shape_x)
    dtype_x = x.get("dtype").lower()
    shape_y = util.scalar2tensor_one(shape_y)
    dtype_y = y.get("dtype").lower()

    op_utils.check_shape(shape_x, param_name="x")
    op_utils.check_shape(shape_y, param_name="y")

    if dtype_x != dtype_y:
        raise RuntimeError("dtype of inputs should be consistent")
    dtype = dtype_x
    check_list = ("int32", "float16", "float32", "int16")
    op_utils.check_dtype(dtype, check_list, param_name="x")

    vmul_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmul", "float32")
    if dtype_x == "float32" and not vmul_support:
        raise RuntimeError(
            "Input dtype is float32, but do not support on the platform")

    shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
        shape_x, shape_y, param_name_input1="x", param_name_input2="y")

    shape_x, shape_y = op_utils.refine_shapes_for_broadcast(shape_x, shape_y)
    input_x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    input_y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    res = _mul_compute(input_x, input_y, output, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (input_x, input_y, res)}
    te.lang.cce.cce_build_code(sch, config)
Example #13
def atan_grad(y, dy, z, kernel_name="atan_grad"):
    """
    Gradient calculation for atan(x)

    Parameters:
    ----------
    y : dict of y, include shape and dtype, dtype support float16, float32
    dy : dict of dy, include shape and dtype, dtype support float16, float32
    z : dict of output, include shape and dtype
    kernel_name : cce kernel name, default value is atan_grad

    Algorithm :
    ----------
    forward :
        y = atan(x)
    backward gradient :
        de/dx = dy/dx*de/dy = 1/(1+x^2)*grad

    Returns
    ----------
    None
    """

    # get the shape and dtype
    shape = y.get("shape")
    shape_grad = dy.get("shape")
    dtype = y.get("dtype")
    dtype_grad = dy.get("dtype")

    # check whether kernel name is unique

    # check whether the shape is right
    check_shape(shape, param_name="y")
    check_shape(shape_grad, param_name="dy")
    if not operator.eq(shape, shape_grad):
        raise RuntimeError("all input shape must be the same")
    shape, _ = refine_shape_axes(shape, [])

    # check whether dtypes are fp16,fp32 and whether they are the same
    check_list = ("float16", "float32")
    check_dtype(dtype, check_list, param_name="y")
    check_dtype(dtype_grad, check_list, param_name="dy")
    dtype = dtype.lower()
    if dtype != dtype_grad.lower():
        raise RuntimeError("all input dtype must be same")

    # get 2 input placeholders: data_input, grad
    data_input = tvm.placeholder(shape, name="input_data", dtype=dtype)
    grad = tvm.placeholder(shape, name="input_grad", dtype=dtype)

    # compute the backward gradient
    res = atan_grad_compute(data_input, grad, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_input, grad, res]}
    te.lang.cce.cce_build_code(sch, config)
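For reference, a minimal NumPy sketch of the gradient formula stated in the docstring, de/dx = grad / (1 + x^2); the name atan_grad_reference and the sample values are illustrative only, not part of the operator above:

import numpy as np

def atan_grad_reference(x, grad):
    # elementwise: de/dx = grad / (1 + x^2)
    return grad / (1.0 + np.square(x))

x = np.array([0.0, 1.0, -2.0], dtype=np.float32)
print(atan_grad_reference(x, np.ones_like(x)))  # [1.0, 0.5, 0.2]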
Example #14
def custom_equal(shape_x, shape_y, dtype, kernel_name="cce_tf_equal", need_build=False,
                 need_print=False):
    """
    do element-wise equal operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input x

    shape_y : shape of input y

    dtype : source data type, support float16,float32,int32,int8,uint8

    kernel_name : cce kernel name, default value is "cce_tf_equal"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["float16", "float32", "int32", "int8", "uint8", "bool"]

    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_equal_cce only support %s while dtype is %s" % (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)

    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    x_tmp = te.lang.cce.broadcast(x, shape_max)
    y_tmp = te.lang.cce.broadcast(y, shape_max)

    res = tvm.compute(shape_max, lambda *i: x_tmp(*i) == y_tmp(*i), name='res')

    sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [x, y, res], simple_mode=True))

    if need_build:
        with build_config:
            tvm.build(sch, [x, y, res], "cce", name=kernel_name)
Example #15
def gelu_grad(input_dy, input_x, input_y, output_z, kernel_name="gelu_grad"):
    """
    algorithm: gelu_grad
    calculating: dy*res'
    res' = res/x +
           x*0.5*(1 - tanh(math_four)*tanh(math_four))*
           np.sqrt(2 / np.pi)*(1 + 3*0.044715*x^2)
    math_four = np.sqrt(2 / np.pi)*(x + 0.044715*x^3)

    Parameters
    ----------
    input_dy : dict
        shape and dtype of dy input, only support float16, float32
    input_x : dict
        shape and dtype of x input, only support float16, float32
    input_y : dict
        shape and dtype of y input, only support float16, float32
    output_z: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is gelu_grad

    Returns:
    -------
    none.
    """
    shape_dy = input_dy.get("shape")
    shape_x = input_x.get("shape")
    shape_y = input_y.get("shape")

    check_shape(shape_dy, param_name="input_dy")
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")
    input_dtype = input_dy.get("dtype").lower()
    check_list = ("float16", "float32")
    check_dtype(input_dtype, check_list, param_name="input_dy")
    shape_dy = list(shape_dy)
    shape_x = list(shape_x)
    shape_y = list(shape_y)
    if not (operator.eq(shape_dy, shape_x) and operator.eq(shape_dy, shape_y)):
        raise RuntimeError("all input shape must be equal")

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape_dy)
    data_dy = tvm.placeholder(fuseshape, name="data_dy", dtype=input_dtype)
    data_x = tvm.placeholder(fuseshape, name="data_x", dtype=input_dtype)
    data_gelu = tvm.placeholder(fuseshape, name="data_gelu", dtype=input_dtype)
    res = gelu_grad_compute(data_dy, data_x, data_gelu, output_z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_dy, data_x, data_gelu, res]
    }

    te.lang.cce.cce_build_code(sch, config)
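For reference, a minimal NumPy transcription of the docstring formula (dy * res'); the name gelu_grad_reference is illustrative only, and like the docstring form it divides by x, so x is assumed nonzero:

import numpy as np

def gelu_grad_reference(dy, x, y):
    # y is the forward gelu output ("res" in the docstring)
    math_four = np.sqrt(2.0 / np.pi) * (x + 0.044715 * np.power(x, 3))
    res_grad = y / x + x * 0.5 * (1.0 - np.tanh(math_four) ** 2) \
        * np.sqrt(2.0 / np.pi) * (1.0 + 3.0 * 0.044715 * np.square(x))
    return dy * res_grad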
Example #16
def softplus_grad(input_gradients,
                  input_features,
                  output_backprops,
                  kernel_name="softplus_grad"):
    """
    Computes softplus gradients for a softplus operation.
    The gradients: "dy * exp(x) / (1 + exp(x))".

    Parameters
    ----------
    input_gradients: dict
        The backpropagated gradients to the corresponding softplus operation.
    input_features: dict
        The input_features passed as input to the corresponding softplus operation.
        source data type support "float16", "float32", "int32", "int8", "uint8".
    output_backprops: dict
        data of output.
    kernel_name: str
        kernel name, default value is "softplus_grad".

    Returns
    -------
    None
    """
    shape_dy = input_gradients.get("shape")
    dtype_dy = input_gradients.get("dtype")
    shape_x = input_features.get("shape")
    dtype_x = input_features.get("dtype")

    if dtype_dy.lower() != dtype_x.lower():
        raise RuntimeError("type of dy and type of x must be same, \
             while the types are different")
    dtype = dtype_dy

    check_shape(shape_dy, param_name="input_gradients")
    check_shape(shape_x, param_name="input_features")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    input_dtype = dtype.lower()
    check_dtype(input_dtype, check_list, param_name="input_gradients")
    shape_dy, shape_x, shape_max = broadcast_shapes(
        shape_dy,
        shape_x,
        param_name_input1="input_gradients",
        param_name_input2="input_features")
    reshape_dy, reshape_x = refine_shapes_for_broadcast(shape_dy, shape_x)

    data_dy = tvm.placeholder(reshape_dy, name="data_dy", dtype=input_dtype)
    data_x = tvm.placeholder(reshape_x, name="data_x", dtype=input_dtype)

    res = softplus_grad_compute(data_dy,
                                data_x,
                                output_backprops,
                                kernel_name=kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_dy, data_x, res]}
    te.lang.cce.cce_build_code(sch, config)
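For reference, a minimal NumPy sketch of the gradient from the docstring, dy * exp(x) / (1 + exp(x)) (i.e. dy * sigmoid(x)); the name softplus_grad_reference and the sample values are illustrative only:

import numpy as np

def softplus_grad_reference(dy, x):
    # elementwise: dy * exp(x) / (1 + exp(x))
    return dy * np.exp(x) / (1.0 + np.exp(x))

x = np.array([-1.0, 0.0, 1.0], dtype=np.float32)
print(softplus_grad_reference(np.ones_like(x), x))  # ~[0.269, 0.5, 0.731]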
Example #17
def floor_mod(x1, x2, y, kernel_name="floor_mod"):
    """
    calculate the remainder of division, support fp16,fp32,int32
    res = x1 - floor(x1 / x2) * x2

    Parameters
    ----------
    x1: dict
        dict{"shape":tuple or list,"dtype":str}
        shape of data
        the data type, src_dtype equals dst_dtype, support fp16,fp32,int32
    x2: dict
        dict{"shape":tuple or list,"dtype":str}
        shape of data
        the data type, src_dtype equals dst_dtype, support fp16,fp32,int32
    y: dict, reserved field
        dict with keys(shape and dtype) of output
    kernel_name: str
        cce kernel name, default value is "floor_mod"

    Returns
    ------
    None
    """
    # get dtype and shape attributes
    dtype_x = x1.get("dtype").lower()
    shape_x = x1.get("shape")
    dtype_y = x2.get("dtype").lower()
    shape_y = x2.get("shape")

    # check_kernel_name & shape
    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    # check input tensor data_type
    check_list = ("float16", "float32", "int32")
    check_dtype(dtype_x, check_list, param_name="x1")
    check_dtype(dtype_y, check_list, param_name="x2")

    if dtype_x != dtype_y:
        raise RuntimeError("the type of dtype in two dict is not the same")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                   shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)

    input_data_x = tvm.placeholder(shape_x, name="input_data_x", dtype=dtype_x)
    input_data_y = tvm.placeholder(shape_y, name="input_data_y", dtype=dtype_y)
    res = floor_mod_compute(input_data_x, input_data_y, y, kernel_name)
    with tvm.target.cce():
        auto_sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [input_data_x, input_data_y, res]
    }
    te.lang.cce.cce_build_code(auto_sch, config)
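For reference, a minimal NumPy sketch of the docstring formula res = x1 - floor(x1 / x2) * x2 (the floor-mod convention, matching np.mod for nonzero x2); floor_mod_reference is an illustrative name, not the compute function used above:

import numpy as np

def floor_mod_reference(x1, x2):
    # elementwise: x1 - floor(x1 / x2) * x2
    return x1 - np.floor(x1 / x2) * x2

print(floor_mod_reference(np.array([5.0, -5.0]), np.array([3.0, 3.0])))  # [2. 1.]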
Example #18
def kl_div(input_x, input_target, output_y, reduction, kernel_name="kl_div"):
    """
    Calculate Kullback-Leibler divergence.

    output_pos = input_target * (log(input_target) - input_x)
    output = where(input_target > 0, output_pos, zeros)
    reduced = reduce_sum_all(output)
    if reduction == "batchmean":
        final_res = reduced / input.dim[0]
    else:
        final_res = reduced
    Parameters
    ----------
    input_x : dict
        shape and dtype of input_x, dtype only support fp16 and fp32.
    input_target : dict
        shape and dtype of input_target. Shape and dtype must be the same as input_x
    output_y : dict
        shape and dtype of output. Dtype must be the same as input_x
    reduction: str
        Specifies the reduction to apply to the output:
        reduction="batchmean" or reduction="sum".
        "batchmean": the sum of the output will be divided by the batchsize
        "sum": the output will be summed
    kernel_name : str
        cce kernel name, default value is "kl_div"

    Returns
    ------
    None
    """
    # check input parameter
    _check_parameter(input_x, input_target)

    shape_x = input_x.get("shape")
    dtype_x = input_x.get("dtype")
    batch_size = shape_x[0]
    shape_one_dim = [functools_reduce(lambda x, y: x * y, shape_x[:])]
    data_x = tvm.placeholder(shape_one_dim, name="data_x", dtype=dtype_x)
    data_target = tvm.placeholder(shape_one_dim,
                                  name="data_target",
                                  dtype=dtype_x)

    final_res = kl_div_compute(data_x,
                               data_target,
                               output_y,
                               reduction,
                               batch_size,
                               kernel_name=kernel_name)
    with tvm.target.cce():
        auto_sch = generic.auto_schedule(final_res)

    config = {
        "name": kernel_name,
        "tensor_list": (data_x, data_target, final_res)
    }

    te.lang.cce.cce_build_code(auto_sch, config)
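For reference, a minimal NumPy sketch of the reduction described in the docstring; kl_div_reference is an illustrative name, not the kl_div_compute used above:

import numpy as np

def kl_div_reference(x, target, reduction="batchmean"):
    with np.errstate(divide="ignore", invalid="ignore"):
        output_pos = target * (np.log(target) - x)
    output = np.where(target > 0, output_pos, 0.0)
    reduced = np.sum(output)
    # "batchmean" divides the summed result by the batch size (first dimension)
    return reduced / x.shape[0] if reduction == "batchmean" else reduced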
Example #19
def fake_learned_scale_quant_perlayer(
        input_x,
        alpha,
        quant_max,
        out,
        neg_trunc,
        kernel_name="fake_learned_scale_quant_perlayer"):
    """FakeLearnedScaleQuantPerLayer"""
    input_shape = input_x.get("shape")
    input_dtype = input_x.get("dtype")
    alpha_shape = alpha.get("ori_shape")
    alpha_dtype = alpha.get("dtype")
    quant_max_shape = quant_max.get("ori_shape")
    quant_max_dtype = quant_max.get("dtype")

    alpha_shape = util.scalar2tensor_one(alpha_shape)
    quant_max_shape = util.scalar2tensor_one(quant_max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(alpha_shape, 1, 1, 1)
    util.check_shape_rule(quant_max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(alpha_shape)
    util.check_tensor_shape_size(quant_max_shape)

    check_list = ["float32", "float16"]
    input_dtype = input_dtype.lower()
    alpha_dtype = alpha_dtype.lower()
    quant_max_dtype = quant_max_dtype.lower()
    util.check_dtype_rule(input_dtype, check_list)
    util.check_dtype_rule(alpha_dtype, check_list)
    util.check_dtype_rule(quant_max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )

    input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
    alpha_data = tvm.placeholder(alpha_shape,
                                 name="alpha_data",
                                 dtype=alpha_dtype)
    quant_max_data = tvm.placeholder(quant_max_shape,
                                     name="quant_max_data",
                                     dtype=quant_max_dtype)
    res = fake_learned_scale_quant_perlayer_compute(input_data, alpha_data,
                                                    quant_max_data, neg_trunc,
                                                    kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, alpha_data, quant_max_data, res]
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list,
        "bool_storage_as_1bit": False
    }

    te.lang.cce.cce_build_code(sch, config)
Example #20
def matrix_diag_part_d(input_diagonal,
                       input_help,
                       output_diagonal,
                       kernel_name="matrix_diag_part_d"):
    """
    Returns the batched diagonal part of a batched tensor

    Parameters
    ----------
    input_diagonal: dict
        dict of input_diagonal, include keys(shape and dtype)
    input_help: dict
        dict of the helper matrix; its diagonal values are 1 and all other values are 0
    output_diagonal: dict
        dict of output
    kernel_name: str
        cce kernel name, default value is "matrix_diag_part_d"

    Returns
    -------
    None
    """
    shape_input_diagonal = input_diagonal.get("shape")
    dtype_input_diagonal = input_diagonal.get("dtype")
    shape_input_help = input_help.get("shape")
    dtype_input_help = input_help.get("dtype")

    check_shape(shape_input_diagonal, param_name="input_diagonal")
    check_shape(shape_input_help, param_name="input_help")

    if len(shape_input_diagonal) < 2:
        raise RuntimeError("Input tensors of rank>=2 are supported!")
    if list(shape_input_diagonal) != list(shape_input_help):
        raise RuntimeError("the shape of data must be equal!")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    dtype_input_diagonal = dtype_input_diagonal.lower()
    check_dtype(dtype_input_diagonal, check_list, param_name="input_diagonal")
    dtype_input_help = dtype_input_help.lower()
    check_dtype(dtype_input_help, check_list, param_name="input_help")

    data_input_diagonal = tvm.placeholder(shape_input_diagonal,
                                          name="data_input_diagonal",
                                          dtype=dtype_input_diagonal)
    data_input_help = tvm.placeholder(shape_input_help,
                                      name="data_input_help",
                                      dtype=dtype_input_help)

    res = matrix_diag_part_d_compute(data_input_diagonal, data_input_help,
                                     output_diagonal, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [data_input_diagonal, data_input_help, res]
    }
    te.lang.cce.cce_build_code(sch, config)
Example #21
def floor_div(input_x, input_y, output_z, kernel_name="floor_div"):
    """
      algorithm: floordiv
      calculating data's floordiv, res = floor(x / y)

      Parameters
      ----------
      input_x: dict
      input_y: dict
      output_z: dict
      kernel_name: str, default value is "floor_div"

      Returns
      -------
      None
    """
    # check dtype of input_x/input_y
    input_dtype_x = input_x.get("dtype").lower()
    input_dtype_y = input_y.get("dtype").lower()
    check_list = ('int8', 'uint8', 'int32', 'float16', 'float32')
    check_dtype(input_dtype_x, check_list, param_name="input_x")
    check_dtype(input_dtype_y, check_list, param_name="input_y")
    check_elewise_shape_range([input_x, input_y], support_broadcast=True)
    if input_dtype_x != input_dtype_y:
        error_info = {}
        error_info['errCode'] = OP_ERROR_CODE_018
        error_info['op_name'] = 'floor_div'
        error_info['param_name1'] = 'input_dtype_x'
        error_info['param_name2'] = 'input_dtype_y'
        error_info['param1_dtype'] = str(input_dtype_x)
        error_info['param2_dtype'] = str(input_dtype_y)
        raise RuntimeError(error_info,
                           "In op[%s], the parameter[%s][%s] are not equal in "
                           "dtype with dtype[%s][%s]." % (
                               error_info['op_name'],
                               error_info['param_name1'],
                               error_info['param_name2'],
                               error_info['param1_dtype'],
                               error_info['param2_dtype']))

    ins = classify([input_x, input_y], Mode.ELEWISE_WITH_BROADCAST)
    schedules, tensors = [], []
    for (input_x, input_y) in ins:
        with te.op.compute():
            x_shape, y_shape = variable_shape([input_x, input_y],
                                              support_broadcast=True)
            x_shape, y_shape = refine_shapes_for_broadcast(x_shape, y_shape)
            tensor_x = tvm.placeholder(x_shape, input_dtype_x, "tensor_x")
            tensor_y = tvm.placeholder(y_shape, input_dtype_y, "tensor_y")
            res = floor_div_compute(tensor_x, tensor_y, output_z, kernel_name)

            tensors.append([tensor_x, tensor_y, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)
Example #22
def fake_quant_perchannel(x, min_val, max_val, y,
                          symmetric, narrow_range, num_bits, channel_axis,
                          kernel_name="fake_quant_perchannel"):
    """FakeQuantPerChannel"""
    x_shape = x.get("shape")
    x_shape_ = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1], channel_axis_ needs to change to 1.
    if channel_axis == 0 and x_shape_[0] != min_shape[0] and x_shape_[1] == min_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape_[channel_axis_])
    util.check_shape_rule(max_shape, 1, 1, x_shape_[channel_axis_])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    quant_min = 0
    quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    shape_c = [1] * len(x_shape)
    shape_c[channel_axis_] = min_val.get("ori_shape")[0]
    if x_format == "NC1HWC0" and channel_axis_ == 1:
        shape_c = min_val.get("shape")
    input_data = tvm.placeholder(x_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res = fake_quant_perchannel_compute(input_data, min_data, max_data, y,
                                        quant_min, quant_max, symmetric, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(sch, config)
Example #23
def inv_grad(input_y, input_dy, output_z, kernel_name="inv_grad"):
    """
    algorithm: inv_grad
    calculating data's reciprocal grad, dx = -1*dy*y*y, where `y = 1/x`, and `dy`
    is the corresponding input gradient.

    Parameters
    ----------
    input_y: dict
        shape and dtype of input_y, only support float16, float32, int32, int8
    input_dy: dict
        shape and dtype of input_dy, should be same shape and type as input_y
    output_z: dict
        shape and dtype of output, should be same shape and type as input_y
    kernel_name: str
        kernel name, default value is "inv_grad"

    Returns
    -------
    None
    """
    shape_input_y = input_y.get("shape")
    shape_input_dy = input_dy.get("shape")
    dtype_input_y = input_y.get("dtype")
    dtype_input_dy = input_dy.get("dtype")

    check_shape(shape_input_y, param_name="input_y")
    check_shape(shape_input_dy, param_name="input_dy")

    shape_input_y = util.shape_refine(shape_input_y)
    shape_input_dy = util.shape_refine(shape_input_dy)

    if list(shape_input_y) != list(shape_input_dy):
        raise RuntimeError("the shape of input must be equal!")

    dtype_input_y = dtype_input_y.lower()
    dtype_input_dy = dtype_input_dy.lower()

    if dtype_input_dy != dtype_input_y:
        raise RuntimeError("the dtype of input must be equal!")

    check_list = ("float16", "float32", "int32", "int8")
    check_dtype(dtype_input_y, check_list, param_name="input_y")

    shape_input_dy, shape_input_y = refine_shapes_for_broadcast(shape_input_dy,
                                                                shape_input_y)
    data_dy = tvm.placeholder(shape_input_dy, name="data_dy",
                              dtype=dtype_input_dy)
    data_y = tvm.placeholder(shape_input_y, name="data_y", dtype=dtype_input_y)

    res = inv_grad_compute(data_y, data_dy, output_z, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_y, data_dy, res]}
    te.lang.cce.cce_build_code(sch, config)
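For reference, a minimal NumPy sketch of the docstring formula dx = -1 * dy * y * y, where y = 1/x; inv_grad_reference is an illustrative name, not the compute function used above:

import numpy as np

def inv_grad_reference(y, dy):
    # elementwise: dx = -dy * y^2
    return -1.0 * dy * y * y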
Example #24
def leaky_relu_grad(g, x, y, negative_slope=0, kernel_name="leaky_relu_grad"):
    """
    calculate the backpropagation of leaky_relu operation
    y = gradients(x>0) or negative_slope*gradients(x<=0).
    support dtype:float16,float32

    Parameters
    ----------
    g : dict
        the backpropagated gradients to the corresponding leaky_relu operation
    x : dict
        the x passed as output of leaky_relu operation
    y : dict
        the output of leaky_relu back propagation
    negative_slope : float or int
        allow non-zero slope for negative inputs to speed up optimization
    kernel_name : str
        kernel name, default value is "leaky_relu_grad"

    Returns
    -------
    None
    """

    shape_g = g.get("shape")
    shape_x = x.get("shape")
    dtype_g = g.get("dtype").lower()
    dtype_x = x.get("dtype").lower()

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_g)
    util.check_shape_rule(shape_x)
    util.check_tensor_shape_size(shape_g)
    util.check_tensor_shape_size(shape_x)

    shape_list = util.produce_shapes(shape_g, shape_x)
    util.check_tensor_shape_size(shape_list[2])

    # check input tensor data_type
    check_list = ["float16", "float32"]
    util.check_dtype_rule(dtype_g, check_list)
    util.check_dtype_rule(dtype_x, check_list)
    util.compare_tensor_dict_key(g, x, "dtype")

    shape_g, shape_x = refine_shapes_for_broadcast(shape_list[0],
                                                   shape_list[1])
    data_g = tvm.placeholder(shape_g, name="data_g", dtype=dtype_g)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_g)
    res = leaky_relu_grad_compute(data_g, data_x, y, negative_slope,
                                  kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_g, data_x, res]}

    te.lang.cce.cce_build_code(schedule, config)
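For reference, a minimal NumPy sketch of the rule in the docstring (pass the gradient through where x > 0, scale it by negative_slope elsewhere); leaky_relu_grad_reference and the sample values are illustrative only:

import numpy as np

def leaky_relu_grad_reference(g, x, negative_slope=0.0):
    # elementwise: g where x > 0, negative_slope * g where x <= 0
    return np.where(x > 0, g, negative_slope * g)

x = np.array([-2.0, 3.0], dtype=np.float32)
print(leaky_relu_grad_reference(np.ones_like(x), x, negative_slope=0.2))  # [0.2 1. ]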
Example #25
def add(input_x, input_y, output_z, kernel_name="add"):
    """
    algorithm: add
    calculating data's add, c = a + b

    Parameters
    ----------
    input_x : dict
        shape and dtype of first input, only support float16, float32, int32
    input_y : dict
        shape and dtype of second input, only support float16, float32, int32
    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is add

    Returns
    -------
    None
    """
    # format_pattern = 1  Nz and vector
    # format_pattern = 2  vector and Nz
    # format_pattern = 0  Nz scalar  Nz Nz  ND ND
    format_pattern = _add_check_format(input_x, input_y)
    shape_x, shape_y = _infer_shape(format_pattern, input_x, input_y)
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    check_tuple = ("float16", "float32", "int32")
    input_data_type = input_x.get("dtype").lower()
    check_dtype(input_data_type, check_tuple, param_name="input_x")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                   shape_y,
                                                   param_name_input1="input_x",
                                                   param_name_input2="input_y")
    if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1:
        shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1]
        shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]

    data_x = tvm.placeholder(shape_x, name="data_1", dtype=input_data_type)
    data_y = tvm.placeholder(shape_y, name="data_2", dtype=input_data_type)

    res = add_compute(data_x, data_y, output_z, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": (data_x, data_y, res)
    }
    te.lang.cce.cce_build_code(schedule, config)
Example #26
def assign_sub(var, value, out, kernel_name='assign_sub'):
    """
    Update var by subtracting value from it.

    Parameters:
    ----------
    var : dict
        dict of input_var, include shape and dtype,
        dtype support int8, uint8, int32, float16, float32

    value : dict
        dict of input_value, include shape and dtype,
        dtype support int8, uint8, int32, float16, float32.
        Must have the same shape and dtype as input_var

    out : dict
        dict of out

    kernel_name : str
        cce kernel name, default value is "assign_sub"

    Returns
    -------
    None
    """

    # get the shape and dtype
    shape_var = var.get("shape")
    shape_value = value.get("shape")
    dtype_var = var.get("dtype")
    dtype_value = value.get("dtype")

    # kernel name check: should be unique

    # check whether the shape is right
    check_shape(shape_var, param_name="var")
    check_shape(shape_value, param_name="value")
    if not operator.eq(shape_var, shape_value):
        raise RuntimeError("all input shape must be the equal")

    # check whether dtypes are fp16, fp32, int8, uint8, int32
    # and whether they are the same
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_var, check_list, param_name="var")
    check_dtype(dtype_value, check_list, param_name="value")
    dtype_var = dtype_var.lower()
    dtype_value = dtype_value.lower()
    if dtype_var != dtype_value:
        raise RuntimeError("all input dtype must be same")

    shape, _ = refine_shape_axes(shape_var, [])
    data_var = tvm.placeholder(shape, dtype=dtype_var, name='data_var')
    data_value = tvm.placeholder(shape, dtype=dtype_value, name='data_value')
    sch, res = _assign_sub_compute(data_var, data_value, out, kernel_name)

    with set_bool_storage_config():
        tvm.build(sch, [data_var, data_value, res], "cce", name=kernel_name)
Example #27
def correction_mul_grad(dout, x, batch_std, running_std, dx, mul_dx, channel, kernel_name="correction_mul_grad"):
    """CorrectionMulGrad op"""
    shape_dout = dout.get("shape")
    shape_x = dout.get("shape")

    dtype_dout = dout.get("dtype")
    dtype_x = x.get("dtype")
    dtype_batch_std = batch_std.get("dtype")
    dtype_running_std = running_std.get("dtype")

    inp_dtype_dout = dtype_dout.lower()
    inp_dtype_x = dtype_x.lower()
    inp_dtype_batch_std = dtype_batch_std.lower()
    inp_dtype_running_std = dtype_running_std.lower()

    util.check_dtype_rule(inp_dtype_dout, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_x, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_batch_std, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_running_std, ("float16", "float32"))
    util.compare_tensor_dict_key(dout, x, "dtype")
    util.compare_tensor_dict_key(dout, x, "shape")
    util.compare_tensor_dict_key(dx, x, "shape")
    util.compare_tensor_dict_key(batch_std, running_std, "shape")
    util.compare_tensor_dict_key(dx, mul_dx, "shape")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)

    data_format = dout.get("format")
    ori_format = dout.get("format")
    if data_format.upper() not in ("NC1HWC0", "NCHW"):
        raise RuntimeError("Un supported data format {}".format(data_format))
    if data_format.upper() == "NCHW" and ori_format != "NCHW":
        raise RuntimeError("data_format(NCHW) must same as ori_format")

    shape_c = [1] * len(shape_x)
    shape_c[channel] = batch_std.get("ori_shape")[0]
    if data_format == "NC1HWC0" and channel == 1:
        shape_c = batch_std.get("shape")

    dout_t = tvm.placeholder(shape_dout, name="dout", dtype=inp_dtype_dout)
    x_t = tvm.placeholder(shape_x, name="x", dtype=inp_dtype_x)
    batch_std_t = tvm.placeholder(shape_c, name="batch_std", dtype=inp_dtype_batch_std)
    running_std_t = tvm.placeholder(shape_c, name="running_std", dtype=inp_dtype_running_std)
    res_list = correction_mul_grad_compute(dout_t, x_t, batch_std_t, running_std_t, channel, data_format, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [dout_t, x_t, batch_std_t, running_std_t] + res_list
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(sch, config)
Example #28
def rsqrt_grad(input_y, input_dy, output_z, kernel_name="rsqrt_grad"):
    """
    calculate the backpropagation of rsqrt operation
    rsqrt: y = 1 / sqrt(x)
    rsqrt_grad: -1/2 * y**3 *dy

    Parameters
    ----------
    input_y: dict
        dict of input_y, include keys(shape and dtype)
    input_dy: dict
        dict of input_dy, include keys(shape and dtype)
    output_z: dict
        dict of  output
    kernel_name: str
        cce kernel name, default value is "rsqrt_grad"

    Returns
    -------
    None
    """
    shape_input_y = input_y.get("shape")
    dtype_input_y = input_y.get("dtype")
    shape_input_dy = input_dy.get("shape")
    dtype_input_dy = input_dy.get("dtype")

    check_shape(shape_input_y, param_name="input_y")
    check_shape(shape_input_dy, param_name="input_dy")
    util.compare_tensor_dict_key(input_y, input_dy, "shape")

    check_list = ("float16", "float32", "int32", "int8")
    dtype_input_y = dtype_input_y.lower()
    check_dtype(dtype_input_y, check_list, param_name="input_y")
    dtype_input_dy = dtype_input_dy.lower()
    check_dtype(dtype_input_dy, check_list, param_name="input_dy")
    util.compare_tensor_dict_key(input_y, input_dy, "dtype")
    reshape_y, reshape_dy = refine_shapes_for_broadcast(
        shape_input_y, shape_input_dy)

    data_input_y = tvm.placeholder(reshape_y,
                                   name="data_input_y",
                                   dtype=dtype_input_y)
    data_input_dy = tvm.placeholder(reshape_dy,
                                    name="data_input_dy",
                                    dtype=dtype_input_dy)

    res = rsqrt_grad_compute(data_input_y, data_input_dy, output_z,
                             kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [data_input_y, data_input_dy, res]
    }
    te.lang.cce.cce_build_code(sch, config)
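For reference, a minimal NumPy sketch of the docstring formula rsqrt_grad = -1/2 * y**3 * dy, where y = 1/sqrt(x); rsqrt_grad_reference is an illustrative name, not the compute function used above:

import numpy as np

def rsqrt_grad_reference(y, dy):
    # elementwise: dx = -0.5 * y^3 * dy
    return -0.5 * np.power(y, 3) * dy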
Example #29
def fused_mul_add_n(input_x,
                    input_y,
                    input_z,
                    output,
                    kernel_name="fused_mul_add_n"):
    """
    algorithm: fused mul+add_n
    calculating output = input_x * input_z + input_y

    Parameters
    ----------
    input_x : dict of input_x, tensor
    input_y: dict of input_y, tensor
    input_z: dict of input_z, scalar
    output : dict of output

    kernel_name : string
        cce kernel name, default value is fused_mul_add_n

    Returns
    -------
    None
    """

    check_list = ("float16", "float32", "int32", "int16")
    shape_x = input_x.get("shape")
    dtype_x = input_x.get("dtype")
    op_utils.check_shape(shape_x, param_name="input_x")
    op_utils.check_dtype(dtype_x, check_list, param_name="input_x")
    shape_y = input_y.get("shape")
    dtype_y = input_y.get("dtype")
    op_utils.check_shape(shape_y, param_name="input_y")
    op_utils.check_dtype(dtype_y, check_list, param_name="input_y")
    dtype_z = input_z.get("dtype")
    shape_z = [1 for i in range(len(shape_x))]
    op_utils.check_shape(shape_z, param_name="input_z")
    op_utils.check_dtype(dtype_z, check_list, param_name="input_z")

    data_x = tvm.placeholder(shape_x, name="input_x", dtype=dtype_x)
    data_y = tvm.placeholder(shape_y, name="input_y", dtype=dtype_y)
    data_z = tvm.placeholder(shape_z, name="input_z", dtype=dtype_z)

    res = mul_add_n_compute(data_x, data_y, data_z)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    tensor_list = [data_x, data_y, data_z, res]

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(schedule, config)
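For reference, a minimal NumPy sketch of the fused formula from the docstring, output = input_x * input_z + input_y with input_z treated as a scalar; mul_add_n_reference and the sample values are illustrative only:

import numpy as np

def mul_add_n_reference(x, y, z):
    # z is a scalar broadcast against x; then y is added elementwise
    return x * z + y

x = np.arange(4, dtype=np.float32)
y = np.ones(4, dtype=np.float32)
print(mul_add_n_reference(x, y, 2.0))  # [1. 3. 5. 7.]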
Example #30
def real_div(x1, x2, y, kernel_name="real_div"):
    """
    algorithm: real_div
    calculating data's real_div, c = a / b

    Parameters
    ----------
    x1 : dict
        shape and dtype of first input, only support float16, float32
    x2 : dict
        shape and dtype of second input, only support float16, float32
    y: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is real_div

    Returns
    -------
    None
    """
    shape_x = util.scalar2tensor_one(x1.get("shape"))
    shape_y = util.scalar2tensor_one(x2.get("shape"))
    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    check_tuple = ("float16", "float32")
    input_data_type = x1.get("dtype").lower()
    check_dtype(input_data_type, check_tuple, param_name="x1")
    input_data_type_x2 = x2.get("dtype").lower()
    check_dtype(input_data_type_x2, check_tuple, param_name="x2")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                   shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1:
        shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1]
        shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]

    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=input_data_type)
    data_y = tvm.placeholder(shape_y, name="data_y", dtype=input_data_type)

    res = real_div_compute(data_x, data_y, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": (data_x, data_y, res)
    }

    te.lang.cce.cce_build_code(schedule, config)