Example 1
def unpack(x, y, num=None, axis=0, kernel_name="unpack"):
    """
    unpacks the given dimension of a rank R tensor into rank (R-1) tensors.

    Parameters
    ----------
    x : dict.
        shape, dtype and format of the value to be unpacked.
    y: tuple or list
        the list of output tensors.
    num : int.
        the length of the dim axis, automatically inferred if None (default).
    axis: int.
        the axis to unpack along.
    kernel_name : str
        cce kernel name, default value is "unpack".

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype").lower()
    input_format = x.get("format")
    _check_params(shape, num, axis, input_format, dtype, kernel_name)

    # infer the value of num
    real_axis = axis + len(shape) if axis < 0 else axis
    num = shape[real_axis]

    # turn the input shape into three dimensions (a, b, c), so axis = 1
    before_dim = 1
    for prev_dim in shape[0:real_axis]:
        before_dim *= prev_dim
    afterdim = 1
    for after_dim in shape[real_axis + 1:]:
        afterdim *= after_dim
    reshape = (before_dim, shape[real_axis], afterdim)

    _, _, is_use_split = check_use_special_optimize(dtype,
                                                    afterdim,
                                                    flag=False)
    reshape_input = x.copy()
    reshape_input["shape"] = reshape
    real_axis = 1
    # only one output tensor, so the output equals the input
    if num == 1:
        copy_only(reshape_input, reshape_input, kernel_name)
    # use split
    elif is_use_split:
        split_d(reshape_input,
                y,
                split_dim=real_axis,
                num_split=num,
                kernel_name=kernel_name)
    else:
        new_dtype, afterdim, _ = check_use_special_optimize(dtype,
                                                            afterdim,
                                                            flag=False)
        new_shape = (before_dim, reshape[real_axis], afterdim)

        input_place = tvm.placeholder(new_shape,
                                      name="input_place",
                                      dtype=new_dtype)
        sch, build_list = _unpack_schedule(input_place, reshape, y, num,
                                           real_axis, dtype)

        with build_config:
            tvm.build(sch, build_list, "cce", name=kernel_name)
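
A minimal invocation sketch (illustrative, not from the original source): it assumes the TBE/CCE build environment and this module's imports (tvm, copy_only, split_d, _unpack_schedule) are available; the shapes, dtype and kernel_name below are placeholders.

# Unpack a hypothetical float16 tensor of shape (4, 2, 16) along axis 0
# into four (2, 16) outputs; x and y follow the dict convention described
# in the docstring above.
x = {"shape": (4, 2, 16), "dtype": "float16", "format": "ND"}
y = [{"shape": (2, 16), "dtype": "float16", "format": "ND"}
     for _ in range(4)]
unpack(x, y, num=4, axis=0, kernel_name="unpack_example")
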
Example 2
def space_to_batch_nd_d(x,
                        y,
                        block_shape,
                        paddings,
                        kernel_name="space_to_batch_nd_d"):
    """space_to_batch for tensor.

    Parameters
    ----------
    x: dict
        the dict of input tensor.
    y: dict
        the dict of output tensor.
    block_shape: list or tuple
        1-D with shape [2].
    paddings: list or tuple
        2-D with shape [2, 2], paddings[i] = [pad_start, pad_end].
    kernel_name: str
        cce kernel name, default value is "space_to_batch_nd_d".

    Returns
    -------
    None.
    """
    shape = x.get("shape")
    dtype = x.get("dtype").lower()
    input_format = x.get("format")
    ori_format = x.get("ori_format")

    if input_format not in ("NC1HWC0",):
        raise RuntimeError("The input_format must be NC1HWC0.")

    if ori_format in ("NHWC",):
        if len(paddings) == 4:
            paddings = [[paddings[0], paddings[1]], [paddings[2], paddings[3]]]
    elif ori_format in ("NCHW",):
        if len(block_shape) == 3 and block_shape[0] == 1:
            block_shape = [block_shape[1], block_shape[2]]
        else:
            raise RuntimeError("The value of first block_shape must be 1")
        if len(paddings) == 6 and paddings[0] == 0 and paddings[1] == 0:
            paddings = [[paddings[2], paddings[3]], [paddings[4], paddings[5]]]
        elif len(paddings) == 3 and len(paddings[0]) == 2 and len(paddings[1]) == 2 \
                and len(paddings[2]) == 2 and paddings[0][0] == 0 and paddings[0][1] == 0:
            paddings = [[paddings[1][0], paddings[1][1]], [paddings[2][0], paddings[2][1]]]
        else:
            raise RuntimeError("The value of first paddings must be 0")
    else:
        raise RuntimeError("The ori_format is not supported")

    _check_parms(shape, dtype, block_shape, paddings, kernel_name)

    if block_shape[0] == 1 and block_shape[1] == 1 and \
            paddings[0][0] == 0 and paddings[0][1] == 0 and \
            paddings[1][0] == 0 and paddings[1][1] == 0:
        copy_only(x, x, kernel_name)
        return

    if paddings[0][0] == 0 and paddings[0][1] == 0 and \
            paddings[1][0] == 0 and paddings[1][1] == 0:
        new_shape_input = \
            (shape[0], shape[1], shape[2] // block_shape[0], block_shape[0],
             shape[3] // block_shape[1], block_shape[1], shape[4])
        new_shape_output = \
            (block_shape[0], block_shape[1], shape[0], shape[1], shape[2] //
             block_shape[0], shape[3] // block_shape[1], shape[4])
        x.update({"shape": new_shape_input})
        y.update({"shape": new_shape_output})
        transpose_d(x, y, [3, 5, 0, 1, 2, 4, 6], kernel_name)
        return

    data = tvm.placeholder(shape, name="data", dtype=dtype)

    res = space_to_batch_nd_d_compute(data, y, block_shape, paddings,
                                      kernel_name)

    sch = tvm.create_schedule(res.op)

    with build_config:
        tvm.build(sch, [data, res], "cce", name=kernel_name)
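
A hedged usage sketch with illustrative values (not from the original source), assuming the same build environment; with all-zero paddings the call takes the transpose-only branch above and rewrites the shape entries of x and y in place.

# Hypothetical NC1HWC0 input (N, C1, H, W, C0) = (1, 1, 4, 4, 16),
# original format NHWC, block_shape [2, 2] and zero paddings.
x = {"shape": (1, 1, 4, 4, 16), "dtype": "float16",
     "format": "NC1HWC0", "ori_format": "NHWC"}
y = {"shape": (4, 1, 2, 2, 16), "dtype": "float16",
     "format": "NC1HWC0", "ori_format": "NHWC"}
space_to_batch_nd_d(x, y, block_shape=[2, 2], paddings=[[0, 0], [0, 0]],
                    kernel_name="space_to_batch_nd_d_example")
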
Example 3
def split_v_d(input_value,
              output_data,
              size_splits,
              split_dim,
              num_split,
              kernel_name="split_v_d"):
    """Split a tensor into len(size_splits) tensors along one dimension.

    Parameters
    ----------
    input_value: dict
        the dict of input tensor.
    output_data: list or tuple
        the list of output tensors.
    size_splits: list or tuple
        a Python list containing the sizes of each output tensor
        along `split_dim`.
    split_dim: int
        the dimension along which to split.
    num_split: int
        used to specify the number of outputs.
    kernel_name: str
        cce kernel name, default value is "split_v_d".

    Returns
    -------
    None.
    """
    input_format = input_value.get("format")
    ori_format = input_value.get("ori_format")
    if input_format == "NC1HWC0":
        split_dim = util.axis_transfrom_5d(split_dim, ori_format)
        split_with_5hd_not_align = \
            SplitWith5HD(input_value, output_data,
                         split_dim, num_split, kernel_name)
        if split_with_5hd_not_align.check_5hd_vnchw():
            split_with_5hd_not_align.do_5hd_split_cut_by_batch()
            return
        if split_dim == 1:
            size_splits = list(size_splits)
            size_splits = [size // 16 for size in size_splits]

    shape = input_value.get("shape")
    dtype = input_value.get("dtype")
    dtype_lower = dtype.lower()
    check_list = ("int8", "int16", "int32", "int64", "uint8", "uint16",
                  "uint32", "uint64", "float16", "float32")

    check_shape(shape, param_name="input_value")
    check_dtype(dtype_lower, check_list, param_name="input_value")

    shape_len = len(shape)
    split_dim = util.axis_check(shape_len, split_dim)

    dim = shape[split_dim]
    if len(size_splits) + 1 == num_split or len(size_splits) == 0:
        split_list = []
        split_sum = 0
        if len(size_splits) != 0:
            for size in size_splits:
                split_list.append(size)
                split_sum = split_sum + size
            if dim - split_sum > 0:
                split_list.append(dim - split_sum)
        else:
            batch = dim // num_split
            for _ in range(num_split):
                split_list.append(batch)
        size_splits = split_list

    size_splits = list(size_splits)
    size_splits_sum = 0
    for size in size_splits:
        if size != -1:
            size_splits_sum += size
    if dim != size_splits_sum:
        for i, size in enumerate(size_splits):
            if size == -1:
                size_splits[i] = dim - size_splits_sum

    size_sum = 0
    for size in size_splits:
        if size < 1:
            raise RuntimeError(
                "The size (%d) of size_splits must be greater than or equal "
                "to %d" % (size, 1))
        size_sum = size_sum + size
    if size_sum != shape[split_dim]:
        raise RuntimeError(
            "The sum (%d) of size_splits must be equal to the length of "
            "split_dim (%d)" % (size_sum, shape[split_dim]))
    if len(size_splits) != num_split:
        raise RuntimeError(
            "The length (%d) of size_splits must be equal to num_split(%d)" %
            (len(size_splits), num_split))

    if num_split == 1:
        copy_only(input_value, input_value, kernel_name)
        return

    split_mov = SplitMov(shape, dtype_lower, split_dim, num_split, size_splits,
                         kernel_name)
    new_shape = split_mov.input_shape
    new_split_dim = split_mov.split_dim
    new_size_splits = split_mov.size_splits
    new_output_shapes = split_mov.output_shapes
    input_size = functools_reduce(lambda x, y: x * y, new_shape)
    last_dim_same = True
    input_last_dim = new_output_shapes[0][-1]
    for output_shape in new_output_shapes:
        if input_last_dim != output_shape[-1]:
            last_dim_same = False
            break

    if dtype_lower == "float16" and new_split_dim == len(new_shape) - 1 and \
            last_dim_same and new_size_splits[0] == 1 and num_split <= 16 \
            and input_size >= TRANSPOSE_SIZE * num_split:
        split_vnc = SplitLastDimVnv(new_shape, dtype_lower, new_output_shapes,
                                    new_split_dim, num_split, kernel_name)
        split_vnc.split_last_dim_vnc_compute()
        return

    if check_use_last_dim_branch(new_shape, dtype_lower, new_split_dim,
                                 num_split, new_size_splits):
        split_last_dim(new_shape, dtype_lower, new_split_dim, num_split,
                       new_size_splits, kernel_name)
        return

    if split_mov.check_whether_use_split_mov():
        split_mov.split_mov_compute()
        return

    data = tvm.placeholder(shape, name="data", dtype=dtype_lower)
    output_shape_list, output_tensor_list = split_v_d_compute(
        data, output_data, size_splits, split_dim, num_split,
        kernel_name)

    sch, build_list = te.lang.cce.split_schedule_com(data, split_dim,
                                                     output_shape_list,
                                                     output_tensor_list)

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)
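
An illustrative call sketch (assumed values, not from the source), again presuming the CCE build environment; a single -1 in size_splits would be inferred from the remaining sizes, as handled above.

# Hypothetical ND input of shape (2, 30) split along axis 1 into pieces
# of size 10 and 20.
input_value = {"shape": (2, 30), "dtype": "float32",
               "format": "ND", "ori_format": "ND"}
output_data = [{"shape": (2, 10), "dtype": "float32", "format": "ND"},
               {"shape": (2, 20), "dtype": "float32", "format": "ND"}]
split_v_d(input_value, output_data, size_splits=[10, 20], split_dim=1,
          num_split=2, kernel_name="split_v_d_example")
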
Example 4
def split_d(input_value,
            output_data,
            split_dim,
            num_split,
            kernel_name="split_d"):
    """Split a tensor into `num_split` tensors along one dimension.

    Parameters
    ----------
    input_value: dict
        the dict of input tensor.
    output_data: list or tuple
        the list of output tensors.
    split_dim: int
        the dimension along which to split.
    num_split: int
        an integer indicating the number of splits along `split_dim`.
    kernel_name: str
        cce kernel name, default value is "split_d".

    Returns
    -------
    None.
    """
    input_format = input_value.get("format")
    ori_format = input_value.get("ori_format")
    if input_format == "NC1HWC0":
        split_dim = util.axis_transfrom_5d(split_dim, ori_format)

    shape = input_value.get("shape")
    dtype = input_value.get("dtype")
    dtype_lower = dtype.lower()
    check_list = ("int8", "int16", "int32", "int64", "uint8", "uint16",
                  "uint32", "uint64", "float16", "float32")

    check_shape(shape, param_name="input_value")
    check_dtype(dtype_lower, check_list, param_name="input_value")

    shape_len = len(shape)
    split_dim = util.axis_check(shape_len, split_dim)

    if num_split < 1:
        raise RuntimeError(
            "The num_split (%d) must be greater or equal to %d" %
            (num_split, 1))

    split_with_5hd_not_align = \
        SplitWith5HD(input_value, output_data,
                     split_dim, num_split, kernel_name)
    if split_with_5hd_not_align.check_5hd_vnchw():
        split_with_5hd_not_align.do_5hd_split_cut_by_batch()
        return

    if shape[split_dim] % num_split != 0:
        raise RuntimeError(
            "The length of split_dim (%d) must be divisible by num_split "
            "(%d)" % (shape[split_dim], num_split))

    if num_split == 1:
        copy_only(input_value, input_value, kernel_name)
        return

    split_mov = SplitMov(shape, dtype_lower, split_dim, num_split, None,
                         kernel_name)
    new_shape = split_mov.input_shape
    new_split_dim = split_mov.split_dim
    new_size_splits = split_mov.size_splits
    new_output_shapes = split_mov.output_shapes
    input_size = functools_reduce(lambda x, y: x * y, new_shape)

    if dtype_lower == "float16" and new_split_dim == len(new_shape) - 1 and \
            new_size_splits[0] == 1 and num_split <= 16 \
            and input_size >= TRANSPOSE_SIZE * num_split:
        split_vnc = SplitLastDimVnv(new_shape, dtype_lower, new_output_shapes,
                                    new_split_dim, num_split, kernel_name)
        split_vnc.split_last_dim_vnc_compute()
        return

    if check_use_last_dim_branch(new_shape, dtype_lower, new_split_dim,
                                 num_split, new_size_splits):
        split_last_dim(new_shape, dtype_lower, new_split_dim, num_split,
                       new_size_splits, kernel_name)
        return

    if split_mov.check_whether_use_split_mov():
        split_mov.split_mov_compute()
        return

    data = tvm.placeholder(shape, name="data", dtype=dtype_lower)
    output_shape_list, output_tensor_list = split_d_compute(
        data, output_data, split_dim, num_split, kernel_name)

    sch, build_list = te.lang.cce.split_schedule_com(data, split_dim,
                                                     output_shape_list,
                                                     output_tensor_list)

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)
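
A final hedged sketch with placeholder descriptors, under the same environment assumption; note that shape[split_dim] must be evenly divisible by num_split, as enforced above.

# Hypothetical ND input of shape (4, 32) split evenly into two (4, 16)
# tensors along the last axis.
input_value = {"shape": (4, 32), "dtype": "float16",
               "format": "ND", "ori_format": "ND"}
output_data = [{"shape": (4, 16), "dtype": "float16", "format": "ND"}
               for _ in range(2)]
split_d(input_value, output_data, split_dim=1, num_split=2,
        kernel_name="split_d_example")
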