Example #1
 def __build(self, op, attr_name, attr_value):
     infos = NoSqlAttributeInspector.inspect(self._entity_class,
                                             attr_name)
     is_nested = len(infos) > 1
     parent_type = self._entity_class
     exprs = []
     for info in infos[:-1]:
         nested_attr_kind, nested_attr_type, nested_attr_name = info
         root_coll = get_root_collection(parent_type)
         templ_map = dict(root_collection_name=root_coll.__name__,
                          nested_attr_name=nested_attr_name,
                          )
         if nested_attr_kind == RESOURCE_ATTRIBUTE_KINDS.MEMBER:
             expr = self.__nested_member_query_template % templ_map
         else: # RESOURCE_ATTRIBUTE_KINDS.COLLECTION
             # FIXME: Test this.
             # expr = self.__nested_collection_query_template % templ_map
             raise NotImplementedError('Not implemented.') # pragma: no cover
         exprs.insert(0, expr)
         parent_type = nested_attr_type
     terminal_attr_name = infos[-1][-1]
     expr = self.__prepare_criterion(terminal_attr_name, op, attr_value)
     if is_nested:
         # FIXME: Need to handle value -> string conversion here.
         root_coll = get_root_collection(parent_type)
         templ_map = dict(root_collection_name=root_coll.__name__,
                          terminal_expression=str(expr))
         terminal_expr = self.__nested_terminal_query_template % templ_map
         expr = func_reduce(lambda g, h: h % dict(nested_expression=g),
                            exprs, terminal_expr)
     return expr
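
The final func_reduce call above wraps the collected templates around the terminal criterion, innermost wrapper first (each template was pushed to the front of exprs with insert(0, ...)). A minimal standalone sketch of that folding, assuming func_reduce is an alias for functools.reduce; the template strings and attribute names below are illustrative, not the class's real query templates:

from functools import reduce as func_reduce  # assumed alias

# Wrapper templates, innermost first, each with a %(nested_expression)s slot.
exprs = ['child.%(nested_expression)s',    # hypothetical deepest hop
         'parent.%(nested_expression)s']   # hypothetical root-level hop
terminal_expr = 'age > 3'                  # hypothetical terminal criterion
result = func_reduce(lambda g, h: h % dict(nested_expression=g),
                     exprs, terminal_expr)
print(result)  # parent.child.age > 3
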
Example #2
 def __build(self, attribute_name, sql_op, *values):
     # Builds an SQL expression from the given (possibly dotted)
     # attribute name, SQL operation name, and values.
     exprs = []
     infos = OrmAttributeInspector.inspect(self._entity_class,
                                           attribute_name)
     count = len(infos)
     for idx, info in enumerate(infos):
         kind, entity_attr = info
         if idx == count - 1:
             # terminal attribute: unwrap resources to entities and apply the SQL operation
             args = \
                 [val.get_entity() if IResource.providedBy(val) else val # pylint: disable=E1101
                  for val in values]
             expr = getattr(entity_attr, sql_op)(*args)
         elif kind == RESOURCE_ATTRIBUTE_KINDS.MEMBER:
             expr = entity_attr.has
             exprs.insert(0, expr)
         elif kind == RESOURCE_ATTRIBUTE_KINDS.COLLECTION:
             expr = entity_attr.any
             exprs.insert(0, expr)
     return func_reduce(lambda g, h: h(g), exprs, expr)
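
The reduce at the end chains the collected has/any bound operators around the terminal clause, so a dotted name such as parent.children.name effectively becomes parent.has(children.any(name == value)). A minimal sketch of the same folding with plain lambdas standing in for SQLAlchemy's relationship operators, assuming func_reduce is functools.reduce:

from functools import reduce as func_reduce  # assumed alias

# Stand-ins for the bound operators collected in exprs, innermost hop first
# (matching the insert(0, ...) calls above); the real code uses entity_attr.has/any.
exprs = [lambda clause: 'children.any(%s)' % clause,   # collection hop
         lambda clause: 'parent.has(%s)' % clause]     # member hop
expr = "name == 'foo'"                                 # terminal criterion
print(func_reduce(lambda g, h: h(g), exprs, expr))
# parent.has(children.any(name == 'foo'))
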
Example #3
def _multi_core_on_n(tik_inst, data_in, data_out, shape_in):
    """
    do NDHWC to FRACTAL_Z_3D transfer using multiple cores on axis n
    """

    axis_n, axis_d, axis_h, axis_w, axis_c = shape_in
    hw_size = axis_h * axis_w
    ni_no_size = _ceil_div(axis_n, C0_LEN) * C0_LEN
    dhwc_size = func_reduce(lambda x, y: x * y, shape_in[1:])
    axis_c1 = _ceil_div(axis_c, C0_LEN)
    dtype_factor = _get_dtype_factor(data_in.dtype)
    dhwc_align_size = func_reduce(lambda x, y: x * y,
                                  (axis_d, hw_size, axis_c1, C0_LEN))
    # reserve a few blocks so the MTE repeat count does not exceed 4095
    half_ub_size = (UB_SIZE - 4 * BLOCK_BYTE_SIZE) // 2 // dtype_factor
    is_c1dhwc0_bigger_half_ub = dhwc_align_size > half_ub_size

    # each core processes a certain number of dhwc lines
    core_num = _ceil_div(axis_n, _ceil_div(axis_n, CORE_NUM))
    per_core_n_cnt = _ceil_div(axis_n, core_num)
    left_n_cnt = axis_n - per_core_n_cnt * (core_num - 1)
    # count how many n lines need zero padding
    zero_loop_cnt = (_ceil_div(axis_n, C0_LEN) * C0_LEN - axis_n) // core_num
    zero_line_left = (_ceil_div(axis_n, C0_LEN) * C0_LEN - axis_n) % core_num

    if axis_c % C0_LEN:
        ub_size = half_ub_size
        # in this case the data is rearranged inside UB, so an output buffer is needed
        out_ub = tik_inst.Tensor(data_in.dtype, (ub_size, ),
                                 name="out_ub",
                                 scope=tik.scope_ubuf)
    else:
        ub_size = UB_SIZE // dtype_factor

    # alloc input and output ub
    in_ub = tik_inst.Tensor(data_in.dtype, (ub_size, ),
                            name="in_ub",
                            scope=tik.scope_ubuf)
    # used for scalar operation
    reg_list = [tik_inst.Scalar(data_in.dtype) for i in REG_IDX_LIST]

    with tik_inst.for_range(0, core_num, block_num=core_num) as block_idx:

        # pylint: disable=too-many-locals,too-many-statements
        def _n_transfer_process(n_len):
            """
            transfer process for n_len lines on axis n
            """

            with tik_inst.for_range(0, n_len) as n_idx:

                def _transfer_4_dhwc_less_half_ub():
                    """
                    transfer process when one dhwc slice fits in half of the UB
                    """

                    input_offset = (block_idx * per_core_n_cnt +
                                    n_idx) * dhwc_size
                    tik_inst.data_move(
                        in_ub, data_in[input_offset], 0, 1,
                        _ceil_div(dhwc_size, BLOCK_BYTE_SIZE // dtype_factor),
                        0, 0)

                    if axis_c % C0_LEN:
                        with tik_inst.if_scope(n_idx == 0):
                            _clean_ubuf(tik_inst, out_ub, 0, dhwc_align_size)
                        _padding_short_c(tik_inst, out_ub, in_ub, axis_c,
                                         axis_d * hw_size, reg_list)

                    with tik_inst.for_range(0, axis_d) as d_idx:
                        with tik_inst.for_range(0, axis_c1) as c1_idx:
                            output_offset = (block_idx * per_core_n_cnt +
                                             n_idx +
                                             (d_idx * axis_c1 + c1_idx) *
                                             hw_size * ni_no_size) * C0_LEN
                            mid_offset = (d_idx * hw_size * axis_c1 +
                                          c1_idx) * C0_LEN

                            if axis_c % C0_LEN:
                                tik_inst.data_move(
                                    data_out[output_offset],
                                    out_ub[mid_offset], 0, hw_size,
                                    dtype_factor // 2,
                                    (axis_c1 - 1) * dtype_factor // 2,
                                    (ni_no_size - 1) * dtype_factor // 2)
                            else:
                                tik_inst.data_move(
                                    data_out[output_offset], in_ub[mid_offset],
                                    0, hw_size, dtype_factor // 2,
                                    (axis_c1 - 1) * dtype_factor // 2,
                                    (ni_no_size - 1) * dtype_factor // 2)

                def _transfer_4_dhwc_larger_half_ub():
                    """
                    transfer process when one dhwc slice is larger than half of the UB
                    """

                    with tik_inst.for_range(0, axis_d) as d_idx_1:
                        with tik_inst.for_range(0, hw_size) as hw_idx_1:

                            def _inner_c_transfer(sub_c, in_offset,
                                                  out_offset):
                                """
                                the transfer for axis c
                                """

                                tik_inst.data_move(
                                    in_ub, data_in[in_offset], 0, 1,
                                    _ceil_div(sub_c,
                                              BLOCK_BYTE_SIZE // dtype_factor),
                                    0, 0)

                                c0_cnt_in_sub_c = _ceil_div(sub_c, C0_LEN)
                                repeat_stride = hw_size * ni_no_size * dtype_factor // 2
                                hwninoc0_size = hw_size * ni_no_size * C0_LEN
                                if sub_c % C0_LEN:
                                    with tik_inst.if_scope(n_idx == 0):
                                        _clean_ubuf(tik_inst, out_ub, 0,
                                                    c0_cnt_in_sub_c * C0_LEN)
                                    _padding_long_c(tik_inst, out_ub, in_ub,
                                                    sub_c, reg_list)
                                    temp_ub = out_ub
                                else:
                                    temp_ub = in_ub

                                if c0_cnt_in_sub_c <= 4095 and repeat_stride <= 65536:
                                    tik_inst.data_move(
                                        data_out[out_offset], temp_ub, 0,
                                        c0_cnt_in_sub_c, dtype_factor // 2, 0,
                                        repeat_stride - dtype_factor // 2)
                                else:
                                    with tik_inst.for_range(
                                            0, c0_cnt_in_sub_c) as c0_idx:
                                        tik_inst.data_move(
                                            data_out[out_offset +
                                                     c0_idx * hwninoc0_size],
                                            temp_ub[c0_idx * C0_LEN], 0, 1,
                                            dtype_factor // 2, 0, 0)

                            if axis_c < ub_size:
                                input_offset_1 = (
                                    (block_idx * per_core_n_cnt + n_idx) *
                                    dhwc_size +
                                    (d_idx_1 * hw_size + hw_idx_1) * axis_c)
                                output_offset_1 = (
                                    block_idx * per_core_n_cnt + n_idx +
                                    (d_idx_1 * axis_c1 * hw_size + hw_idx_1) *
                                    ni_no_size) * C0_LEN
                                _inner_c_transfer(axis_c, input_offset_1,
                                                  output_offset_1)

                            else:
                                c_lp_cnt = axis_c // ub_size
                                c_left = axis_c % ub_size
                                with tik_inst.for_range(0,
                                                        c_lp_cnt) as c_lp_idx:
                                    input_offset_1 = (
                                        (block_idx * per_core_n_cnt + n_idx) *
                                        dhwc_size +
                                        (d_idx_1 * hw_size + hw_idx_1) * axis_c
                                        + c_lp_idx * ub_size)
                                    output_offset_1 = (
                                        block_idx * per_core_n_cnt + n_idx +
                                        (d_idx_1 * axis_c1 * hw_size +
                                         hw_idx_1) * ni_no_size + c_lp_idx *
                                        (ub_size // C0_LEN) * hw_size *
                                        ni_no_size) * C0_LEN
                                    _inner_c_transfer(ub_size, input_offset_1,
                                                      output_offset_1)
                                if c_left:
                                    input_offset_1 = (
                                        (block_idx * per_core_n_cnt + n_idx) *
                                        dhwc_size +
                                        (d_idx_1 * hw_size + hw_idx_1) * axis_c
                                        + c_lp_cnt * ub_size)
                                    output_offset_1 = (
                                        block_idx * per_core_n_cnt + n_idx +
                                        (d_idx_1 * axis_c1 * hw_size +
                                         hw_idx_1) * ni_no_size + c_lp_cnt *
                                        (ub_size // C0_LEN) * hw_size *
                                        ni_no_size) * C0_LEN
                                    _inner_c_transfer(c_left, input_offset_1,
                                                      output_offset_1)

                if is_c1dhwc0_bigger_half_ub:
                    _transfer_4_dhwc_larger_half_ub()
                else:
                    _transfer_4_dhwc_less_half_ub()

        with tik_inst.if_scope(block_idx == core_num - 1):
            _n_transfer_process(left_n_cnt)
        with tik_inst.else_scope():
            _n_transfer_process(per_core_n_cnt)

        if axis_n % C0_LEN:

            def _padding_ni_no_process(z_lp_cnt, z_lp_index):
                """
                zero out the remaining n lines in one Ni*No*C0 cube
                """
                hwc0_size = hw_size * C0_LEN
                if hwc0_size > ub_size:
                    _clean_ubuf(tik_inst, in_ub, 0, ub_size)
                    hw_lp_cnt = hwc0_size // ub_size
                    repeat_time = ub_size // C0_LEN
                    hw_left_size = hwc0_size % ub_size // C0_LEN
                else:
                    _clean_ubuf(tik_inst, in_ub, 0, hw_size * C0_LEN)
                    hw_lp_cnt = 1
                    repeat_time = hw_size
                    hw_left_size = 0

                def _padding_ni_no_cube(out_offset):
                    """
                    zero out the remaining n lines in one Ni*No*C0 cube
                    """
                    with tik_inst.for_range(0, hw_lp_cnt) as hw_lp_idx:
                        tik_inst.data_move(
                            data_out[out_offset +
                                     hw_lp_idx * ni_no_size * C0_LEN], in_ub,
                            0, repeat_time, dtype_factor // 2, 0,
                            (ni_no_size - 1) * dtype_factor // 2)
                    if hw_left_size:
                        tik_inst.data_move(
                            data_out[out_offset +
                                     hw_lp_cnt * ni_no_size * C0_LEN], in_ub,
                            0, hw_left_size, dtype_factor // 2, 0,
                            (ni_no_size - 1) * dtype_factor // 2)

                with tik_inst.for_range(0, axis_d) as d_idx:
                    with tik_inst.for_range(0, axis_c1) as c1_idx:
                        output_offset = \
                            (axis_n + block_idx * z_lp_cnt + z_lp_index +
                             (d_idx * axis_c1 + c1_idx) * hw_size * ni_no_size) * C0_LEN
                        _padding_ni_no_cube(output_offset)

            if zero_loop_cnt:
                with tik_inst.for_range(0, zero_loop_cnt) as z_lp_idx:
                    _padding_ni_no_process(zero_loop_cnt, z_lp_idx)

            if zero_line_left:
                with tik_inst.if_scope(block_idx < zero_line_left):
                    _padding_ni_no_process(1, core_num * zero_loop_cnt)
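
The per-core split at the top of this kernel is ordinary integer arithmetic, independent of TIK. A plain-Python sketch of the same calculation, assuming _ceil_div is ceiling division and using illustrative values for the C0_LEN and CORE_NUM constants:

C0_LEN, CORE_NUM = 16, 32  # assumed platform constants

def _ceil_div(value, factor):
    """Ceiling division, as used throughout these kernels."""
    return (value + factor - 1) // factor

def split_on_n(axis_n):
    # each core processes a contiguous run of n lines
    core_num = _ceil_div(axis_n, _ceil_div(axis_n, CORE_NUM))
    per_core_n_cnt = _ceil_div(axis_n, core_num)
    left_n_cnt = axis_n - per_core_n_cnt * (core_num - 1)
    # n lines that must be zero-padded up to a multiple of C0_LEN
    pad = _ceil_div(axis_n, C0_LEN) * C0_LEN - axis_n
    return core_num, per_core_n_cnt, left_n_cnt, pad // core_num, pad % core_num

print(split_on_n(100))  # (25, 4, 4, 0, 12) with the constants assumed above
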
Example #4
def _multi_core_on_n(tik_inst, data_in, data_out, shape_in):
    """
    do NCDHW to FRACTAL_Z_3D transfer using multiple cores on axis n
    """

    axis_n, axis_c, axis_d, axis_h, axis_w = shape_in
    hw_size = axis_h * axis_w
    ni_no_size = _ceil_div(axis_n, C0_LEN) * C0_LEN
    cdhw_size = func_reduce(lambda x, y: x * y, shape_in[1:])
    axis_c1 = _ceil_div(axis_c, C0_LEN)
    dtype_factor = _get_dtype_factor(data_in.dtype)

    # each core processes a certain number of cdhw lines
    core_num = _ceil_div(axis_n, _ceil_div(axis_n, CORE_NUM))
    per_core_n_cnt = _ceil_div(axis_n, core_num)
    left_n_cnt = axis_n - per_core_n_cnt * (core_num - 1)
    # count how many n lines need zero padding
    zero_loop_cnt = (_ceil_div(axis_n, C0_LEN) * C0_LEN - axis_n) // core_num
    zero_line_left = (_ceil_div(axis_n, C0_LEN) * C0_LEN - axis_n) % core_num

    # check whether half of the UB can hold one cdhw slice
    out_size = func_reduce(
        lambda x, y: x * y,
        (axis_c1 * C0_LEN, axis_d, axis_h, axis_w, dtype_factor))
    # avoid MTE repeat counts bigger than 4095
    is_chwd_less_half_ub = out_size <= ((UB_SIZE - 4 * BLOCK_BYTE_SIZE) // 2)
    # used for scalar conversion
    reg_list = [tik_inst.Scalar(data_in.dtype) for i in REG_IDX_LIST]

    if is_chwd_less_half_ub:
        # split the UB into two parts and load one cdhw slice each time
        ub_size = (UB_SIZE - 4 * BLOCK_BYTE_SIZE) // 2 // dtype_factor
    else:
        # adapt to the vnchwconv instruction
        hw_align_c0_mul_16 = _ceil_div(hw_size, C0_LEN) * C0_LEN * 16
        # split the UB into two parts and make sure ub_size is aligned to C0_LEN
        ub_size = _get_vnchwconv_ub_size(hw_align_c0_mul_16 * dtype_factor,
                                         2) // dtype_factor // C0_LEN * C0_LEN
        ub_col_size = ub_size // 16 // C0_LEN * C0_LEN
        if ub_col_size == 0:
            raise RuntimeError("The UB is too small!")

    # alloc input and output ub
    in_ub = tik_inst.Tensor(data_in.dtype, (ub_size, ),
                            name="in_ub",
                            scope=tik.scope_ubuf)
    out_ub = tik_inst.Tensor(data_in.dtype, (ub_size, ),
                             name="out_ub",
                             scope=tik.scope_ubuf)

    with tik_inst.for_range(0, core_num, block_num=core_num) as block_idx:

        # pylint: disable=too-many-locals,too-many-statements
        def _n_transfer_process(n_len):
            """
            transfer process for n_len lines on axis n
            """

            c0_count_in_c = axis_c // C0_LEN
            c_left = axis_c % C0_LEN

            if is_chwd_less_half_ub:
                dhw_align_size = _ceil_div(axis_d * hw_size, C0_LEN) * C0_LEN
                dhw_size = axis_d * hw_size

                def _get_mv_in_para():
                    """
                    count how many cdhw slices can be loaded at one time
                    """

                    nc1dhwc0_cnt = ub_size // (axis_c1 * axis_d * hw_size *
                                               C0_LEN)
                    if n_len <= nc1dhwc0_cnt:
                        mv_in_lp = 1
                        len_new = n_len
                        len_left = 0
                    else:
                        mv_in_lp = n_len // nc1dhwc0_cnt
                        len_new = nc1dhwc0_cnt
                        len_left = n_len % nc1dhwc0_cnt

                    return mv_in_lp, len_left, len_new

                def _cdhw_less_half_ub_process(mv_in_lp_index, sub_n_len):
                    """
                    process when cdhw is smaller than half of the UB
                    """

                    input_offset = (block_idx * per_core_n_cnt +
                                    mv_in_lp_index * n_len_new) * cdhw_size
                    # move in sub_n_len * cdhw elements each time
                    tik_inst.data_move(
                        in_ub, data_in[input_offset], 0, 1,
                        _ceil_div(sub_n_len * cdhw_size,
                                  BLOCK_BYTE_SIZE // dtype_factor), 0, 0)

                    if axis_c % C0_LEN:
                        # set the dst ub to zero to avoid dirty data
                        with tik_inst.if_scope(mv_in_lp_index == 0):
                            _clean_ubuf(tik_inst, out_ub, 0, ub_size)
                    # do transpose from xcdhw to c1dhwxc0
                    axis_param_1 = (sub_n_len, axis_c, axis_d * hw_size,
                                    axis_d * hw_size, C0_LEN)
                    _scalar_conv(tik_inst, out_ub, in_ub, axis_param_1,
                                 reg_list)

                    with tik_inst.for_range(0, axis_d) as d_idx:
                        with tik_inst.for_range(0, axis_c1) as c1_idx:
                            output_offset = (block_idx * per_core_n_cnt +
                                             mv_in_lp_index * n_len_new +
                                             (d_idx * axis_c1 + c1_idx) *
                                             hw_size * ni_no_size) * C0_LEN
                            mid_offset = (c1_idx * axis_d +
                                          d_idx) * hw_size * sub_n_len * C0_LEN
                            # move out hwc0 each time
                            tik_inst.data_move(
                                data_out[output_offset], out_ub[mid_offset], 0,
                                hw_size, sub_n_len * dtype_factor // 2, 0,
                                (ni_no_size - sub_n_len) * dtype_factor // 2)

                def _cdhw_less_half_ub_process_fp16(n_index):
                    """
                    process when cdhw is smaller than half of the UB, fp16 path
                    """
                    def _inner_process_fp16(c0_index, c_lines):
                        """
                        vnchwconv for c0dhw
                        """

                        with tik_inst.for_range(0, c_lines) as c_idx:
                            input_offset = (
                                (block_idx * per_core_n_cnt + n_index) *
                                cdhw_size +
                                (c0_index * C0_LEN + c_idx) * dhw_size)
                            tik_inst.data_move(in_ub[c_idx * dhw_align_size],
                                               data_in[input_offset], 0, 1,
                                               _ceil_div(dhw_size,
                                                         C0_LEN), 0, 0)
                        # do vnchwconv
                        src_addr_list = [
                            in_ub[dhw_align_size * i] for i in ADDR_IDX_LIST
                        ]
                        dst_addr_list = [
                            out_ub[C0_LEN * i] for i in ADDR_IDX_LIST
                        ]
                        repeat_cnt = _ceil_div(dhw_size, C0_LEN)
                        src_stride = 0 if repeat_cnt == 1 else 1
                        dst_stride = 0 if repeat_cnt == 1 else 16
                        tik_inst.vnchwconv(False, False, dst_addr_list,
                                           src_addr_list, repeat_cnt,
                                           dst_stride, src_stride)

                        # move data out in d times
                        with tik_inst.for_range(0, axis_d) as d2_idx:
                            output_offset = (block_idx * per_core_n_cnt +
                                             n_index +
                                             (d2_idx * axis_c1 + c0_index) *
                                             hw_size * ni_no_size) * C0_LEN
                            tik_inst.data_move(
                                data_out[output_offset],
                                out_ub[d2_idx * hw_size * C0_LEN], 0, hw_size,
                                1, 0, ni_no_size - 1)

                    if c0_count_in_c:
                        with tik_inst.for_range(0, c0_count_in_c) as c0_idx:
                            _inner_process_fp16(c0_idx, C0_LEN)
                        if c_left:
                            _clean_ubuf(tik_inst, in_ub,
                                        c_left * dhw_align_size,
                                        (16 - c_left) * dhw_align_size)
                            _inner_process_fp16(c0_count_in_c, c_left)
                    else:
                        with tik_inst.if_scope(n_index == 0):
                            _clean_ubuf(tik_inst, in_ub,
                                        c_left * dhw_align_size,
                                        (16 - c_left) * dhw_align_size)
                        _inner_process_fp16(0, c_left)

                if (data_in.dtype.lower() == "float16"
                        and dhw_align_size * C0_LEN <= ub_size
                        and dhw_size >= C0_LEN):
                    with tik_inst.for_range(0, n_len) as n_idx_1:
                        _cdhw_less_half_ub_process_fp16(n_idx_1)
                else:
                    n_mv_in_lp, n_len_left, n_len_new = _get_mv_in_para()
                    with tik_inst.for_range(0, n_mv_in_lp) as mv_in_lp_idx:
                        _cdhw_less_half_ub_process(mv_in_lp_idx, n_len_new)
                    if n_len_left:
                        _cdhw_less_half_ub_process(n_mv_in_lp, n_len_left)

            else:
                with tik_inst.for_range(0, n_len) as n_idx:

                    def _cdhw_bigger_half_ub_process():
                        """
                        process when cdhw is larger than half of the UB
                        """
                        def _c0hw_hwc0_transfer(c0_index, sub_c_count):
                            """
                            do transpose from c0hw to hwc0
                            """

                            with tik_inst.for_range(0, axis_d) as d1_idx:

                                def _inner_process(loop_index, hw_len):
                                    """
                                    inner process of the transpose
                                    """

                                    # move in hw_len elements for each of the sub_c lines
                                    input_offset_1 = (
                                        (block_idx * per_core_n_cnt + n_idx) *
                                        cdhw_size +
                                        c0_index * 16 * axis_d * hw_size +
                                        d1_idx * hw_size +
                                        loop_index * ub_col_size)
                                    with tik_inst.for_range(
                                            0, sub_c_count) as sub_c_idx:
                                        tik_inst.data_move(
                                            in_ub[sub_c_idx * ub_col_size],
                                            data_in[input_offset_1 +
                                                    sub_c_idx * axis_d *
                                                    hw_size], 0, 1,
                                            _ceil_div(
                                                hw_len, BLOCK_BYTE_SIZE //
                                                dtype_factor), 0, 0)

                                    if data_in.dtype.lower() == "float16":
                                        # do vnchwconv transfer
                                        src_addr_list = [
                                            in_ub[ub_col_size * i]
                                            for i in ADDR_IDX_LIST
                                        ]
                                        dst_addr_list = [
                                            out_ub[C0_LEN * i]
                                            for i in ADDR_IDX_LIST
                                        ]
                                        repeat_cnt = _ceil_div(hw_len, C0_LEN)
                                        src_stride = 0 if repeat_cnt == 1 else 1
                                        dst_stride = 0 if repeat_cnt == 1 else 16
                                        tik_inst.vnchwconv(
                                            False, False, dst_addr_list,
                                            src_addr_list, repeat_cnt,
                                            dst_stride, src_stride)
                                    else:
                                        axis_param = (1, sub_c_count, hw_len,
                                                      ub_col_size, C0_LEN)
                                        _scalar_conv(tik_inst, out_ub, in_ub,
                                                     axis_param, reg_list)

                                    # move out hw_len block each time
                                    output_offset_1 = (
                                        block_idx * per_core_n_cnt + n_idx +
                                        ((d1_idx * axis_c1 + c0_index) *
                                         hw_size + loop_index * ub_col_size) *
                                        ni_no_size) * C0_LEN
                                    tik_inst.data_move(
                                        data_out[output_offset_1], out_ub, 0,
                                        hw_len, dtype_factor // 2, 0,
                                        (ni_no_size - 1) * dtype_factor // 2)

                                is_hw_less_ub_col_size = hw_size <= ub_col_size
                                if is_hw_less_ub_col_size:
                                    _inner_process(0, hw_size)
                                else:
                                    buf_loop = hw_size // ub_col_size
                                    hw_left = hw_size % ub_col_size

                                    if buf_loop:
                                        with tik_inst.for_range(
                                                0, buf_loop) as buf_lp_idx:
                                            _inner_process(
                                                buf_lp_idx, ub_col_size)
                                        if hw_left:
                                            _inner_process(buf_loop, hw_left)

                        if c0_count_in_c:
                            with tik_inst.for_range(0,
                                                    c0_count_in_c) as c0_idx:
                                _c0hw_hwc0_transfer(c0_idx, C0_LEN)
                            if c_left:
                                # to avoid dirty data
                                if data_in.dtype.lower() == "float16":
                                    _clean_ubuf(
                                        tik_inst, in_ub, c_left * ub_col_size,
                                        (C0_LEN - c_left) * ub_col_size)
                                else:
                                    _clean_ubuf(tik_inst, out_ub, 0, ub_size)
                                _c0hw_hwc0_transfer(c0_count_in_c, c_left)
                        else:
                            # only need to clean buf once
                            with tik_inst.if_scope(n_idx == 0):
                                # to avoid dirty data
                                if data_in.dtype.lower() == "float16":
                                    _clean_ubuf(
                                        tik_inst, in_ub, c_left * ub_col_size,
                                        (C0_LEN - c_left) * ub_col_size)
                                else:
                                    _clean_ubuf(tik_inst, out_ub, 0, ub_size)
                            _c0hw_hwc0_transfer(0, c_left)

                    _cdhw_bigger_half_ub_process()

        with tik_inst.if_scope(block_idx == core_num - 1):
            _n_transfer_process(left_n_cnt)
        with tik_inst.else_scope():
            _n_transfer_process(per_core_n_cnt)

        if axis_n % C0_LEN:
            if is_chwd_less_half_ub:
                _clean_ubuf(tik_inst, out_ub, 0, hw_size * C0_LEN)
                buf_loop_cnt = 0
                hw_left_size = 0
            else:
                _clean_ubuf(tik_inst, out_ub, 0, ub_size)
                buf_loop_cnt = hw_size // ub_col_size
                hw_left_size = hw_size % ub_col_size

            def _padding_ni_no_cube(output_offset_z):
                """
                zero out the remaining n lines in one Ni*No*C0 cube
                """
                if buf_loop_cnt == 0 or (hw_left_size == 0
                                         and buf_loop_cnt == 1):
                    tik_inst.data_move(data_out[output_offset_z], out_ub, 0,
                                       hw_size, dtype_factor // 2, 0,
                                       (ni_no_size - 1) * dtype_factor // 2)
                else:
                    with tik_inst.for_range(0, buf_loop_cnt) as lp_idx:
                        tik_inst.data_move(
                            data_out[output_offset_z + lp_idx * ub_col_size *
                                     ni_no_size * C0_LEN], out_ub, 0,
                            ub_col_size, dtype_factor // 2, 0,
                            (ni_no_size - 1) * dtype_factor // 2)
                    if hw_left_size:
                        tik_inst.data_move(
                            data_out[output_offset_z + buf_loop_cnt *
                                     ub_col_size * ni_no_size * C0_LEN],
                            out_ub, 0, hw_left_size, dtype_factor // 2, 0,
                            (ni_no_size - 1) * dtype_factor // 2)

            if zero_loop_cnt:
                with tik_inst.for_range(0, zero_loop_cnt) as z_lp_idx:
                    with tik_inst.for_range(0, axis_d) as d_idx:
                        with tik_inst.for_range(0, axis_c1) as c1_idx:
                            output_offset_zero = (
                                axis_n + block_idx * zero_loop_cnt + z_lp_idx +
                                (d_idx * axis_c1 + c1_idx) * hw_size *
                                ni_no_size) * C0_LEN
                            _padding_ni_no_cube(output_offset_zero)

            if zero_line_left:
                with tik_inst.if_scope(block_idx < zero_line_left):
                    with tik_inst.for_range(0, axis_d) as d_idx:
                        with tik_inst.for_range(0, axis_c1) as c1_idx:
                            output_offset_zero = (
                                axis_n + core_num * zero_loop_cnt + block_idx +
                                (d_idx * axis_c1 + c1_idx) * hw_size *
                                ni_no_size) * C0_LEN
                            _padding_ni_no_cube(output_offset_zero)
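
Both TIK examples choose between their two code paths by checking whether one n-line of output (c1 * d * h * w * c0 elements, scaled by the element size) fits into half of the unified buffer after reserving a few blocks. A plain-Python sketch of that check, with assumed values for UB_SIZE, BLOCK_BYTE_SIZE, and C0_LEN:

from functools import reduce as func_reduce  # assumed alias

UB_SIZE, BLOCK_BYTE_SIZE, C0_LEN = 256 * 1024, 32, 16  # assumed constants

def _ceil_div(value, factor):
    return (value + factor - 1) // factor

def fits_in_half_ub(shape_ncdhw, dtype_bytes):
    """Return True if one n-line of output (c1*d*h*w*c0) fits in half the UB."""
    _, axis_c, axis_d, axis_h, axis_w = shape_ncdhw
    axis_c1 = _ceil_div(axis_c, C0_LEN)
    out_size = func_reduce(lambda x, y: x * y,
                           (axis_c1 * C0_LEN, axis_d, axis_h, axis_w, dtype_bytes))
    return out_size <= (UB_SIZE - 4 * BLOCK_BYTE_SIZE) // 2

print(fits_in_half_ub((8, 3, 4, 16, 16), 2))  # True for this small fp16 shape
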
Example #5
 def _disjunction_op(self, spec, *expressions):
     return func_reduce(operator_or, expressions)
Example #6
 def _conjunction_op(self, spec, *expressions):
     return func_reduce(operator_and, expressions)
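
_disjunction_op and _conjunction_op simply fold a sequence of filter expressions into a single one with func_reduce. A minimal sketch of the same pattern, assuming func_reduce, operator_or, and operator_and are aliases for functools.reduce, operator.or_, and operator.and_; sets are used here only to make the folding visible (with ORM clause objects that overload | and &, the same reduce builds OR/AND expressions):

from functools import reduce as func_reduce  # assumed alias
from operator import or_ as operator_or      # assumed alias
from operator import and_ as operator_and    # assumed alias

expressions = [{1, 2}, {2, 3}, {3, 4}]
print(func_reduce(operator_or, expressions))   # {1, 2, 3, 4}
print(func_reduce(operator_and, expressions))  # set()
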
Example #7
 def _conjunction_op(self, spec, *expressions):
     res = func_reduce(and_operator, expressions)
     return res