def LinearNd(out_data,  # ndim x out_size
             in_data,  # ndim x in_size
             weight,  # out_size x in_size
             bias=None,  # out_size
             rigor=False,
             verbose=False):
    """
    Returns True on success, otherwize returns False
    Applies a 1D matrix multiplication over an input data data.
    Note that all nd-array lists are NumPy (mutable), not PyTorch tensor (immutable).
    :param out_data: <mutable> output data, out_data[ndim][out_size]
    :param in_data: input data, in_data[ndim][in_size]
    :param weight: weight[out_size][in_size]
    :param bias: bias for each output, bias[out_size]
    :param rigor: check values rigorously when 'True'
    :param verbose: output message more when 'True'
    :return: 'True' on success, 'False' on failure.
    Follwoings are derived from input arguments
    . ndim: first dimension of out/in_data
    . out_size: array size of out_data
    . in_size: array size of in_data
    . weight_size: dimension of weight
    . bias_size: array size of bias
    Following is an example usage for PyTorch.
        LinearNd( tensor_out_data.data.numpy() # ndim x out_size
                    , tenso_in_data.data.numpy()   # ndim x in_size
                    , tensor_weight.data.numpy()   # out_size x in_size
                    , tensor_bias.data.numpy()     # out_size
                    , rigor=True
                    , verbose=True)
    """
    if rigor:
        error = 0
        if (out_data.ndim != 2):
            error += 1
            if verbose:
                dlr_common.DpuError("out_data is not 2 dim", flush=True)
        if (in_data.ndim != 2):
            error += 1
            if verbose: dlr_common.DpuError("in_data is not 2 dim", flush=True)
        if (weight.ndim != 2):
            error += 1
            if verbose: dlr_common.DpuError("weight is not 2 dim", flush=True)
        if (bias is not None) and (bias.ndim != 1):
            error += 1
            if verbose:
                dlr_common.DpuError(f"bias should be 1 dim: {bias.ndim}",
                                    flush=True)
        t_out_ndim = out_data.shape[0]
        t_out_size = out_data.shape[1]  # note ndim (i.e., rank) is 2
        t_in_ndim = in_data.shape[0]
        t_in_size = in_data.shape[1]  # note ndim (i.e., rank) is 2
        t_weight_size_row = weight.shape[0]  # note ndim (i.e., rank) is 2
        t_weight_size_col = weight.shape[1]  # note ndim (i.e., rank) is 2
        if (t_out_ndim != t_in_ndim):
            error += 1
            dlr_common.DpuError("ndim mis-match", flush=True)
        if (t_out_size != t_weight_size_row):
            error += 1
            dlr_common.DpuError("out_size mis-match", flush=True)
        if (t_in_size != t_weight_size_col):
            error += 1
            dlr_common.DpuError("in_size mis-match", flush=True)
        if verbose:
            dlr_common.DpuInfo(f"out_size   ={t_out_size} {out_data.shape}")
            dlr_common.DpuInfo(f"in_size    ={t_in_size} {in_data.shape}")
            dlr_common.DpuInfo(
                f"weight_size={t_weight_size_row} {t_weight_size_col}")
        if (error != 0):
            dlr_common.DpuError(" parameter mis-match", flush=True)
            return False
    if out_data.dtype.type == np.int32:
        _fname = 'LinearNdInt'
        _ctype = ctypes.c_int
    elif out_data.dtype.type == np.float32:
        _fname = 'LinearNdFloat'
        _ctype = ctypes.c_float
    elif out_data.dtype.type == np.float64:
        _fname = 'LinearNdDouble'
        _ctype = ctypes.c_double
    else:
        dlr_common.DpuError(" not support " + str(out_data.dtype.type),
                            flush=True)
        return False
    _LinearNd = dlr_common.WrapFunction(
        dlr_common._dlr,
        _fname,
        None,  # return type
        [
            ctypes.POINTER(_ctype),  # out_data
            ctypes.POINTER(_ctype),  # in_data
            ctypes.POINTER(_ctype),  # weight
            ctypes.POINTER(_ctype),  # bias
            ctypes.c_ushort,  # out_size
            ctypes.c_ushort,  # in_size
            ctypes.c_ushort,  # bias_size
            ctypes.c_ubyte,  # ndim
            ctypes.c_int,  # rigor
            ctypes.c_int,  # verbose
        ])
    CP_out_data = out_data.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_in_data = in_data.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_weight = weight.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_out_size = ctypes.c_ushort(
        out_data.shape[1])  # note ndim (i.e., rank) is 2
    CP_in_size = ctypes.c_ushort(
        in_data.shape[1])  # note ndim (i.e., rank) is 2
    CP_ndim = ctypes.c_ubyte(in_data.shape[0])  # note ndim (i.e., rank) is 2
    CP_rigor = 1 if rigor else 0
    CP_verbose = 1 if verbose else 0
    if (bias is None) or (bias.size == 0):
        CP_bias = ctypes.POINTER(_ctype)()
        CP_bias_size = ctypes.c_ushort(0)
    else:
        CP_bias = bias.ctypes.data_as(ctypes.POINTER(_ctype))
        CP_bias_size = ctypes.c_ushort(bias.shape[0])
    _LinearNd(CP_out_data, CP_in_data, CP_weight, CP_bias, CP_out_size,
              CP_in_size, CP_bias_size, CP_ndim, CP_rigor, CP_verbose)
    return True
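# A minimal usage sketch for LinearNd; '_demo_linear_nd' is a hypothetical
# helper (not part of the library) and assumes dlr_common's shared library
# implements torch.nn.Linear semantics: out = in @ weight.T + bias.
def _demo_linear_nd():
    ndim, in_size, out_size = 4, 8, 16
    in_data = np.random.rand(ndim, in_size).astype(np.float32)
    weight = np.random.rand(out_size, in_size).astype(np.float32)
    bias = np.zeros(out_size, dtype=np.float32)
    out_data = np.empty((ndim, out_size), dtype=np.float32)
    if LinearNd(out_data, in_data, weight, bias, rigor=True):
        # compare against the NumPy reference result
        print(np.allclose(out_data, in_data @ weight.T + bias, atol=1e-5))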
def Pooling2dMax(out_data,  # out_channel x out_size x out_size
                 in_data,  # in_channel x in_size x in_size
                 kernel_size,  # kernel_size x kernel_size
                 stride=1,
                 padding=0,
                 ceil_mode=False,
                 rigor=False,
                 verbose=False):
    """
    Returns True on success, otherwize returns False
    Applies a 2D mAXpolling over an input data composed of several input channels.
    Note that all nd-array lists are NumPy (mutable), not PyTorch tensor (immutable).
    :param out_data: <mutable> output data, out_data[out_channel][out_size][out_size]
    :param in_data: input data, in_data[in_channel][in_size][in_size]
    :param kernel_size:
    :param stride: num of skips to apply next filter
    :param padding: num of pixes at the boundary
    :param ceil_mode: use floor() when false, otherwize ceil()
    :param rigor: check values rigorously when 'True'
    :param verbose: output message more when 'True'
    :return: 'True' on success, 'False' on failure.
    Follwoings are derived from input arguments
    . out_size: array size of out_data
    . in_size: array size of in_data
    . in_chnannels: num of input channels
    . out_channels: num of output channels (it should be the same as in_channel)
    Following is an example usage for PyTorch.
        Pooling2dMax( tensor_out_data.data.numpy() # out_channel x out_size x out_size
                        , tenso_in_data.data.numpy()   # in_channel x in_size x in_size
                        , kernel_size
                        , stride
                        , padding
                        , rigor=True
                        , verbose=True)
    """
    if rigor:
        error = 0
        if (out_data.ndim != 3):
            error += 1
            if verbose: dlr_common.DpuError("out_data is not 3 dim")
        if (in_data.ndim != 3):
            error += 1
            if verbose: dlr_common.DpuError("in_data is not 3 dim")
        if (kernel_size < 2):
            error += 1
            if verbose: dlr_common.DpuError("kernel_size should be >=2")
        if (stride < 1):
            error += 1
            if verbose: dlr_common.DpuError("stride should be >=1")
        if (padding < 0):
            error += 1
            if verbose: dlr_common.DpuError("padding should be >=0")
        t_out_size = out_data.shape[2]  # note ndim (i.e., rank) is 3
        t_in_size = in_data.shape[2]  # note ndim (i.e., rank) is 3
        t_kernel_size = kernel_size
        t_in_channel = in_data.shape[0]
        t_out_channel = out_data.shape[0]
        t_stride = stride
        t_padding = padding
        if (t_in_channel != t_out_channel):
            error += 1
            if verbose:
                dlr_common.DpuError("in/out channel should be the same")
        status, t_out_size_expect = GetOutputSizeOfPooling2dMax(
            t_in_size, t_kernel_size, t_stride, t_padding)
        if not status: return False  # something wrong with arguments
        if (t_out_size != t_out_size_expect):
            error += 1
            if verbose:
                dlr_common.DpuError(
                    f"out_size mis-match {t_out_size} {t_out_size_expect}")
        if ((t_kernel_size % 2) == 1):
            error += 1
            if verbose:
                dlr_common.DpuError(
                    f"kernel_size should be even: {t_kernel_size}")
        if verbose:
            dlr_common.DpuInfo(f"out_channel={t_out_channel} {out_data.shape}")
            dlr_common.DpuInfo(f"in_channel ={t_in_channel} {in_data.shape}")
            dlr_common.DpuInfo(f"out_size   ={t_out_size} {out_data.shape}")
            dlr_common.DpuInfo(f"in_size    ={t_in_size} {in_data.shape}")
            dlr_common.DpuInfo(f"kernel_size={t_kernel_size}")
            dlr_common.DpuInfo(f"stride     ={t_stride} {stride}")
            dlr_common.DpuInfo(f"padding    ={t_padding} {padding}")
        if (error != 0):
            dlr_common.DpuError("parameter mis-match")
            return False
    if out_data.dtype.type == np.int32:
        _fname = 'Pooling2dMaxInt'
        _ctype = ctypes.c_int
    elif out_data.dtype.type == np.float32:
        _fname = 'Pooling2dMaxFloat'
        _ctype = ctypes.c_float
    elif out_data.dtype.type == np.float64:
        _fname = 'Pooling2dMaxDouble'
        _ctype = ctypes.c_double
    else:
        dlr_common.DpuError("not support " + str(out_data.dtype.type))
        return False
    _Pooling2dMax = dlr_common.WrapFunction(
        dlr_common._dlr,
        _fname,
        None,  # return type
        [
            ctypes.POINTER(_ctype),  # output features
            ctypes.POINTER(_ctype),  # input image
            ctypes.c_ushort,  # out_size
            ctypes.c_ushort,  # in_size
            ctypes.c_ubyte,  # kernel_size (only for square filter)
            ctypes.c_ushort,  # channel
            ctypes.c_ubyte,  # stride
            ctypes.c_ubyte,  # padding
            ctypes.c_int,  # ceil_mode
            ctypes.c_int,  # rigor
            ctypes.c_int,  # verbose
        ])
    CP_out_data = out_data.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_in_data = in_data.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_out_size = ctypes.c_ushort(
        out_data.shape[2])  # note ndim (i.e., rank) is 3
    CP_in_size = ctypes.c_ushort(
        in_data.shape[2])  # note ndim (i.e., rank) is 3
    CP_kernel_size = ctypes.c_ubyte(kernel_size)
    CP_channel = ctypes.c_ushort(in_data.shape[0])
    CP_stride = ctypes.c_ubyte(stride)
    CP_padding = ctypes.c_ubyte(padding)
    CP_ceil_mode = 1 if ceil_mode else 0
    CP_rigor = 1 if rigor else 0
    CP_verbose = 1 if verbose else 0

    _Pooling2dMax(CP_out_data, CP_in_data, CP_out_size, CP_in_size,
                  CP_kernel_size, CP_channel, CP_stride, CP_padding,
                  CP_ceil_mode, CP_rigor, CP_verbose)
    return True
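# A minimal usage sketch for Pooling2dMax; '_demo_pooling2d_max' is a
# hypothetical helper (not part of the library). Shapes follow the
# out_size = (in_size + 2*padding - kernel_size)//stride + 1 relation.
def _demo_pooling2d_max():
    channel, in_size = 3, 8
    kernel_size, stride, padding = 2, 2, 0
    out_size = (in_size + 2 * padding - kernel_size) // stride + 1
    in_data = np.random.rand(channel, in_size, in_size).astype(np.float32)
    out_data = np.empty((channel, out_size, out_size), dtype=np.float32)
    # each 2x2 window of each channel reduces to its maximum value
    return Pooling2dMax(out_data, in_data, kernel_size,
                        stride=stride, padding=padding, rigor=True)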
def Deconvolution2d(out_data,  # out_channel x out_size x out_size
                    in_data,  # in_channel x in_size x in_size
                    kernel,  # in_channel x out_channel x kernel_size x kernel_size
                    bias=None,  # out_channel
                    stride=1,
                    padding=0,
                    rigor=False,
                    verbose=False):
    """
    Returns True on success, otherwize returns False
    Applies a 2D deconvolution (transpose convolution) over an input data composed of several input channels.
    Note that all nd-array lists are NumPy (mutable), not PyTorch tensor (immutable).
    :param out_data: <mutable> output data, out_data[out_channel][out_size][out_size]
    :param in_data: input data, in_data[in_channel][in_size][in_size]
    :param kernel: kernel (or filter), kernel[in_channel][out_channel][kernel_size][kernel_size]
    :param bias: bias for each filter (kernel), bias[out_channel]
    :param stride: num of skips to apply next filter
    :param padding: num of pixes at the boundary
    :param rigor: check values rigorously when 'True'
    :param verbose: output message more when 'True'
    :return: 'True' on success, 'False' on failure.
    Follwoings are derived from input arguments
    . out_size: array size of out_data
    . in_size: array size of in_data
    . kernel_size: dimension of filter, e.g., 3 means 3x3 kernel
    . in_chnannels: num of input channels, e.g., 3 for RGB, 1 for gray
    . out_channels: num of filters
    . bias_size: array size of bias
    Following is an example usage for PyTorch.
        deconvolution2d( tensor_out_data.data.numpy()
                       , tenso_in_data.data.numpy()
                       , tensor_kernel.data.numpy()
                       , tensor_bias.data.numpy()
                       , stride
                       , padding
                       , rigor=True
                       , verbose=True)
    """
    if rigor or dlr_common.rigor:
        error = 0
        if (out_data.ndim != 3):
            error += 1
            if verbose:
                dlr_common.DpuError("out_data is not 3 dim", flush=True)
        if (in_data.ndim != 3):
            error += 1
            if verbose: dlr_common.DpuError("in_data is not 3 dim", flush=True)
        if (kernel.ndim != 4):
            error += 1
            if verbose: dlr_common.DpuError("kernel is not 4 dim", flush=True)
        if (bias is not None) and (bias.ndim != 1):
            error += 1
            if verbose:
                dlr_common.DpuError(f"bias should be 1 dim: {bias.ndim}",
                                    flush=True)
        if (stride < 1):
            error += 1
            if verbose:
                dlr_common.DpuError(f"stride should be >=1: {stride}",
                                    flush=True)
        if (padding < 0):
            error += 1
            if verbose:
                dlr_common.DpuError(f"padding should be >=0: {padding}",
                                    flush=True)
        t_out_size = out_data.shape[2]  # note ndim (i.e., rank) is 3
        t_in_size = in_data.shape[2]  # note ndim (i.e., rank) is 3
        t_kernel_size = kernel.shape[3]  # note ndim (i.e., rank) is 4
        t_in_channel = in_data.shape[0]
        t_out_channel = out_data.shape[0]
        t_stride = stride
        t_padding = padding
        status, t_out_size_expect = GetOutputSizeOfDeconvolution2d(
            in_size=t_in_size,
            kernel_size=t_kernel_size,
            stride=t_stride,
            padding=t_padding,
            output_padding=0,
            dilation=1,
            rigor=rigor,
            verbose=verbose)
        if not status: return False  # something wrong with arguments
        if (t_out_size != t_out_size_expect):
            error += 1
            dlr_common.DpuError(
                f"out_size mis-match: {t_out_size} {t_out_size_expect}",
                flush=True)
        if ((t_kernel_size % 2) != 1):
            error += 1
            dlr_common.DpuError(f"kernel_size should be odd: {t_kernel_size}",
                                flush=True)
        if verbose:
            dlr_common.DpuInfo(f"out_channel={t_out_channel} {out_data.shape}")
            dlr_common.DpuInfo(f"in_channel ={t_in_channel} {in_data.shape}")
            dlr_common.DpuInfo(f"out_size   ={t_out_size} {out_data.shape}")
            dlr_common.DpuInfo(f"in_size    ={t_in_size} {in_data.shape}")
            dlr_common.DpuInfo(f"kernel_size={t_kernel_size} {kernel.shape}")
            dlr_common.DpuInfo(f"stride     ={t_stride} {stride}")
            dlr_common.DpuInfo(f"padding    ={t_padding} {padding}")
        if (error != 0):
            dlr_common.DpuError(" parameter mis-match", flush=True)
            return False
    if out_data.dtype.type == np.int32:
        _fname = 'Deconvolution2dInt'
        _ctype = ctypes.c_int
    elif out_data.dtype.type == np.float32:
        _fname = 'Deconvolution2dFloat'
        _ctype = ctypes.c_float
    elif out_data.dtype.type == np.float64:
        _fname = 'Deconvolution2dDouble'
        _ctype = ctypes.c_double
    else:
        dlr_common.DpuError(" not support " + str(out_data.dtype.type),
                            flush=True)
        return False
    _Deconv2d = dlr_common.WrapFunction(
        dlr_common._dlr,
        _fname,
        None,  # return type
        [
            ctypes.POINTER(_ctype),  # output features
            ctypes.POINTER(_ctype),  # input image
            ctypes.POINTER(_ctype),  # kernels
            ctypes.POINTER(_ctype),  # bias
            ctypes.c_ushort,  # out_size
            ctypes.c_ushort,  # in_size
            ctypes.c_ubyte,  # kernel_size (only for square filter)
            ctypes.c_ushort,  # bias_size
            ctypes.c_ushort,  # in_channel
            ctypes.c_ushort,  # out_channel
            ctypes.c_ubyte,  # stride
            ctypes.c_ubyte,  # padding
            ctypes.c_int,  # rigor
            ctypes.c_int,  # verbose
        ])
    CP_out_data = out_data.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_in_data = in_data.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_kernel = kernel.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_out_size = ctypes.c_ushort(
        out_data.shape[2])  # note ndim (i.e., rank) is 3
    CP_in_size = ctypes.c_ushort(
        in_data.shape[2])  # note ndim (i.e., rank) is 3
    CP_kernel_size = ctypes.c_ubyte(
        kernel.shape[3])  # note ndim (i.e., rank) is 4
    CP_in_channel = ctypes.c_ushort(in_data.shape[0])  # kernel.shape[0]
    CP_out_channel = ctypes.c_ushort(kernel.shape[1])
    CP_stride = ctypes.c_ubyte(stride)
    CP_padding = ctypes.c_ubyte(padding)
    CP_rigor = 1 if rigor else 0
    CP_verbose = 1 if verbose else 0
    if (bias is None) or (bias.size == 0):
        CP_bias = ctypes.POINTER(_ctype)()
        CP_bias_size = ctypes.c_ushort(0)
    else:
        CP_bias = bias.ctypes.data_as(ctypes.POINTER(_ctype))
        CP_bias_size = ctypes.c_ushort(bias.shape[0])
    _Deconv2d(CP_out_data, CP_in_data, CP_kernel, CP_bias, CP_out_size,
              CP_in_size, CP_kernel_size, CP_bias_size, CP_in_channel,
              CP_out_channel, CP_stride, CP_padding, CP_rigor, CP_verbose)
    return True
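# A minimal usage sketch for Deconvolution2d; '_demo_deconvolution2d' is a
# hypothetical helper (not part of the library). With kernel_size=3, stride=2,
# and padding=1, a 5x5 input upsamples to a 9x9 output.
def _demo_deconvolution2d():
    in_channel, out_channel, in_size = 2, 4, 5
    kernel_size, stride, padding = 3, 2, 1
    out_size = (in_size - 1) * stride - 2 * padding + kernel_size  # 9
    in_data = np.random.rand(in_channel, in_size, in_size).astype(np.float32)
    kernel = np.random.rand(in_channel, out_channel, kernel_size,
                            kernel_size).astype(np.float32)
    bias = np.zeros(out_channel, dtype=np.float32)
    out_data = np.empty((out_channel, out_size, out_size), dtype=np.float32)
    return Deconvolution2d(out_data, in_data, kernel, bias,
                           stride=stride, padding=padding, rigor=True)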
def Norm3dBatch(out_data,  # in_channel x <N dimension>
                in_data,  # in_channel x <N dimension>
                running_mean,  # in_channel
                running_var,  # in_channel
                scale=None,  # None or in_channel
                bias=None,  # None or in_channel
                epsilon=1E-5,
                rigor=False,
                verbose=False):
    """
    Returns True on success, otherwize returns False
    Applies a 1D matrix multiplication over an input data data.
    Note that all nd-array lists are NumPy (mutable), not PyTorch tensor (immutable).
    :param out_data: <mutable> output data, out_data[ndim][out_size]
    :param in_data: input data, in_data[ndim][in_size]
    :param weight: weight[out_size][in_size]
    :param bias: bias for each output, bias[out_size]
    :param rigor: check values rigorously when 'True'
    :param verbose: output message more when 'True'
    :return: 'True' on success, 'False' on failure.
    Follwoings are derived from input arguments
    . ndim: first dimension of out/in_data
    . out_size: array size of out_data
    . in_size: array size of in_data
    . weight_size: dimension of weight
    . bias_size: array size of bias
    Following is an example usage for PyTorch.
        Norm2dBatch( tensor_out_data.data.numpy() # ndim x out_size
                       , tenso_in_data.data.numpy()   # ndim x in_size
                       , tensor_running_mean.data.numpy()   # out_size x in_size
                       , tensor_running_var.data.numpy()     # out_size
                       , tensor_scale.data.numpy()     # out_size
                       , tensor_bias.data.numpy()     # out_size
                       , epsilon
                       , rigor=True
                       , verbose=True)
    """
    if rigor:
        error = 0
        if (out_data.ndim != in_data.ndim):
            error += 1
            if verbose:
                dlr_common.DpuError("out_data in_data dimension mis-match",
                                    flush=True)
        if (running_mean.ndim != 1):
            error += 1
            if verbose:
                dlr_common.DpuError("running_mean dimension mis-match",
                                    flush=True)
        if (running_mean.size != in_data.shape[0]):
            error += 1
            if verbose:
                dlr_common.DpuError("running_mean size mis-match", flush=True)
        if (running_var.ndim != 1):
            error += 1
            if verbose:
                dlr_common.DpuError("running_var dimension mis-match",
                                    flush=True)
        if (running_var.size != in_data.shape[0]):
            error += 1
            if verbose:
                dlr_common.DpuError("running_var size mis-match", flush=True)
        if (scale is not None) and (scale.ndim != 1):
            error += 1
            if verbose:
                dlr_common.DpuError(f"scale should be 1 dim: {scale.ndim}",
                                    flush=True)
        if (bias is not None) and (bias.ndim != 1):
            error += 1
            if verbose:
                dlr_common.DpuError(f"bias should be 1 dim: {bias.ndim}",
                                    flush=True)
        t_out_channel = out_data.shape[0]
        t_out_size = out_data.size // t_out_channel
        t_in_channel = in_data.shape[0]
        t_in_size = in_data.size // t_in_channel
        if (t_out_channel != t_in_channel):
            error += 1
            dlr_common.DpuError("channel mis-match", flush=True)
        if (t_out_size != t_in_size):
            error += 1
            dlr_common.DpuError("size mis-match", flush=True)
        if verbose:
            dlr_common.DpuInfo(f"out_data   ={out_data.shape}")
            dlr_common.DpuInfo(f"in_data    ={in_data.shape}")
        if (error != 0):
            dlr_common.DpuError("parameter mis-match", flush=True)
            return False
    if out_data.dtype.type == np.int32:
        _fname = 'Norm3dBatchInt'
        _ctype = ctypes.c_int
    elif out_data.dtype.type == np.float32:
        _fname = 'Norm3dBatchFloat'
        _ctype = ctypes.c_float
    elif out_data.dtype.type == np.float64:
        _fname = 'Norm3dBatchDouble'
        _ctype = ctypes.c_double
    else:
        dlr_common.DpuError(" not support " + str(out_data.dtype.type),
                            flush=True)
        return False
    _Norm3dBatch = dlr_common.WrapFunction(
        dlr_common._dlr,
        _fname,
        None,  # return type
        [
            ctypes.POINTER(_ctype),  # out_data
            ctypes.POINTER(_ctype),  # in_data
            ctypes.POINTER(_ctype),  # running_mean
            ctypes.POINTER(_ctype),  # running_var
            ctypes.POINTER(_ctype),  # scale
            ctypes.POINTER(_ctype),  # bias
            ctypes.c_uint,  # in_size
            ctypes.c_ushort,  # scale_size
            ctypes.c_ushort,  # bias_size
            ctypes.c_ushort,  # in_channel
            ctypes.c_float,  # epsilon
            ctypes.c_int,  # rigor
            ctypes.c_int,  # verbose
        ])
    in_channel = in_data.shape[0]
    in_size = int(in_data.size / in_channel)  # num of elements per channel
    CP_out_data = out_data.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_in_data = in_data.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_running_mean = running_mean.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_running_var = running_var.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_in_size = ctypes.c_uint(in_size)
    CP_in_channel = ctypes.c_ushort(in_channel)
    CP_epsilon = ctypes.c_float(epsilon)
    CP_rigor = 1 if rigor else 0
    CP_verbose = 1 if verbose else 0
    if (scale is None) or (scale.size == 0):
        CP_scale = ctypes.POINTER(_ctype)()
        CP_scale_size = ctypes.c_ushort(0)
    else:
        CP_scale = scale.ctypes.data_as(ctypes.POINTER(_ctype))
        CP_scale_size = ctypes.c_ushort(scale.shape[0])
    if (bias is None) or (bias.size == 0):
        CP_bias = ctypes.POINTER(_ctype)()
        CP_bias_size = ctypes.c_ushort(0)
    else:
        CP_bias = bias.ctypes.data_as(ctypes.POINTER(_ctype))
        CP_bias_size = ctypes.c_ushort(bias.shape[0])
    _Norm3dBatch(CP_out_data, CP_in_data, CP_running_mean, CP_running_var,
                 CP_scale, CP_bias, CP_in_size, CP_scale_size, CP_bias_size,
                 CP_in_channel, CP_epsilon, CP_rigor, CP_verbose)
    return True
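# A minimal usage sketch for Norm3dBatch; '_demo_norm3d_batch' is a
# hypothetical helper (not part of the library), assuming inference-mode
# batch-norm semantics as noted above.
def _demo_norm3d_batch():
    channel, height, width = 3, 4, 4
    in_data = np.random.rand(channel, height, width).astype(np.float32)
    out_data = np.empty_like(in_data)
    running_mean = in_data.reshape(channel, -1).mean(axis=1)
    running_var = in_data.reshape(channel, -1).var(axis=1)
    if Norm3dBatch(out_data, in_data, running_mean, running_var, rigor=True):
        # each channel of out_data should now have near-zero mean
        print(out_data.reshape(channel, -1).mean(axis=1))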
def Concat2d(out_data,  # out_rows x out_cols
             in_dataA,  # in_rowsA x in_colsA
             in_dataB,  # in_rowsB x in_colsB
             dim=0,
             rigor=False,
             verbose=False):
    """
    Returns True on success, otherwize returns False
    Applies a 2D Concatenation over two 2-dimensional input data
    Note that all nd-array lists are NumPy (mutable), not PyTorch tensor (immutable).
    :param out_data: <mutable> output data, out_data[][]
    :param in_dataA: input data, in_dataA[in_rowsA][in_colsA]
    :param in_dataB: input data, in_dataB[in_rowsB][in_colsB]
    :param dim: dimension to concatenate, 0 or 1
    :param rigor: check values rigorously when 'True'
    :param verbose: output message more when 'True'
    :return: 'True' on success, 'False' on failure.
    Follwoings are derived from input arguments
    . out_rows:
    . out_cols:
    . in_rowsA:
    . in_colsA:
    . in_rowsB:
    . in_colsB:
    . dim:
    Following is an example usage for PyTorch.
        Concat2d( tensor_out_data.data.numpy()
                    , tenso_in_dataA.data.numpy()
                    , tenso_in_dataB.data.numpy()
                    , dim
                    , rigor=True
                    , verbose=True)
    """
    if rigor:
        error = 0
        if (out_data.ndim != 2):
            error += 1
            if verbose: dlr_common.DpuError("out_data is not 2 dim")
        if (in_dataA.ndim != 2):
            error += 1
            if verbose: dlr_common.DpuError("in_dataA is not 2 dim")
        if (in_dataB.ndim != 2):
            error += 1
            if verbose: dlr_common.DpuError("in_dataB is not 2 dim")
        if (dim != 0) and (dim != 1):
            error += 1
            if verbose: dlr_common.DpuError("dim should be 0 or 1")
        t_in_rowsA = in_dataA.shape[0]
        t_in_colsA = in_dataA.shape[1]
        t_in_rowsB = in_dataB.shape[0]
        t_in_colsB = in_dataB.shape[1]
        if dim == 0:
            t_out_rows = in_dataA.shape[0] + in_dataB.shape[0]
            t_out_cols = in_dataA.shape[1]
        else:
            t_out_rows = in_dataA.shape[0]
            t_out_cols = in_dataA.shape[1] + in_dataB.shape[1]
        if (t_out_rows != out_data.shape[0]):
            error += 1
            if verbose: dlr_common.DpuError("out data row count error")
        if (t_out_cols != out_data.shape[1]):
            error += 1
            if verbose: dlr_common.DpuError("out data column count error")
        if dim == 0:
            if (t_in_colsA != t_in_colsB):
                error += 1
                if verbose: dlr_common.DpuError("in column count mis-match")
        else:
            if (t_in_rowsA != t_in_rowsB):
                error += 1
                if verbose: dlr_common.DpuError("in row count mis-match")
        if verbose:
            dlr_common.DpuInfo(f"out_data={out_data.shape}")
            dlr_common.DpuInfo(f"in_dataA={in_dataA.shape}")
            dlr_common.DpuInfo(f"in_dataB={in_dataB.shape}")
            dlr_common.DpuInfo(f"dim     ={dim}")
        if (error != 0):
            dlr_common.DpuError("parameter mis-match")
            return False
    if out_data.dtype.type == np.int32:
        _fname = 'Concat2dInt'
        _ctype = ctypes.c_int
    elif out_data.dtype.type == np.float32:
        _fname = 'Concat2dFloat'
        _ctype = ctypes.c_float
    elif out_data.dtype.type == np.float64:
        _fname = 'Concat2dDouble'
        _ctype = ctypes.c_double
    else:
        dlr_common.DpuError("not support " + str(out_data.dtype.type))
        return False
    _Concat2d = dlr_common.WrapFunction(
        dlr_common._dlr,
        _fname,
        None,  # return type
        [
            ctypes.POINTER(_ctype),  # output
            ctypes.POINTER(_ctype),  # input A
            ctypes.POINTER(_ctype),  # input B
            ctypes.c_ushort,  # in_rowsA
            ctypes.c_ushort,  # in_colsA
            ctypes.c_ushort,  # in_rowsB
            ctypes.c_ushort,  # in_colsB
            ctypes.c_ubyte,  # dim
            ctypes.c_int,  # rigor
            ctypes.c_int,  # verbose
        ])
    CP_out_data = out_data.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_in_dataA = in_dataA.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_in_dataB = in_dataB.ctypes.data_as(ctypes.POINTER(_ctype))
    CP_in_rowsA = ctypes.c_ushort(in_dataA.shape[0])
    CP_in_colsA = ctypes.c_ushort(in_dataA.shape[1])
    CP_in_rowsB = ctypes.c_ushort(in_dataB.shape[0])
    CP_in_colsB = ctypes.c_ushort(in_dataB.shape[1])
    CP_dim = ctypes.c_ubyte(dim)
    CP_rigor = 1 if rigor else 0
    CP_verbose = 1 if verbose else 0

    _Concat2d(CP_out_data, CP_in_dataA, CP_in_dataB, CP_in_rowsA, CP_in_colsA,
              CP_in_rowsB, CP_in_colsB, CP_dim, CP_rigor, CP_verbose)
    return True
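# A minimal usage sketch for Concat2d; '_demo_concat2d' is a hypothetical
# helper (not part of the library). Stacks a 2x5 and a 3x5 array row-wise.
def _demo_concat2d():
    in_dataA = np.random.rand(2, 5).astype(np.float32)
    in_dataB = np.random.rand(3, 5).astype(np.float32)
    out_data = np.empty((5, 5), dtype=np.float32)
    if Concat2d(out_data, in_dataA, in_dataB, dim=0, rigor=True):
        # compare against the NumPy reference result
        print(np.array_equal(out_data,
                             np.concatenate((in_dataA, in_dataB), axis=0)))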