def _get_oned_copy_kernel(dtype, shape): copy = r""" __global__ void copy_oned(%(type)s* out, const %(type)s* in, int dim, long long src_str, long long dst_str) { int tid_x = threadIdx.x; int idx = blockIdx.x; idx = (idx << 5) + tid_x; const %(type)s* in0 = in + (src_str * idx); %(type)s* out0 = out + (dst_str * idx); if(idx < dim) *out0 = *in0; } """ code = copy % dict( type=_get_register_type(dtype, memory=True) ) # print code module = SourceModule(code) kernel = module.get_function("copy_oned") kernel.prepare("PPIqq") kernel.grid = (_ceil_div(shape[0], 32), 1, 1) kernel.block = (32, 1, 1) kernel.args = (shape[0], ) return kernel
def __init__(self, transformer, op): super(PoolFpropKernel, self).__init__(transformer) (self.I, ) = (_ for _ in op.call_info()) self.O = op.tensor_description() self.dtype = self.O.dtype self.index = op.index if self.dtype.type is np.float16: clss = "hpool" elif self.dtype.type is np.float32: clss = "spool" else: raise TypeError("Type not supported {}".format(clss)) C, D, H, W, _ = self.I.axes.lengths K, M, P, Q, N = self.O.axes.lengths J, T, R, S, pool_op = itemgetter(*('J', 'T', 'R', 'S', 'op'))(op.pool_params) pad_c, pad_d, pad_h, pad_w = \ itemgetter(*('pad_' + s for s in ('c', 'd', 'h', 'w')))(op.pool_params) str_c, str_d, str_h, str_w = \ itemgetter(*('str_' + s for s in ('c', 'd', 'h', 'w')))(op.pool_params) # default to non-overlapping if str_c is None: str_c = J if str_d is None: str_d = T if str_h is None: str_h = R if str_w is None: str_w = S self.overlap = 1.0 # TODO: detect other forms of gaps if str_c > J or str_d > T or str_h > R or str_w > S: self.gaps = 1 else: self.gaps = 0 self.op = pool_op self.C = C self.K = K self.M = M self.P = P self.Q = Q self.JTRS = (J, T, R, S) self.DHW = (D, H, W) self.MPQ = (M, P, Q) self.padding = (pad_c, pad_d, pad_h, pad_w) self.strides = (str_c, str_d, str_h, str_w) self.dimI = (C, D, H, W, N) self.dimO = (K, M, P, Q, N) self.dimF2 = None self.dimI2 = (C * D * H * W, N) self.dimO2 = (K * M * P * Q, N) self.sizeI = np.product(self.dimI) self.sizeO = np.product(self.dimO) self.nOut = np.product(self.MPQ) * K # precompute some multiplications for fast constant memory access WN = W * N HWN = H * WN DHWN = D * HWN RS = R * S RST = T * RS JRST = J * RST QN = Q * N PQN = P * QN MPQN = M * PQN assert JRST + 32 < 2**16, "Integer division is faster with 16bit numerators" sb_large = { #SB shlP maskP shrP shlQ maskQ shrQ maskN shrN 1: (0, 0x00, 0, 0, 0x00, 0, 0xfff, 32), # 1x1 nnnnn 2: (0, 0x00, 0, 1, 0x10, 4, 0x00f, 4), # 1x2 xnnnn 4: (0, 0x00, 0, 2, 0x18, 3, 0x007, 3), # 1x4 xxnnn 8: (0, 0x00, 0, 3, 0x1c, 2, 0x003, 2), # 1x8 xxxnn 16: (0, 0x00, 0, 4, 0x1e, 1, 0x001, 1), # 1x16 xxxxn 32: (0, 0x00, 0, 5, 0x1f, 0, 0x000, 0), # 1x32 xxxxx } sb_medium = { #SB shlP maskP shrP shlQ maskQ shrQ maskN shrN 8: (1, 0x10, 4, 2, 0x0c, 2, 0x003, 2), # 2x4 yxxnn 16: (1, 0x10, 4, 3, 0x0e, 1, 0x001, 1), # 2x8 yxxxn 32: (1, 0x10, 4, 4, 0x0f, 0, 0x000, 0), # 2x16 yxxxx } sb_small = { #SB shlP maskP shrP shlQ maskQ shrQ maskN shrN 16: (2, 0x18, 3, 2, 0x06, 1, 0x001, 1), # 4x4 yyxxn 32: (2, 0x18, 3, 3, 0x07, 0, 0x000, 0), # 4x8 yyxxx } if N == 1: super_block = 0 elif N < 32: super_block = len(bin(N - 1)) - 2 else: super_block = 5 super_block = 1 << (5 - super_block) # try to minimize the zero overlap in the superblock # but maximize the x dim of the superblock for more contiguous memory access if super_block < 8 or Q > 64: sb_params = sb_large.get(super_block) elif super_block < 16 or Q > 32: sb_params = sb_medium.get(super_block) else: sb_params = sb_small.get(super_block) supP = _ceil_div(P, 1 << sb_params[0]) supQ = _ceil_div(Q, 1 << sb_params[3]) # precompute the magic numbers and shift amounts for integer division magic_RST = _magic32(JRST + 32, RST) magic_RS = _magic32(RST + 32, RS) magic_S = _magic32(RS + 32, S) magic_P = _magic32(M * supP, supP) fprop_name = "fprop_" + pool_op threads = 32 if super_block > 1 else N self.fprop_kernel = [ fprop_name, (supQ, supP * M, K), (threads, 1, 1), _flatten([ N, W, H, D, C, WN, HWN, DHWN, P, Q, magic_P, QN, PQN, MPQN, pad_c, pad_d, pad_h, pad_w, str_c, str_d, str_h, str_w, S, RS, RST, JRST, magic_S, magic_RS, magic_RST, supP, supQ, sb_params ]) ] lut_size = JRST if lut_size % 4 != 0: lut_size += 4 - lut_size % 4 self.fprop_lut_size = super_block * lut_size * 4 self.kernel = pooling.map_string2func( self.fprop_kernel[0], self.dtype.str[1:], self.transformer.runtime.compute_capability)
def __init__(self, transformer, op): super(PoolBpropKernel, self).__init__(transformer) (self.I, ) = (_ for _ in op.call_info()) self.O = op.tensor_description() self.dtype = self.O.dtype self.op = op if not (self.dtype.type in self.supported_types): raise TypeError("Type not supported: {}".format(self.dtype.type)) C, D, H, W, _ = self.O.axes.lengths K, M, P, Q, N = self.I.axes.lengths J, T, R, S, pool_op = itemgetter(*('J', 'T', 'R', 'S', 'op'))(op.pool_params) pad_c, pad_d, pad_h, pad_w = \ itemgetter(*('pad_' + s for s in ('c', 'd', 'h', 'w')))(op.pool_params) str_c, str_d, str_h, str_w = \ itemgetter(*('str_' + s for s in ('c', 'd', 'h', 'w')))(op.pool_params) # default to non-overlapping if str_c is None: str_c = J if str_d is None: str_d = T if str_h is None: str_h = R if str_w is None: str_w = S self.overlap = 1.0 # TODO: detect other forms of gaps if str_c > J or str_d > T or str_h > R or str_w > S: self.gaps = 1 else: self.gaps = 0 self.pool_op = pool_op self.C = C self.K = K self.M = M self.P = P self.Q = Q self.JTRS = (J, T, R, S) self.DHW = (D, H, W) self.MPQ = (M, P, Q) self.padding = (pad_c, pad_d, pad_h, pad_w) self.strides = (str_c, str_d, str_h, str_w) self.dimI = (C, D, H, W, N) self.dimO = (K, M, P, Q, N) self.dimF2 = None self.dimI2 = (C * D * H * W, N) self.dimO2 = (K * M * P * Q, N) self.sizeI = np.product(self.dimI) self.sizeO = np.product(self.dimO) self.nOut = np.product(self.MPQ) * K # precompute some multiplications for fast constant memory access WN = W * N HWN = H * WN DHWN = D * HWN DH = D * H RS = R * S RST = T * RS JRST = J * RST QN = Q * N PQN = P * QN MPQN = M * PQN assert JRST + 32 < 2**16, "Integer division is faster with 16bit numerators" sb_large = { #SB shlP maskP shrP shlQ maskQ shrQ maskN shrN 1 : (0, 0x00, 0, 0, 0x00, 0, 0xfff, 32), # 1x1 nnnnn 2 : (0, 0x00, 0, 1, 0x10, 4, 0x00f, 4), # 1x2 xnnnn 4 : (0, 0x00, 0, 2, 0x18, 3, 0x007, 3), # 1x4 xxnnn 8 : (0, 0x00, 0, 3, 0x1c, 2, 0x003, 2), # 1x8 xxxnn 16 : (0, 0x00, 0, 4, 0x1e, 1, 0x001, 1), # 1x16 xxxxn 32 : (0, 0x00, 0, 5, 0x1f, 0, 0x000, 0), # 1x32 xxxxx } sb_medium = { #SB shlP maskP shrP shlQ maskQ shrQ maskN shrN 8 : (1, 0x10, 4, 2, 0x0c, 2, 0x003, 2), # 2x4 yxxnn 16 : (1, 0x10, 4, 3, 0x0e, 1, 0x001, 1), # 2x8 yxxxn 32 : (1, 0x10, 4, 4, 0x0f, 0, 0x000, 0), # 2x16 yxxxx } sb_small = { #SB shlP maskP shrP shlQ maskQ shrQ maskN shrN 16 : (2, 0x18, 3, 2, 0x06, 1, 0x001, 1), # 4x4 yyxxn 32 : (2, 0x18, 3, 3, 0x07, 0, 0x000, 0), # 4x8 yyxxx } if N == 1: super_block = 0 elif N < 32: super_block = len(bin(N - 1)) - 2 else: super_block = 5 super_block = 1 << (5 - super_block) # try to minimize the zero overlap in the superblock # but maximize the x dim of the superblock for more contiguous memory access if super_block < 8 or Q > 64: sb_params = sb_large.get(super_block) elif super_block < 16 or Q > 32: sb_params = sb_medium.get(super_block) else: sb_params = sb_small.get(super_block) supP = _ceil_div(P, 1 << sb_params[0]) supQ = _ceil_div(Q, 1 << sb_params[3]) # precompute the magic numbers and shift amounts for integer division magic_RST = _magic32(JRST + 32, RST) magic_RS = _magic32(RST + 32, RS) magic_S = _magic32(RS + 32, S) magic_P = _magic32(M * supP, supP) bprop_name = "bprop_" + pool_op threads = 32 if super_block > 1 else N lut_size = JRST if lut_size % 4 != 0: lut_size += 4 - lut_size % 4 self.bprop_lut_size = self.fprop_lut_size = super_block * lut_size * 4 if self.overlap > 0: # we have a special kernel to handle the overlapping avg pooling bprop_name += "_overlap" magic_str_w = _magic32(W + S, str_w) magic_str_h = _magic32(H + R, str_h) magic_str_d = _magic32(D + T, str_d) magic_str_c = _magic32(C + J, str_c) if super_block > 1: bprop_name += "_smallN" if super_block < 8 or W > 64: sb_params = sb_large.get(super_block) elif super_block < 16 or W > 32: sb_params = sb_medium.get(super_block) else: sb_params = sb_small.get(super_block) supH = _ceil_div(H, 1 << sb_params[0]) supW = _ceil_div(W, 1 << sb_params[3]) magic_H = _magic32(D * supH, supH) maxLutSize = \ _ceil_div(S, str_w) * \ _ceil_div(R, str_h) * \ _ceil_div(T, str_d) * \ _ceil_div(J, str_c) #neon_logger.display((supW, D*supH, C), sb_params, maxLutSize) self.bprop_kernel = [bprop_name, (supW, D * supH, C), (threads, 1, 1), _flatten([ N, W, H, D, C, WN, HWN, DHWN, magic_H, pad_w, pad_h, pad_d, pad_c, str_w, str_h, str_d, str_c, magic_str_w, magic_str_h, magic_str_d, magic_str_c, S, R, T, J, RS, RST, JRST, magic_S, magic_RS, magic_RST, Q, P, M, K, QN, PQN, MPQN, supH, supW, sb_params, maxLutSize])] lut_size = maxLutSize if lut_size % 4 != 0: lut_size += 4 - lut_size % 4 self.bprop_lut_size = super_block * lut_size * 4 * 2 else: # The overlap kernel can be much more efficient if we aren't doing superblocking magic_H = _magic32(DH, H) self.bprop_kernel = [bprop_name, (W, DH, C), (threads, 1, 1), _flatten([ N, W, H, D, C, WN, HWN, DHWN, magic_H, pad_w, pad_h, pad_d, pad_c, str_w, str_h, str_d, str_c, magic_str_w, magic_str_h, magic_str_d, magic_str_c, S, R, T, J, RS, RST, JRST, magic_S, magic_RS, magic_RST, Q, P, M, K, QN, PQN, MPQN])] self.bprop_lut_size = lut_size * 4 * 2 else: self.bprop_kernel = [bprop_name, (supQ, supP * M, K), (threads, 1, 1), _flatten([ N, W, H, D, C, WN, HWN, DHWN, P, Q, magic_P, QN, PQN, MPQN, pad_c, pad_d, pad_h, pad_w, str_c, str_d, str_h, str_w, S, RS, RST, JRST, magic_S, magic_RS, magic_RST, supP, supQ, sb_params])] self.kernel = pooling.map_string2func(self.bprop_kernel[0], self.dtype.str[1:], self.transformer.runtime.compute_capability)
def _get_copy_transpose_kernel(dtype, shape, axes=None): if len(shape) == 1: return _get_oned_copy_kernel(dtype, shape) src = list(range(len(shape))) dst = list(axes) src_dim = src[-1] dst_dim = dst[-1] # If the inner dim is the same for both, no need for shared memory tile # Then map the outer source dim to the threadIdx.y values if src_dim == dst_dim: dst_dim = src[0] shared_tile = False else: shared_tile = True src_offset = [] dst_offset = [] params = [] values = [] magic = "" # add dims for bounds checking for dim in (src_dim, dst_dim): params.append("int dim_%s" % dim) values.append(shape[dim]) # collapse src and dst shape by 32 grid_shape = list(shape) grid_shape[src_dim] = _ceil_div(shape[src_dim], 32) grid_shape[dst_dim] = _ceil_div(shape[dst_dim], 32) # get a src list without dst dim src2 = [s for s in src if s != dst_dim] # get the name of the first compound index blkx_name = compound_idx = "".join(native_str(x) for x in src2) # generate the magic number math to extract all indeces while len(src2) > 1: idx1 = src2[0] del src2[0] idx2 = "".join(native_str(i) for i in src2) div = reduce(mul, (grid_shape[i] for i in src2), 1) params.extend(p % idx2 for p in ("int magic_%s", "int shift_%s", "int div_%s")) values.extend(_magic64(div)) values.append(div) magic += r""" int idx_{1} = div64(idx_{0}, magic_{2}, shift_{2}); int idx_{2} = idx_{0} - idx_{1}*div_{2}; """.format(compound_idx, idx1, idx2) compound_idx = idx2 # Add params for src strides and generate src offset # The param values will be added externally for s in src: params.append("long long src_str_%d" % s) src_offset.append("src_str_%d*idx_%d" % (s, s)) # Add params for dst strides and generate dst offset for d in dst: params.append("long long dst_str_%d" % d) dst_offset.append("dst_str_%d*idx_%d" % (d, d)) num_strides = len(src) + len(dst) if shared_tile: copy_transpose = r""" %(common)s __global__ void copy_transpose(%(type)s* out, const %(type)s* in, %(params)s) { __shared__ %(type)s tile[32][33]; int tid_x = threadIdx.x; int tid_y = threadIdx.y; int idx_%(blk)s = blockIdx.x; int idx_%(dst)s = blockIdx.y; %(magic)s idx_%(src)s = (idx_%(src)s << 5) + tid_x; idx_%(dst)s = (idx_%(dst)s << 5) + tid_y; const %(type)s* in00 = in + %(src_offset)s; const %(type)s* in08 = in00 + src_str_%(dst)s*8; const %(type)s* in16 = in08 + src_str_%(dst)s*8; const %(type)s* in24 = in16 + src_str_%(dst)s*8; bool b%(src)s = idx_%(src)s < dim_%(src)s; if (idx_%(dst)s + 0 < dim_%(dst)s && b%(src)s) tile[tid_y + 0][tid_x] = *in00; if (idx_%(dst)s + 8 < dim_%(dst)s && b%(src)s) tile[tid_y + 8][tid_x] = *in08; if (idx_%(dst)s + 16 < dim_%(dst)s && b%(src)s) tile[tid_y + 16][tid_x] = *in16; if (idx_%(dst)s + 24 < dim_%(dst)s && b%(src)s) tile[tid_y + 24][tid_x] = *in24; __syncthreads(); %(type)s val00 = tile[tid_x][tid_y + 0]; %(type)s val08 = tile[tid_x][tid_y + 8]; %(type)s val16 = tile[tid_x][tid_y + 16]; %(type)s val24 = tile[tid_x][tid_y + 24]; idx_%(src)s += tid_y - tid_x; idx_%(dst)s += tid_x - tid_y; bool b%(dst)s = idx_%(dst)s < dim_%(dst)s; %(type)s* out00 = out + %(dst_offset)s; %(type)s* out08 = out00 + dst_str_%(src)s*8; %(type)s* out16 = out08 + dst_str_%(src)s*8; %(type)s* out24 = out16 + dst_str_%(src)s*8; if (idx_%(src)s + 0 < dim_%(src)s && b%(dst)s) *out00 = val00; if (idx_%(src)s + 8 < dim_%(src)s && b%(dst)s) *out08 = val08; if (idx_%(src)s + 16 < dim_%(src)s && b%(dst)s) *out16 = val16; if (idx_%(src)s + 24 < dim_%(src)s && b%(dst)s) *out24 = val24; } """ else: copy_transpose = r""" %(common)s __global__ void copy_transpose(%(type)s* out, const %(type)s* in, %(params)s) { int tid_x = threadIdx.x; int tid_y = threadIdx.y; int idx_%(blk)s = blockIdx.x; int idx_%(dst)s = blockIdx.y; %(magic)s idx_%(src)s = (idx_%(src)s << 5) + tid_x; idx_%(dst)s = (idx_%(dst)s << 5) + tid_y; bool b%(src)s = idx_%(src)s < dim_%(src)s; bool b%(dst)s_00 = idx_%(dst)s + 0 < dim_%(dst)s && b%(src)s; bool b%(dst)s_08 = idx_%(dst)s + 8 < dim_%(dst)s && b%(src)s; bool b%(dst)s_16 = idx_%(dst)s + 16 < dim_%(dst)s && b%(src)s; bool b%(dst)s_24 = idx_%(dst)s + 24 < dim_%(dst)s && b%(src)s; %(type)s val00 = 0; %(type)s val08 = 0; %(type)s val16 = 0; %(type)s val24 = 0; const %(type)s* in00 = in + %(src_offset)s; const %(type)s* in08 = in00 + src_str_%(dst)s*8; const %(type)s* in16 = in08 + src_str_%(dst)s*8; const %(type)s* in24 = in16 + src_str_%(dst)s*8; if (b%(dst)s_00) val00 = *in00; if (b%(dst)s_08) val08 = *in08; if (b%(dst)s_16) val16 = *in16; if (b%(dst)s_24) val24 = *in24; %(type)s* out00 = out + %(dst_offset)s; %(type)s* out08 = out00 + dst_str_%(dst)s*8; %(type)s* out16 = out08 + dst_str_%(dst)s*8; %(type)s* out24 = out16 + dst_str_%(dst)s*8; if (b%(dst)s_00) *out00 = val00; if (b%(dst)s_08) *out08 = val08; if (b%(dst)s_16) *out16 = val16; if (b%(dst)s_24) *out24 = val24; } """ code = copy_transpose % dict( common=_div64, type=_get_register_type(dtype, memory=True), params=", ".join(params), blk=blkx_name, src=src_dim, dst=dst_dim, magic=magic, src_offset=" + ".join(src_offset), dst_offset=" + ".join(dst_offset) ) # print code module = SourceModule(code) kernel = module.get_function("copy_transpose") kernel.prepare("PP" + ("I" * (len(params) - num_strides)) + "q" * num_strides) grid_x = grid_shape[src_dim] grid_y = grid_shape[dst_dim] for s in src: if s not in (src_dim, dst_dim): grid_x *= grid_shape[s] kernel.grid = (grid_x, grid_y, 1) kernel.block = (32, 8, 1) kernel.args = tuple(values) return kernel