def __init__(self, lib, dtype, N, C, K, D=1, H=1, W=1, T=1, R=1, S=1, pad_d=0, pad_h=0, pad_w=0, str_d=1, str_h=1, str_w=1, relu=False, bsum=False, deterministic_update=False): super(ConvLayer, self).__init__(lib, dtype, N, np.float32) vec_size = 4 if self.dtype.itemsize == 4 else 8 assert N % 32 == 0, "N dim must be multiple of 32" assert K % vec_size == 0, "K dim must be multiple of %d" % vec_size if self.dtype.type is np.float16: clss = "hconv" elif self.dtype.type is np.float32: clss = "sconv" else: raise TypeError("Type not supported.") # Compute the output spatial dimensions M = lib.output_dim(D, T, pad_d, str_d) P = lib.output_dim(H, R, pad_h, str_h) Q = lib.output_dim(W, S, pad_w, str_w) self.C = C self.K = K self.M = M self.P = P self.Q = Q self.NCK = (N, C, K) self.TRS = (T, R, S) self.DHW = (D, H, W) self.MPQ = (M, P, Q) self.padding = (pad_d, pad_h, pad_w) self.strides = (str_d, str_h, str_w) self.relu = relu self.bsum = bsum self.all_params = (N, C, K, D, H, W, T, R, S, pad_d, pad_h, pad_w, str_d, str_h, str_w) self.dimI = (C, D, H, W, N) self.dimF = (C, T, R, S, K) self.dimFb = (K, T, R, S, C) self.dimO = (K, M, P, Q, N) self.dimI2 = (C*D*H*W, N) self.dimF2 = (C*T*R*S, K) self.dimF2t = (K, C*T*R*S) self.dimO2 = (K*M*P*Q, N) self.dimS = (K, 1) self.sizeI = reduce(mul, self.dimI, 1) self.sizeF = reduce(mul, self.dimF, 1) self.sizeO = reduce(mul, self.dimO, 1) self.nOut = reduce(mul, self.MPQ, 1) * K # precompute some multiplications for fast constant memory access HW = H*W DHW = D*HW WN = W*N HWN = H*WN DHWN = D*HWN RS = R*S RST = T*RS CRST = C*RST CRSTK = K*CRST KRST = K*RST PQ = P*Q PQM = M*PQ QN = Q*N PQN = P*QN MPQN = M*PQN if CRST > 2**16: assert CRST < 2**16, "Integer division is faster with 16bit numerators" # precompute the magic numbers and shift amounts for integer division magic_HW = _magic64(HW) magic_W = _magic64(W) magic_PQ = _magic64(PQ) magic_Q = _magic64(Q) magic_RST = _magic32(CRST, RST) magic_RS = _magic32(RST+32, RS) magic_S = _magic32(RS+32, S) magic_str_w = _magic32(W + S, str_w) magic_str_h = _magic32(H + R, str_h) magic_str_d = _magic32(D + T, str_d) # flop count for benchmarking self.flops = PQM * K * N * CRST * 2.0 tile_N = 128 if N > 64 else 64 grid_N = _grid_dim(tile_N, N) tiles_CK = (128, 64, 32) if tile_N == 128 else (128, 64) ####### FPROP ########### self.fprop_kernels = kernel_specs.xprop_conv_kernels( clss, "fprop", "K", tile_N, grid_N, K, tiles_CK, PQM, RST, _flatten([N, K, D, H, W, WN, HWN, DHWN, C, KRST, RST, RS, magic_RS, S, magic_S, pad_d, pad_h, pad_w, str_d, str_h, str_w, Q, PQ, QN, PQN, MPQN, magic_Q, magic_PQ])) # shared lookup table size self.fprop_lut_size = RST * 4 * 2 ####### BPROP ########### if C < 16 or C % vec_size != 0: # special kernel for deconv into first layer kernel_name = "%s_bprop_C1_N64" % clss grid = (PQM, _grid_dim(32, CRST), _grid_dim(64, N)) block = (32, 1, 1) self.bprop_kernels = [[kernel_name, grid, block, 0, _flatten([ N, K, D, H, W, WN, HWN, DHWN, C, CRST, RST, magic_RST, RS, magic_RS, S, magic_S, pad_d, pad_h, pad_w, str_d, str_h, str_w, Q, PQ, QN, PQN, MPQN, magic_Q, magic_PQ, CRST*8*self.dtype.itemsize, MPQN*8*self.dtype.itemsize])]] # generate the kernel args for transpose CRST,K => K,CRST self.shuffle_args = [CRST, K] gridX = (K >> 5) + (K & 31 != 0) gridY = (CRST >> 5) + (CRST & 31 != 0) self.shuffle_grid = (gridX, gridY, 1) self.shuffle_block = (32, 8, 1) self.bprop_zero = self.sizeI * self.dtype.itemsize self.bprop_lut_size = 0 else: self.bprop_kernels = kernel_specs.xprop_conv_kernels( clss, "bprop", "C", tile_N, grid_N, C, tiles_CK, DHW, RST, _flatten([ N, C, M, P, Q, QN, PQN, MPQN, K, CRST, RST, RS, magic_RS, S, magic_S, pad_d, pad_h, pad_w, str_d, str_h, str_w, W, HW, WN, HWN, DHWN, magic_W, magic_HW, R, T, magic_str_w, magic_str_h, magic_str_d])) # generate the kernel args for dim shuffling CRSTK => KRSTC self.shuffle_args = _flatten([ RST*K, RS*K, S*K, K, RST*C, RS*C, S*C, C, RS, magic_RS, S, magic_S]) gridX = (K >> 5) + (K & 31 != 0) gridY = (C >> 5) + (C & 31 != 0) self.shuffle_grid = (gridX, gridY, RST) self.shuffle_block = (32, 8, 1) self.bprop_zero = 0 self.bprop_lut_size = RST * 4 * 2 # for k in self.fprop_kernels: print k # for k in self.bprop_kernels: print k # exit() ####### UPDATE ########### grid_C = _grid_dim(128, CRST) sm_count = _get_sm_count() # in float32 for big feature_map layers the smaller tile is actually faster # so restrict tile selection to just that. if self.dtype.type is np.float32 and PQ > 56*56: K_tiles = (64,) else: K_tiles = (128, 64) if deterministic_update: determ = "D" if K <= 64: K_tiles = (64,) else: K_tiles = K_tiles[0:1] self.determ = CRSTK else: determ = "" self.determ = 0 self.updat_kernels = [] for tile_K, grid_K, offset_K in kernel_specs.K_partitions(K, K_tiles): kernel_name = "%s_updat%s_C128_K%d" % (clss, determ, tile_K) base_blocks = M*grid_C*grid_K grid_P, grid_Q, threads = kernel_specs.update_grid(kernel_name, base_blocks, P, Q, sm_count) # print grid_P, grid_Q grid_PQ = grid_P * grid_Q magic_PQu = _magic64(grid_PQ) magic_Qu = _magic64(grid_Q) block = (threads, 1, 1) if RST > 1: grid = (M*grid_PQ, grid_C, grid_K) else: grid = (grid_C, grid_K, M*grid_PQ) self.determ *= M*grid_PQ self.determ_shape = (M*grid_PQ, CRSTK) self.updat_kernels.append([kernel_name, grid, block, offset_K, _flatten([ N, K, D, H, W, WN, HWN, DHWN, C, CRST, RST, magic_RST, RS, magic_RS, S, magic_S, pad_d, pad_h, pad_w, str_d, str_h, str_w, P, Q, PQ, QN, PQN, MPQN, magic_Qu, magic_PQu, grid_P, grid_Q, grid_PQ, CRSTK])])
def __init__(self, lib, dtype, N, C, K, D=1, H=1, W=1, T=1, R=1, S=1, pad_d=0, pad_h=0, pad_w=0, str_d=1, str_h=1, str_w=1, relu=False, bsum=False, deterministic_update=False): super(ConvLayer, self).__init__(lib, dtype, N, np.float32) vec_size = 4 if self.dtype.itemsize == 4 else 8 assert N % 32 == 0, "N dim must be multiple of 32" assert K % vec_size == 0, "K dim must be multiple of %d" % vec_size if self.dtype.type is np.float16: clss = "hconv" elif self.dtype.type is np.float32: clss = "sconv" else: raise TypeError("Type not supported.") # Compute the output spatial dimensions M = int(ceil(float(D - T + 1 + 2*pad_d) / str_d)) P = int(ceil(float(H - R + 1 + 2*pad_h) / str_h)) Q = int(ceil(float(W - S + 1 + 2*pad_w) / str_w)) self.C = C self.K = K self.M = M self.P = P self.Q = Q self.NCK = (N, C, K) self.TRS = (T, R, S) self.DHW = (D, H, W) self.MPQ = (M, P, Q) self.padding = (pad_d, pad_h, pad_w) self.strides = (str_d, str_h, str_w) self.relu = relu self.bsum = bsum self.all_params = (N, C, K, D, H, W, T, R, S, pad_d, pad_h, pad_w, str_d, str_h, str_w) self.dimI = (C, D, H, W, N) self.dimF = (C, T, R, S, K) self.dimFb = (K, T, R, S, C) self.dimO = (K, M, P, Q, N) self.dimI2 = (C*D*H*W, N) self.dimF2 = (C*T*R*S, K) self.dimF2t = (K, C*T*R*S) self.dimO2 = (K*M*P*Q, N) self.dimS = (K, 1) self.sizeI = reduce(mul, self.dimI, 1) self.sizeF = reduce(mul, self.dimF, 1) self.sizeO = reduce(mul, self.dimO, 1) self.nOut = reduce(mul, self.MPQ, 1) * K # precompute some multiplications for fast constant memory access HW = H*W DHW = D*HW WN = W*N HWN = H*WN DHWN = D*HWN RS = R*S RST = T*RS CRST = C*RST KRST = K*RST PQ = P*Q PQM = M*PQ QN = Q*N PQN = P*QN MPQN = M*PQN assert CRST < 2**16, "Integer division is faster with 16bit numerators" # precompute the magic numbers and shift amounts for integer division magic_HW = _magic64(HW) magic_W = _magic64(W) magic_PQ = _magic64(PQ) magic_Q = _magic64(Q) magic_RST = _magic32(CRST, RST) magic_RS = _magic32(RST+32, RS) magic_S = _magic32(RS+32, S) magic_str_w = _magic32(W + S, str_w) magic_str_h = _magic32(H + R, str_h) magic_str_d = _magic32(D + T, str_d) # flop count for benchmarking self.flops = PQM * K * N * CRST * 2.0 tile_N = 128 if N > 64 else 64 grid_N = _grid_dim(tile_N, N) tiles_CK = (128, 64, 32) if tile_N == 128 else (128, 64) ####### FPROP ########### self.fprop_kernels = kernel_specs.xprop_conv_kernels( clss, "fprop", "K", tile_N, grid_N, K, tiles_CK, PQM, RST, _flatten([N, K, D, H, W, WN, HWN, DHWN, C, KRST, RST, RS, magic_RS, S, magic_S, pad_d, pad_h, pad_w, str_d, str_h, str_w, Q, PQ, QN, PQN, MPQN, magic_Q, magic_PQ])) # shared lookup table size self.fprop_lut_size = RST * 4 * 2 ####### BPROP ########### if C < 16 or C % vec_size != 0: # special kernel for deconv into first layer kernel_name = "%s_bprop_C1_N64" % clss grid = (PQM, _grid_dim(32, CRST), _grid_dim(64, N)) block = (32, 1, 1) self.bprop_kernels = [[kernel_name, grid, block, 0, _flatten([ N, K, D, H, W, WN, HWN, DHWN, C, CRST, RST, magic_RST, RS, magic_RS, S, magic_S, pad_d, pad_h, pad_w, str_d, str_h, str_w, Q, PQ, QN, PQN, MPQN, magic_Q, magic_PQ, CRST*8*self.dtype.itemsize, MPQN*8*self.dtype.itemsize])]] # generate the kernel args for transpose CRST,K => K,CRST self.shuffle_args = [CRST, K] gridX = (K >> 5) + (K & 31 != 0) gridY = (CRST >> 5) + (CRST & 31 != 0) self.shuffle_grid = (gridX, gridY, 1) self.shuffle_block = (32, 8, 1) self.bprop_zero = self.sizeI * self.dtype.itemsize self.bprop_lut_size = 0 else: self.bprop_kernels = kernel_specs.xprop_conv_kernels( clss, "bprop", "C", tile_N, grid_N, C, tiles_CK, DHW, RST, _flatten([ N, C, M, P, Q, QN, PQN, MPQN, K, CRST, RST, RS, magic_RS, S, magic_S, pad_d, pad_h, pad_w, str_d, str_h, str_w, W, HW, WN, HWN, DHWN, magic_W, magic_HW, R, T, magic_str_w, magic_str_h, magic_str_d])) # generate the kernel args for dim shuffling CRSTK => KRSTC self.shuffle_args = _flatten([ RST*K, RS*K, S*K, K, RST*C, RS*C, S*C, C, RS, magic_RS, S, magic_S]) gridX = (K >> 5) + (K & 31 != 0) gridY = (C >> 5) + (C & 31 != 0) self.shuffle_grid = (gridX, gridY, RST) self.shuffle_block = (32, 8, 1) self.bprop_zero = 0 self.bprop_lut_size = RST * 4 * 2 # for k in self.fprop_kernels: print k # for k in self.bprop_kernels: print k # exit() ####### UPDATE ########### # in float32 for big feature_map layers the smaller tile is actually faster # so restrict tile selection to just that. if self.dtype.type is np.float32 and PQ > 56*56: K_tiles = (64,) else: K_tiles = (128, 64) grid_C = _grid_dim(128, CRST) sm_count = _get_sm_count() self.updat_kernels = [] for tile_K, grid_K, offset_K in kernel_specs.K_partitions(K, K_tiles): kernel_name = "%s_updat_C128_K%d" % (clss, tile_K) base_blocks = M*grid_C*grid_K grid_P, grid_Q, threads = kernel_specs.update_grid(kernel_name, base_blocks, P, Q, sm_count) if deterministic_update: grid_P, grid_Q = 1, 1 # print grid_P, grid_Q grid_PQ = grid_P * grid_Q magic_PQu = _magic64(grid_PQ) magic_Qu = _magic64(grid_Q) block = (threads, 1, 1) if RST > 1: grid = (M*grid_PQ, grid_C, grid_K) else: grid = (grid_C, grid_K, M*grid_PQ) self.updat_kernels.append([kernel_name, grid, block, offset_K, _flatten([ N, K, D, H, W, WN, HWN, DHWN, C, CRST, RST, magic_RST, RS, magic_RS, S, magic_S, pad_d, pad_h, pad_w, str_d, str_h, str_w, P, Q, PQ, QN, PQN, MPQN, magic_Qu, magic_PQu, grid_P, grid_Q, grid_PQ])])