Example #1
File: layer_gpu.py  Project: bin2000/neon
    def __init__(self, lib, dtype,
                 N, C, K,
                 D=1, H=1, W=1,
                 T=1, R=1, S=1,
                 pad_d=0, pad_h=0, pad_w=0,
                 str_d=1, str_h=1, str_w=1,
                 relu=False, bsum=False,
                 deterministic_update=False):

        super(ConvLayer, self).__init__(lib, dtype, N, np.float32)

        vec_size = 4 if self.dtype.itemsize == 4 else 8

        assert N % 32 == 0, "N dim must be multiple of 32"
        assert K % vec_size == 0, "K dim must be multiple of %d" % vec_size

        if self.dtype.type is np.float16:
            clss = "hconv"
        elif self.dtype.type is np.float32:
            clss = "sconv"
        else:
            raise TypeError("Type not supported.")

        # Compute the output spatial dimensions
        M = lib.output_dim(D, T, pad_d, str_d)
        P = lib.output_dim(H, R, pad_h, str_h)
        Q = lib.output_dim(W, S, pad_w, str_w)

        self.C = C
        self.K = K
        self.M = M
        self.P = P
        self.Q = Q
        self.NCK = (N, C, K)
        self.TRS = (T, R, S)
        self.DHW = (D, H, W)
        self.MPQ = (M, P, Q)
        self.padding = (pad_d, pad_h, pad_w)
        self.strides = (str_d, str_h, str_w)
        self.relu = relu
        self.bsum = bsum

        self.all_params = (N, C, K, D, H, W, T, R, S, pad_d, pad_h, pad_w, str_d, str_h, str_w)

        self.dimI   = (C, D, H, W, N)
        self.dimF   = (C, T, R, S, K)
        self.dimFb  = (K, T, R, S, C)
        self.dimO   = (K, M, P, Q, N)
        self.dimI2  = (C*D*H*W, N)
        self.dimF2  = (C*T*R*S, K)
        self.dimF2t = (K, C*T*R*S)
        self.dimO2  = (K*M*P*Q, N)
        self.dimS   = (K, 1)
        self.sizeI  = reduce(mul, self.dimI, 1)
        self.sizeF  = reduce(mul, self.dimF, 1)
        self.sizeO  = reduce(mul, self.dimO, 1)
        self.nOut   = reduce(mul, self.MPQ, 1) * K

        # precompute some multiplications for fast constant memory access
        HW    = H*W
        DHW   = D*HW
        WN    = W*N
        HWN   = H*WN
        DHWN  = D*HWN
        RS    = R*S
        RST   = T*RS
        CRST  = C*RST
        CRSTK = K*CRST
        KRST  = K*RST
        PQ    = P*Q
        PQM   = M*PQ
        QN    = Q*N
        PQN   = P*QN
        MPQN  = M*PQN

        assert CRST < 2**16, "Integer division is faster with 16bit numerators"

        # precompute the magic numbers and shift amounts for integer division
        magic_HW    = _magic64(HW)
        magic_W     = _magic64(W)
        magic_PQ    = _magic64(PQ)
        magic_Q     = _magic64(Q)
        magic_RST   = _magic32(CRST, RST)
        magic_RS    = _magic32(RST+32, RS)
        magic_S     = _magic32(RS+32, S)
        magic_str_w = _magic32(W + S, str_w)
        magic_str_h = _magic32(H + R, str_h)
        magic_str_d = _magic32(D + T, str_d)

        # flop count for benchmarking
        self.flops = PQM * K * N * CRST * 2.0

        tile_N   = 128 if N > 64 else 64
        grid_N   = _grid_dim(tile_N, N)
        tiles_CK = (128, 64, 32) if tile_N == 128 else (128, 64)

        ####### FPROP ###########
        self.fprop_kernels = kernel_specs.xprop_conv_kernels(
            clss, "fprop", "K", tile_N, grid_N, K, tiles_CK, PQM, RST,
            _flatten([N, K, D, H, W, WN, HWN, DHWN,
                      C, KRST, RST, RS, magic_RS, S, magic_S,
                      pad_d, pad_h, pad_w, str_d, str_h, str_w,
                      Q, PQ, QN, PQN, MPQN, magic_Q, magic_PQ]))

        # shared lookup table size
        self.fprop_lut_size = RST * 4 * 2

        ####### BPROP ###########
        if C < 16 or C % vec_size != 0:
            # special kernel for deconv into first layer
            kernel_name = "%s_bprop_C1_N64" % clss

            grid  = (PQM, _grid_dim(32, CRST), _grid_dim(64, N))
            block = (32, 1, 1)

            self.bprop_kernels = [[kernel_name, grid, block, 0, _flatten([
                N, K, D, H, W, WN, HWN, DHWN,
                C, CRST, RST, magic_RST, RS, magic_RS, S, magic_S,
                pad_d, pad_h, pad_w, str_d, str_h, str_w,
                Q, PQ, QN, PQN, MPQN, magic_Q, magic_PQ,
                CRST*8*self.dtype.itemsize, MPQN*8*self.dtype.itemsize])]]

            # generate the kernel args for transpose CRST,K => K,CRST
            self.shuffle_args = [CRST, K]
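            # launch one 32x8 thread block per 32x32 tile of the CRST x K matrix:
            # gridX = ceil(K/32), gridY = ceil(CRST/32)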
            gridX   = (K    >> 5) + (K    & 31 != 0)
            gridY   = (CRST >> 5) + (CRST & 31 != 0)
            self.shuffle_grid   = (gridX, gridY, 1)
            self.shuffle_block  = (32, 8, 1)
            self.bprop_zero     = self.sizeI * self.dtype.itemsize
            self.bprop_lut_size = 0

        else:

            self.bprop_kernels = kernel_specs.xprop_conv_kernels(
                clss, "bprop", "C", tile_N, grid_N, C, tiles_CK, DHW, RST, _flatten([
                    N, C, M, P, Q, QN, PQN, MPQN,
                    K, CRST, RST, RS, magic_RS, S, magic_S,
                    pad_d, pad_h, pad_w, str_d, str_h, str_w,
                    W, HW, WN, HWN, DHWN, magic_W, magic_HW,
                    R, T, magic_str_w, magic_str_h, magic_str_d]))

            # generate the kernel args for dim shuffling CRSTK => KRSTC
            self.shuffle_args = _flatten([
                RST*K, RS*K, S*K, K,
                RST*C, RS*C, S*C, C,
                RS, magic_RS, S, magic_S])
            gridX = (K >> 5) + (K & 31 != 0)
            gridY = (C >> 5) + (C & 31 != 0)
            self.shuffle_grid   = (gridX, gridY, RST)
            self.shuffle_block  = (32, 8, 1)
            self.bprop_zero     = 0
            self.bprop_lut_size = RST * 4 * 2

        # for k in self.fprop_kernels: print k
        # for k in self.bprop_kernels: print k
        # exit()

        ####### UPDATE ###########

        grid_C   = _grid_dim(128, CRST)
        sm_count = _get_sm_count()

        # in float32 for big feature_map layers the smaller tile is actually faster
        # so restrict tile selection to just that.
        if self.dtype.type is np.float32 and PQ > 56*56:
            K_tiles = (64,)
        else:
            K_tiles = (128, 64)

        if deterministic_update:
            determ = "D"
            if K <= 64:
                K_tiles = (64,)
            else:
                K_tiles = K_tiles[0:1]
            self.determ = CRSTK
        else:
            determ = ""
            self.determ = 0

        self.updat_kernels = []
        for tile_K, grid_K, offset_K in kernel_specs.K_partitions(K, K_tiles):

            kernel_name = "%s_updat%s_C128_K%d" % (clss, determ, tile_K)
            base_blocks = M*grid_C*grid_K

            grid_P, grid_Q, threads = kernel_specs.update_grid(kernel_name, base_blocks, P, Q, sm_count)
            # print grid_P, grid_Q

            grid_PQ   = grid_P * grid_Q
            magic_PQu = _magic64(grid_PQ)
            magic_Qu  = _magic64(grid_Q)

            block = (threads, 1, 1)
            if RST > 1:
                grid = (M*grid_PQ, grid_C, grid_K)
            else:
                grid = (grid_C, grid_K, M*grid_PQ)

            self.determ *= M*grid_PQ
            self.determ_shape = (M*grid_PQ, CRSTK)

            self.updat_kernels.append([kernel_name, grid, block, offset_K, _flatten([
                N, K, D, H, W, WN, HWN, DHWN,
                C, CRST, RST, magic_RST, RS, magic_RS, S, magic_S,
                pad_d, pad_h, pad_w, str_d, str_h, str_w,
                P, Q, PQ, QN, PQN, MPQN, magic_Qu, magic_PQu,
                grid_P, grid_Q, grid_PQ, CRSTK])])
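The _magic32 and _magic64 helpers called above are defined elsewhere in layer_gpu.py and precompute a multiplier/shift pair so the CUDA kernels can replace integer division by a fixed divisor with a multiply and a shift (_magic64 presumably covers divisors whose numerators need a 64-bit multiply). Their bodies are not shown in this listing; the sketch below is a minimal stand-alone version of the standard Granlund-Montgomery unsigned algorithm with the same (max_numerator, divisor) signature as the calls above. It is an assumption for illustration, not the project's actual implementation.

def _magic32(nmax, d):
    # Find (magic, shift) such that (n * magic) >> shift == n // d
    # for every 0 <= n <= nmax (Granlund & Montgomery unsigned algorithm).
    nc = ((nmax + 1) // d) * d - 1          # largest n <= nmax with n % d == d - 1
    nbits = nmax.bit_length()
    for shift in range(0, 2 * nbits + 1):
        if 2 ** shift > nc * (d - 1 - (2 ** shift - 1) % d):
            magic = (2 ** shift + d - 1 - (2 ** shift - 1) % d) // d
            return magic, shift
    raise ValueError("can't find a magic number for division by %d" % d)

# Quick self-check against plain integer division (RST = 27 is just an example):
magic, shift = _magic32(2 ** 16, 27)
for n in (0, 1, 26, 27, 28, 65535, 65536):
    assert (n * magic) >> shift == n // 27

The assertion on CRST above exists for the same reason the helpers do: per its message, the kernels expect these numerators to fit in 16 bits so that the division trick stays cheap.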
Example #2
    def __init__(self, lib, dtype,
                 N, C, K,
                 D=1, H=1, W=1,
                 T=1, R=1, S=1,
                 pad_d=0, pad_h=0, pad_w=0,
                 str_d=1, str_h=1, str_w=1,
                 relu=False, bsum=False,
                 deterministic_update=False):

        super(ConvLayer, self).__init__(lib, dtype, N, np.float32)

        vec_size = 4 if self.dtype.itemsize == 4 else 8

        assert N % 32 == 0, "N dim must be multiple of 32"
        assert K % vec_size == 0, "K dim must be multiple of %d" % vec_size

        if self.dtype.type is np.float16:
            clss = "hconv"
        elif self.dtype.type is np.float32:
            clss = "sconv"
        else:
            raise TypeError("Type not supported.")

        # Compute the output spatial dimensions
        M = int(ceil(float(D - T + 1 + 2*pad_d) / str_d))
        P = int(ceil(float(H - R + 1 + 2*pad_h) / str_h))
        Q = int(ceil(float(W - S + 1 + 2*pad_w) / str_w))

        self.C = C
        self.K = K
        self.M = M
        self.P = P
        self.Q = Q
        self.NCK = (N, C, K)
        self.TRS = (T, R, S)
        self.DHW = (D, H, W)
        self.MPQ = (M, P, Q)
        self.padding = (pad_d, pad_h, pad_w)
        self.strides = (str_d, str_h, str_w)
        self.relu = relu
        self.bsum = bsum

        self.all_params = (N, C, K, D, H, W, T, R, S, pad_d, pad_h, pad_w, str_d, str_h, str_w)

        self.dimI   = (C, D, H, W, N)
        self.dimF   = (C, T, R, S, K)
        self.dimFb  = (K, T, R, S, C)
        self.dimO   = (K, M, P, Q, N)
        self.dimI2  = (C*D*H*W, N)
        self.dimF2  = (C*T*R*S, K)
        self.dimF2t = (K, C*T*R*S)
        self.dimO2  = (K*M*P*Q, N)
        self.dimS   = (K, 1)
        self.sizeI  = reduce(mul, self.dimI, 1)
        self.sizeF  = reduce(mul, self.dimF, 1)
        self.sizeO  = reduce(mul, self.dimO, 1)
        self.nOut   = reduce(mul, self.MPQ, 1) * K

        # precompute some multiplications for fast constant memory access
        HW   = H*W
        DHW  = D*HW
        WN   = W*N
        HWN  = H*WN
        DHWN = D*HWN
        RS   = R*S
        RST  = T*RS
        CRST = C*RST
        KRST = K*RST
        PQ   = P*Q
        PQM  = M*PQ
        QN   = Q*N
        PQN  = P*QN
        MPQN = M*PQN

        assert CRST  < 2**16, "Integer division is faster with 16bit numerators"

        # precompute the magic numbers and shift amounts for integer division
        magic_HW    = _magic64(HW)
        magic_W     = _magic64(W)
        magic_PQ    = _magic64(PQ)
        magic_Q     = _magic64(Q)
        magic_RST   = _magic32(CRST, RST)
        magic_RS    = _magic32(RST+32, RS)
        magic_S     = _magic32(RS+32, S)
        magic_str_w = _magic32(W + S, str_w)
        magic_str_h = _magic32(H + R, str_h)
        magic_str_d = _magic32(D + T, str_d)

        # flop count for benchmarking
        self.flops = PQM * K * N * CRST * 2.0

        tile_N   = 128 if N > 64 else 64
        grid_N   = _grid_dim(tile_N, N)
        tiles_CK = (128, 64, 32) if tile_N == 128 else (128, 64)

        ####### FPROP ###########
        self.fprop_kernels = kernel_specs.xprop_conv_kernels(
            clss, "fprop", "K", tile_N, grid_N, K, tiles_CK, PQM, RST,
            _flatten([N, K, D, H, W, WN, HWN, DHWN,
                      C, KRST, RST, RS, magic_RS, S, magic_S,
                      pad_d, pad_h, pad_w, str_d, str_h, str_w,
                      Q, PQ, QN, PQN, MPQN, magic_Q, magic_PQ]))

        # shared lookup table size
        self.fprop_lut_size = RST * 4 * 2

        ####### BPROP ###########
        if C < 16 or C % vec_size != 0:
            # special kernel for deconv into first layer
            kernel_name = "%s_bprop_C1_N64" % clss

            grid  = (PQM, _grid_dim(32, CRST), _grid_dim(64, N))
            block = (32, 1, 1)

            self.bprop_kernels = [[kernel_name, grid, block, 0, _flatten([
                N, K, D, H, W, WN, HWN, DHWN,
                C, CRST, RST, magic_RST, RS, magic_RS, S, magic_S,
                pad_d, pad_h, pad_w, str_d, str_h, str_w,
                Q, PQ, QN, PQN, MPQN, magic_Q, magic_PQ,
                CRST*8*self.dtype.itemsize, MPQN*8*self.dtype.itemsize])]]

            # generate the kernel args for transpose CRST,K => K,CRST
            self.shuffle_args = [CRST, K]
            gridX   = (K    >> 5) + (K    & 31 != 0)
            gridY   = (CRST >> 5) + (CRST & 31 != 0)
            self.shuffle_grid   = (gridX, gridY, 1)
            self.shuffle_block  = (32, 8, 1)
            self.bprop_zero     = self.sizeI * self.dtype.itemsize
            self.bprop_lut_size = 0

        else:

            self.bprop_kernels = kernel_specs.xprop_conv_kernels(
                clss, "bprop", "C", tile_N, grid_N, C, tiles_CK, DHW, RST, _flatten([
                    N, C, M, P, Q, QN, PQN, MPQN,
                    K, CRST, RST, RS, magic_RS, S, magic_S,
                    pad_d, pad_h, pad_w, str_d, str_h, str_w,
                    W, HW, WN, HWN, DHWN, magic_W, magic_HW,
                    R, T, magic_str_w, magic_str_h, magic_str_d]))

            # generate the kernel args for dim shuffling CRSTK => KRSTC
            self.shuffle_args = _flatten([
                RST*K, RS*K, S*K, K,
                RST*C, RS*C, S*C, C,
                RS, magic_RS, S, magic_S])
            gridX = (K >> 5) + (K & 31 != 0)
            gridY = (C >> 5) + (C & 31 != 0)
            self.shuffle_grid   = (gridX, gridY, RST)
            self.shuffle_block  = (32, 8, 1)
            self.bprop_zero     = 0
            self.bprop_lut_size = RST * 4 * 2

        # for k in self.fprop_kernels: print k
        # for k in self.bprop_kernels: print k
        # exit()

        ####### UPDATE ###########
        # in float32 for big feature_map layers the smaller tile is actually faster
        # so restrict tile selection to just that.
        if self.dtype.type is np.float32 and PQ > 56*56:
            K_tiles = (64,)
        else:
            K_tiles = (128, 64)

        grid_C   = _grid_dim(128, CRST)
        sm_count = _get_sm_count()

        self.updat_kernels = []
        for tile_K, grid_K, offset_K in kernel_specs.K_partitions(K, K_tiles):

            kernel_name = "%s_updat_C128_K%d" % (clss, tile_K)
            base_blocks = M*grid_C*grid_K

            grid_P, grid_Q, threads = kernel_specs.update_grid(kernel_name, base_blocks, P, Q,
                                                               sm_count)

            if deterministic_update:
                grid_P, grid_Q = 1, 1

            # print grid_P, grid_Q

            grid_PQ   = grid_P * grid_Q
            magic_PQu = _magic64(grid_PQ)
            magic_Qu  = _magic64(grid_Q)

            block = (threads, 1, 1)
            if RST > 1:
                grid = (M*grid_PQ, grid_C, grid_K)
            else:
                grid = (grid_C, grid_K, M*grid_PQ)

            self.updat_kernels.append([kernel_name, grid, block, offset_K, _flatten([
                N, K, D, H, W, WN, HWN, DHWN,
                C, CRST, RST, magic_RST, RS, magic_RS, S, magic_S,
                pad_d, pad_h, pad_w, str_d, str_h, str_w,
                P, Q, PQ, QN, PQN, MPQN, magic_Qu, magic_PQu,
                grid_P, grid_Q, grid_PQ])])
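Example #2 computes the output spatial dimensions inline as output = ceil((input - filter + 1 + 2*pad) / stride); Example #1 obtains the same values from lib.output_dim, whose body is not shown here. A quick worked check of the formula, using hypothetical first-layer sizes that are illustrative only and not taken from the listing:

from math import ceil

# Illustrative check of the output-dimension formula from Example #2.
# The concrete sizes (224x224 input, 11x11 filters, pad 3, stride 4) are
# hypothetical and not taken from the listing.
D, H, W = 1, 224, 224          # input depth/height/width (D=1 for a 2D conv)
T, R, S = 1, 11, 11            # filter depth/height/width
pad_d, pad_h, pad_w = 0, 3, 3
str_d, str_h, str_w = 1, 4, 4

M = int(ceil(float(D - T + 1 + 2*pad_d) / str_d))
P = int(ceil(float(H - R + 1 + 2*pad_h) / str_h))
Q = int(ceil(float(W - S + 1 + 2*pad_w) / str_w))

assert (M, P, Q) == (1, 55, 55)

From these, self.flops = PQM * K * N * CRST * 2.0 counts one multiply and one add per accumulated term (M*P*Q output positions, K output feature maps, N batch items, C*T*R*S terms each), which is the benchmarking figure both examples store.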