Example #1
    def _init_thread_memory(self, dev_id: int, ctx: cuda.Context,
                            alloc_size: int) -> None:
        '''
        Single thread that initializes the memory for all the streams of a
        single GPU.
        '''

        ctx.push()
        size_per_batch = np.int32(np.ceil(alloc_size / self.num_stream))
        # Initialize streams
        for _ in range(self.num_stream):
            self.streams[dev_id].append(cuda.Stream())

        for i in range(self.num_stream):
            # allocate device memory for this stream's inputs and outputs
            self.moments_device[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 10)))
            self.w_device[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9)))
            self.x_device[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9)))
            self.y_device[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9)))

            # allocate device memory for intermediate results
            self.c_moments[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 7)))
            self.mu[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))
            self.yf[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))

            self.m1[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 5)))
            # initialize m1 asynchronously on this stream: the two
            # float_value_set launches below fill regions of m1 with 0 and 1
            self.float_value_set[dev_id](self.m1[dev_id][i],
                                         np.float32(0),
                                         size_per_batch,
                                         size_per_batch,
                                         block=self.block_size,
                                         grid=self.grid_size,
                                         stream=self.streams[dev_id][i])
            self.float_value_set[dev_id](self.m1[dev_id][i],
                                         np.float32(1),
                                         size_per_batch,
                                         np.int32(0),
                                         block=self.block_size,
                                         grid=self.grid_size,
                                         stream=self.streams[dev_id][i])

            self.x1[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))
            self.w1[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))
            self.x2[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))
            self.w2[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))

        ctx.synchronize()
        ctx.pop()
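
The float_value_set kernel used above is not shown in this example. Below is a minimal sketch of what such an elementwise fill kernel could look like, compiled per device with PyCUDA's SourceModule; the signature (out, value, size, offset) is an assumption inferred from the call sites, not the original implementation.

import pycuda.driver as cuda
from pycuda.compiler import SourceModule

# Hypothetical fill kernel: writes `value` into `size` consecutive floats of
# `out`, starting at `offset`. The argument order mirrors the calls above but
# is an assumption, not the original kernel.
_FILL_SRC = '''
__global__ void float_value_set(float *out, float value, int size, int offset)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        out[idx + offset] = value;
    }
}
'''

def build_float_value_set(ctx):
    # Compile the kernel in the given device context and return the callable.
    ctx.push()
    try:
        module = SourceModule(_FILL_SRC)
        return module.get_function("float_value_set")
    finally:
        ctx.pop()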
Example #2
    def _set_thread_args(self, dev_id: int, ctx: cuda.Context,
                         moment: np.ndarray, w_out: np.ndarray,
                         x_out: np.ndarray, y_out: np.ndarray):
        '''
        Set the input moments for all the streams of a specific GPU.
        '''

        ctx.push()
        # number of input columns assigned to this GPU
        max_size = moment.shape[1]
        # loop through the streams to set their input
        for i in range(self.num_stream):
            # size of input allocated for each stream
            size_per_batch = int(np.ceil(max_size / self.num_stream))
            # column in the original input array where this stream's chunk starts
            loc = np.int32(i * size_per_batch)
            # the last chunk may be smaller than size_per_batch
            if loc + size_per_batch > max_size:
                size_per_batch = max_size - loc

            # copy this stream's slice of the input moments into a contiguous
            # array, then pin it (page-locked, portable across contexts) so it
            # can be used for asynchronous transfers
            self.moment_chunk_host[dev_id].append(
                np.ascontiguousarray(moment[:, loc:loc + size_per_batch],
                                     dtype=np.float32))
            self.moment_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.moment_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)
            # pinned, zero-initialized host buffers that receive this stream's outputs
            self.w_chunk_host[dev_id].append(
                np.ascontiguousarray(
                    np.zeros_like(w_out[:, loc:loc + size_per_batch])))
            self.w_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.w_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)

            self.x_chunk_host[dev_id].append(
                np.ascontiguousarray(
                    np.zeros_like(x_out[:, loc:loc + size_per_batch])))
            self.x_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.x_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)

            self.y_chunk_host[dev_id].append(
                np.ascontiguousarray(
                    np.zeros_like(y_out[:, loc:loc + size_per_batch])))
            self.y_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.y_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)

        ctx.synchronize()
        ctx.pop()
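
The example above only stages the pinned input and output chunks; it does not show the transfers themselves. Below is a minimal sketch of how the per-stream buffers could be driven with asynchronous copies, assuming a hypothetical _run_thread method and an assumed per-device quadrature kernel (self.quadrature_kernel and its argument list are not part of the original code):

    def _run_thread(self, dev_id: int, ctx: cuda.Context) -> None:
        '''
        Hypothetical sketch: overlap transfers and compute across the streams
        of a single GPU, using the pinned chunks and device buffers set up above.
        '''
        ctx.push()
        for i in range(self.num_stream):
            stream = self.streams[dev_id][i]
            # asynchronous host-to-device copy of this stream's input chunk
            cuda.memcpy_htod_async(self.moments_device[dev_id][i],
                                   self.moment_chunk_host[dev_id][i],
                                   stream=stream)
            # launch the assumed quadrature kernel on the same stream
            # self.quadrature_kernel[dev_id](..., block=self.block_size,
            #                                grid=self.grid_size, stream=stream)
            # asynchronous device-to-host copies of this stream's outputs
            cuda.memcpy_dtoh_async(self.w_chunk_host[dev_id][i],
                                   self.w_device[dev_id][i], stream=stream)
            cuda.memcpy_dtoh_async(self.x_chunk_host[dev_id][i],
                                   self.x_device[dev_id][i], stream=stream)
            cuda.memcpy_dtoh_async(self.y_chunk_host[dev_id][i],
                                   self.y_device[dev_id][i], stream=stream)
        ctx.synchronize()
        ctx.pop()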