def _init_thread_memory(self, dev_id: int, ctx: cuda.Context, alloc_size: int) -> None:
    """Per-GPU setup thread: create one CUDA stream per batch slot and
    allocate every device buffer those streams will use.

    Pushes *ctx* onto this thread's context stack for the duration of the
    setup, synchronizes, and pops it before returning.

    Args:
        dev_id: Index of the GPU (used to key the per-device lists on self).
        ctx: CUDA context of that GPU.
        alloc_size: Total number of input columns to split across streams.
    """
    ctx.push()
    # Each stream handles an equal (ceil-divided) share of the input.
    chunk = np.int32(np.ceil(alloc_size / self.num_stream))

    # One stream per batch slot on this device.
    for _ in range(self.num_stream):
        self.streams[dev_id].append(cuda.Stream())

    for idx in range(self.num_stream):
        # Device buffers, allocated in the same order as before; the second
        # element of each pair is the per-column width (number of float rows).
        for target, width in ((self.moments_device, 10),
                              (self.w_device, 9),
                              (self.x_device, 9),
                              (self.y_device, 9),
                              (self.c_moments, 7),
                              (self.mu, 3),
                              (self.yf, 3),
                              (self.m1, 5)):
            target[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * chunk * width)))

        # Asynchronously seed m1 on this stream. The kernel presumably takes
        # (ptr, value, count, offset) — fill with 0 starting at `chunk`, then
        # with 1 starting at 0; TODO(review): confirm against the kernel source.
        stream = self.streams[dev_id][idx]
        self.float_value_set[dev_id](self.m1[dev_id][idx], np.float32(0),
                                     chunk, chunk,
                                     block=self.block_size,
                                     grid=self.grid_size,
                                     stream=stream)
        self.float_value_set[dev_id](self.m1[dev_id][idx], np.float32(1),
                                     chunk, np.int32(0),
                                     block=self.block_size,
                                     grid=self.grid_size,
                                     stream=stream)

        # Output buffers, 3 rows each.
        for target in (self.x1, self.w1, self.x2, self.w2):
            target[dev_id].append(
                cuda.mem_alloc(int(SIZEOF_FLOAT * chunk * 3)))

    ctx.synchronize()
    ctx.pop()
def _set_thread_args(self, dev_id: int, ctx: cuda.Context, moment: np.ndarray,
                     w_out: np.ndarray, x_out: np.ndarray, y_out: np.ndarray):
    """Per-GPU setup thread: slice the input moments across this device's
    streams and stage page-locked (registered, PORTABLE) host buffers for
    the inputs and the w/x/y outputs.

    Pushes *ctx* for the duration of the staging, synchronizes, and pops it
    before returning.

    Args:
        dev_id: Index of the GPU (used to key the per-device lists on self).
        ctx: CUDA context of that GPU.
        moment: Input moments for this GPU; columns (axis 1) are split
            across streams.
        w_out, x_out, y_out: Output arrays whose column slices determine the
            shape/dtype of the zeroed host output chunks.
    """
    ctx.push()
    # Number of input columns assigned to this GPU.
    total = moment.shape[1]

    for idx in range(self.num_stream):
        # Nominal columns per stream; the final chunk may be shorter.
        span = int(np.ceil(total / self.num_stream))
        start = np.int32(idx * span)
        if start + span > total:
            span = total - start
        stop = start + span

        # Input chunk: contiguous float32 copy, then pinned so async
        # transfers on this device's streams can use it.
        in_chunk = np.ascontiguousarray(moment[:, start:stop],
                                        dtype=np.float32)
        self.moment_chunk_host[dev_id].append(
            cuda.register_host_memory(
                in_chunk, cuda.mem_host_register_flags.PORTABLE))

        # Output chunks: zero-filled contiguous buffers matching the
        # corresponding slice of each output array, also pinned.
        for host_list, src in ((self.w_chunk_host, w_out),
                               (self.x_chunk_host, x_out),
                               (self.y_chunk_host, y_out)):
            out_chunk = np.ascontiguousarray(
                np.zeros_like(src[:, start:stop]))
            host_list[dev_id].append(
                cuda.register_host_memory(
                    out_chunk, cuda.mem_host_register_flags.PORTABLE))

    ctx.synchronize()
    ctx.pop()