def forward_gpu(self, inputs):
    img, labels = inputs

    # ------------------
    # INPUT VERIFICATION
    # ------------------
    assert img.flags["C_CONTIGUOUS"]
    assert len(labels.shape) >= 4
    assert img.dtype == cp.float32 or img.dtype == cp.int32
    assert labels.flags["C_CONTIGUOUS"]

    # ----------
    # INITIALIZE
    # ----------
    volumeSize = np.prod(img.shape[-3:])
    blockSize = np.min((CUDA_MAX_THREADS, volumeSize))
    nbPixPerThread = int(math.ceil(volumeSize / float(blockSize)))
    K = int(labels.max() + 1)

    # -------------------------------
    # FIGURE OUT MEANING OF EACH AXIS
    # -------------------------------
    dimension = len(img.shape)
    batch_size, n_classes, outputs, counts, expand_axis = self.initialize_arrays(
        img, dimension, K)
    self.counts = counts

    self.code = read_code(
        GPU_KERNEL)  # TODO: should be able to be moved outside this function.

    # ----------------------
    # PERFORM AVERAGE ON GPU
    # ----------------------
    summation = load_kernel('avg_pooling', self.code)
    # print("labels: ", cp.ravel(labels))
    args = (img, labels.astype(cp.int32), outputs, self.counts, volumeSize,
            n_classes, batch_size, nbPixPerThread, K)
    block = (blockSize, )  # block size = size of one volume (one block per class)
    grid = (np.prod(img.shape[:-3]), batch_size)  # one block for each class
    summation(grid, block, args)

    if self.divide:
        if expand_axis is not None:
            outputs /= cp.repeat(
                cp.expand_dims(self.counts, expand_axis), n_classes, expand_axis)
        else:
            outputs /= self.counts
        # TODO: maybe write a kernel for this if CuPy does not parallelize it well;
        # on the other hand, an extra kernel call might cause too much overhead.

    return outputs,
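# The helper below is a minimal, hedged reference sketch (not part of the kernel
# path): the same label-wise average computed in plain CuPy, handy for checking
# 'avg_pooling' on small inputs. It assumes img has shape (n_classes, D, H, W) and
# labels is a single volume broadcastable over it; the real initialize_arrays /
# kernel layout may differ, and the name _avg_pool_reference is illustrative only.
def _avg_pool_reference(img, labels, K, divide=True):
    flat_img = img.reshape(img.shape[0], -1).astype(cp.float32)  # (n_classes, n_voxels)
    flat_lab = labels.reshape(-1).astype(cp.int32)               # (n_voxels,)
    # One-hot membership per label: (K, n_voxels)
    one_hot = flat_lab[None, :] == cp.arange(K, dtype=cp.int32)[:, None]
    sums = flat_img @ one_hot.T.astype(cp.float32)               # per-label sums, (n_classes, K)
    counts = one_hot.sum(axis=1).astype(cp.float32)              # voxels per label, (K,)
    return sums / cp.maximum(counts, 1) if divide else sums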
def forward_gpu(self, inputs):
    img, labels = inputs
    self.max_indices = cp.zeros(img.size, dtype=cp.int32)  # NOTE: overwritten below

    volumeSize = np.prod(img.shape[1:])
    blockSizeX = np.min((64, volumeSize))
    blockSizeY = 1
    blockSizeZ = 1
    nbBlocksX = int(math.ceil(volumeSize / float(blockSizeX)))
    K = int(labels.max() + 1)

    outputs = (-np.inf * cp.ones((img.shape[0], K))).astype(img.dtype)
    self.max_indices = -cp.ones(
        outputs.shape, dtype=cp.int32
    )  # Initialize as -1 so negative values can be ignored in the backward pass.
    # This is a bit wasteful; only saving the indices that matter would be better.
    # TODO: look at this later.

    self.code = read_code(
        GPU_KERNEL
    )  # TODO: should be able to be moved outside this function,
    # but it needs the information in config ...

    kern = load_kernel('max_pooling', self.code)
    args = (img, labels, self.max_indices, volumeSize, img.shape[0], K)
    block = (blockSizeX, blockSizeY,
             blockSizeZ)  # block size = size of one volume (one block per class)
    grid = (nbBlocksX, img.shape[0], K)
    # print("indices before: ", self.max_indices)
    # NOTE: CuPy's shared_mem argument is a byte count; if the kernel buffers one
    # value per thread, this likely needs blockSizeX * img.dtype.itemsize instead.
    kern(grid, block, shared_mem=blockSizeX, args=args)

    fill_vals = load_kernel('fill_values', self.code)
    blockSizeX = 16
    blockSizeY = 16
    nbBlocksX = int(math.ceil(img.shape[0] / float(blockSizeX)))
    nbBlocksY = int(math.ceil(K / float(blockSizeY)))
    block = (blockSizeX, blockSizeY)
    grid = (nbBlocksX, nbBlocksY)
    args = (img, self.max_indices, K, img.shape[0], outputs)
    fill_vals(grid, block, args=args)
    # print("indices after: ", self.max_indices)

    return outputs,
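# Hedged reference sketch for the path above: the same label-wise max in plain CuPy,
# assuming img has shape (batch, *volume), labels covers the volume, and values are
# float32 (so -inf can mark labels that never occur). The helper name and shapes are
# assumptions for illustration, not the kernel's actual API.
def _max_pool_reference(img, labels, K):
    flat_img = img.reshape(img.shape[0], -1)                         # (batch, n_voxels)
    flat_lab = labels.reshape(-1).astype(cp.int32)                   # (n_voxels,)
    outputs = cp.full((img.shape[0], K), -cp.inf, dtype=cp.float32)  # -inf == label absent
    for k in range(int(K)):                                          # K is small, a loop is fine
        mask = flat_lab == k
        if bool(mask.any()):
            outputs[:, k] = flat_img[:, mask].max(axis=1)
    return outputs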
def forward_gpu(self, inputs):
    img, labels = inputs

    # ------------------
    # INPUT VERIFICATION
    # ------------------
    assert img.dtype == cp.float32 or img.dtype == cp.int32
    assert labels.dtype == cp.int32 or labels.dtype == cp.int64
    assert len(labels.shape) >= 4
    labels = labels.astype(cp.int32)

    # ----------
    # INITIALIZE
    # ----------
    volumeSize = np.prod(img.shape[-3:])
    blockSize = np.min((CUDA_MAX_THREADS, volumeSize))
    nbPixPerThread = int(math.ceil(volumeSize / float(blockSize)))
    K = int(labels.max() + 1)

    # -------------------------------
    # FIGURE OUT MEANING OF EACH AXIS
    # -------------------------------
    dimension = len(img.shape)
    batch_size, n_classes, outputs = self.initialize_arrays(img, dimension, K)

    self.max_indices = -cp.ones(
        outputs.shape, dtype=cp.int32
    )  # Initialize as -1 so negative values can be ignored in the backward pass.
    # This is a bit wasteful; only saving the indices that matter would be better.
    # TODO: look at this later.

    self.code = read_code(
        GPU_KERNEL
    )  # TODO: should be able to be moved outside this function,
    # but it needs the information in config ...

    # ----------------------
    # PERFORM ARG MAX ON GPU
    # ----------------------
    kern = load_kernel('max_pooling_v2', self.code)
    args = (img, labels, self.max_indices, volumeSize, n_classes, batch_size,
            nbPixPerThread, K)  # labels is already int32 at this point
    block = (blockSize, )  # block size = size of one volume (one block per class)
    grid = (np.prod(img.shape[:-3]), batch_size)  # one block for each class
    kern(grid, block, args)
    # print("max_indices: ", self.max_indices)
    # print("corresponding labels: ", cp.ravel(labels)[self.max_indices])

    # ----------------------------
    # FILL IN CORRESPONDING VALUES
    # ----------------------------
    fill_vals = load_kernel('fill_values', self.code)
    blockSizeX = 16
    blockSizeY = CUDA_MAX_THREADS // blockSizeX  # integer division; a float block size is invalid
    nbBlocksX = int(math.ceil(n_classes / float(blockSizeX)))
    nbBlocksY = int(math.ceil(K / float(blockSizeY)))
    block = (blockSizeX, blockSizeY)
    grid = (nbBlocksX, nbBlocksY, batch_size)
    args = (img, self.max_indices, K, n_classes, batch_size, outputs)
    fill_vals(grid, block, args=args)

    return outputs,
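# Hedged sketch of the two-phase result the v2 path is expected to produce, in plain
# CuPy: phase 1 finds, per (batch, channel, label), the flat voxel index of the
# maximum; phase 2 gathers the corresponding values (what 'max_pooling_v2' and
# 'fill_values' do together). Shapes are assumptions (img: batch, n_classes, D, H, W;
# labels: one volume per batch item), and the helper name is illustrative only.
def _max_pool_v2_reference(img, labels, K):
    b, c = img.shape[0], img.shape[1]
    flat_img = img.reshape(b, c, -1)                    # (batch, n_classes, n_voxels)
    flat_lab = labels.reshape(b, -1).astype(cp.int32)   # (batch, n_voxels)
    max_indices = -cp.ones((b, c, K), dtype=cp.int32)   # -1 == label absent
    outputs = cp.zeros((b, c, K), dtype=img.dtype)
    for i in range(b):
        for k in range(int(K)):
            vox = cp.where(flat_lab[i] == k)[0]         # voxels carrying label k
            if vox.size:
                best = vox[flat_img[i][:, vox].argmax(axis=1)]       # per-channel argmax
                max_indices[i, :, k] = best.astype(cp.int32)
                outputs[i, :, k] = flat_img[i][cp.arange(c), best]   # gather the max values
    return outputs, max_indices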