def backward_gpu(self, inputs, gy):
    """Backward pass on GPU via cuBLAS sgemm.

    Args:
        inputs: ``(x, W)`` or ``(x, W, b)`` — the input array, weight
            matrix, and optional bias from the forward pass.
        gy: tuple whose first element is the gradient w.r.t. the output.

    Returns:
        ``(gx, gW)`` or ``(gx, gW, gb)``, matching the arity of ``inputs``.

    NOTE(review): x and gy[0] are indexed as 3-D arrays (shape[0..2]) —
    presumably (batch, seq, features); confirm against the caller.
    """
    x = inputs[0]
    W = inputs[1]

    # --- Gradient w.r.t. the weight ---
    # BUG FIX: the sgemm call below uses beta=1 (accumulate into the output
    # buffer), so the buffer must start zeroed. The original used
    # empty_like(), which left gW uninitialized and added memory garbage
    # to the computed gradient. zeros_like() makes beta=1 start from 0.
    gW = cuda.cupy.zeros_like(W)
    handle = cuda.Device().cublas_handle
    k, n = gy[0].shape[0] * gy[0].shape[1], W.shape[0]
    m = W.shape[1]
    # Leading dimensions; max(1, ...) guards cuBLAS's ld >= 1 requirement
    # when a dimension is zero-sized.
    lda = max(1, x.shape[-1])
    ldb = max(1, gy[0].shape[-1])
    ldc = max(1, m)
    sgemm(handle, False, True, m, n, k,
          1, gy[0].data.ptr, ldb,
          x.data.ptr, lda,
          1, gW.data.ptr, ldc)

    # --- Gradient w.r.t. the input ---
    # Here beta=0 (overwrite), so an uninitialized empty_like buffer is fine.
    m, k = W.shape
    n, l = x.shape[0] * x.shape[1], gy[0].shape[2]
    lda = max(1, gy[0].shape[-1])
    ldb = max(1, W.shape[1])
    ldc = max(1, m)
    gx = cuda.cupy.empty_like(x)
    sgemm(handle, True, False, m, n, k,
          1, W.data.ptr, ldb,
          gy[0].data.ptr, lda,
          0, gx.data.ptr, ldc)

    # --- Gradient w.r.t. the bias: sum gy over all leading positions ---
    if len(inputs) > 2:
        gy_2d = _as_mat(gy[0])
        gb = gy_2d.sum(0)
        return gx, gW, gb
    else:
        return gx, gW
def forward_gpu(self, inputs):
    """Forward pass on GPU: multiply the input by the weight with a single
    cuBLAS sgemm call, then add the bias when one was supplied.

    Args:
        inputs: ``(x, W)`` or ``(x, W, b)``.

    Returns:
        One-element tuple holding the float32 result of shape
        ``(x.shape[0], x.shape[1], W.shape[1])``.
    """
    x, W = inputs[0], inputs[1]
    handle = cuda.Device().cublas_handle

    # GEMM dimensions: W is (k, m); the first two axes of x are folded
    # into a single n dimension for the call.
    k, m = W.shape
    n = x.shape[0] * x.shape[1]
    l = x.shape[2]

    # Leading dimensions; max(1, ...) satisfies cuBLAS's ld >= 1 rule for
    # zero-sized axes. W's ld is derived from its row stride in elements.
    lda = max(1, x.shape[-1])
    ldb = max(1, W.strides[0] // W.dtype.itemsize)
    ldc = max(1, m)

    out = cupy.empty((x.shape[0], x.shape[1], W.shape[1]),
                     dtype=numpy.float32)
    sgemm(handle, False, False, m, n, k,
          1, W.data.ptr, ldb,
          x.data.ptr, lda,
          0, out.data.ptr, ldc)

    # Optional bias: broadcasting add over the leading axes.
    if len(inputs) > 2:
        out += inputs[2]
    return out,