def forward_gpu(self, x):
    i_len, j_len = array.as_mat(x[0]).shape
    k_len = array.as_mat(x[1]).shape[1]
    l_len = self.W.shape[2]

    # When indices are enclosed in [], they are 'flattened'
    # (i.e. linearized into a 1-D array).
    # ij->[ij]
    e1 = array.as_vec(x[0])
    # ik->[ik]
    e2 = array.as_vec(x[1])
    e1e2 = cuda.empty(i_len * j_len * k_len, dtype=numpy.float32)
    # '[ij],[ik]->[ijk]'
    cuda.elementwise(
        'float* y, float* e1, float* e2, int e1c, int e2c',
        '''
        int I = i / e1c / e2c;
        int J = (i - I * e1c * e2c) / e2c;
        int K = i % e2c;
        y[i] = e1[I * e1c + J] * e2[I * e2c + K];
        ''',
        'row_wise_outer_product')(e1e2, e1, e2, j_len, k_len)
    # [ijk]->i[jk]
    e1e2 = e1e2.reshape(i_len, j_len * k_len)
    # jkl->[jk]l
    W_mat = self.W.reshape(
        self.W.shape[0] * self.W.shape[1], self.W.shape[2])
    y = cuda.empty((i_len, l_len), dtype=numpy.float32)
    with cuda.using_cumisc():
        # 'i[jk],[jk]l->il'
        cuda.culinalg.dot(e1e2, W_mat, out=y)

    if not self.nobias:
        e1 = array.as_mat(x[0])
        e2 = array.as_mat(x[1])
        with cuda.using_cumisc():
            # ij,jl->il
            cuda.culinalg.add_dot(e1, self.V1, y)
            # ik,kl->il
            cuda.culinalg.add_dot(e2, self.V2, y)
        cuda.elementwise(
            'float* y, float* b, int n_channel',
            'y[i] += b[i % n_channel]',
            'linear_bias')(y, self.b, self.b.size)
    return y,
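# The following is a minimal NumPy sketch (added for illustration, not part of
# the original source) of the computation forward_gpu performs:
# y_il = sum_jk e1_ij e2_ik W_jkl, plus the optional linear and bias terms.
# The function name and argument shapes are hypothetical.
import numpy

def bilinear_forward_reference(e1, e2, W, V1=None, V2=None, b=None):
    # e1: (i, j), e2: (i, k), W: (j, k, l) -> y: (i, l)
    y = numpy.einsum('ij,ik,jkl->il', e1, e2, W)
    if V1 is not None:
        # V1: (j, l), V2: (k, l), b: (l,)
        y += e1.dot(V1) + e2.dot(V2) + b
    return y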
def backward_gpu(self, x, gy):
    glhs = None
    if self.lhs_bwd:
        with cuda.using_cumisc():
            s = x[0].shape
            glhs = cuda.cumisc.sum(gy[0], 1).reshape(s) / s[1]
    return glhs, -gy[0]
def backward_gpu(self, x, gy):
    _x = _as_mat(x[0])
    gx = cuda.empty_like(_x)
    with cuda.using_cumisc():
        cuda.culinalg.add_dot(gy[0], _x, self.gW, transa='T')
        if self.gb is not None:
            self.gb += cuda.cumisc.sum(gy[0], 0)
        cuda.culinalg.dot(gy[0], self.W, out=gx)
    return gx.reshape(x[0].shape),
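# A hedged NumPy sketch (added for illustration) of the gradients accumulated
# above for a linear layer y = x W^T + b; the reference name is hypothetical.
import numpy

def linear_backward_reference(x, gy, W):
    gW = gy.T.dot(x)     # add_dot with transa='T': (out, in)
    gb = gy.sum(axis=0)  # cumisc.sum over rows: (out,)
    gx = gy.dot(W)       # (batch, in)
    return gx, gW, gb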
def forward_gpu(self, x):
    x = _as_mat(x[0])
    y = cuda.empty((x.shape[0], self.W.shape[0]), dtype=x.dtype)
    with cuda.using_cumisc():
        cuda.culinalg.dot(x, self.W, transb='T', out=y)
    if self.b is not None:
        cuda.elementwise(
            'float* y, float* b, int n_channel',
            'y[i] += b[i % n_channel]',
            'linear_bias')(y, self.b, self.b.size)
    return y,
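# NumPy equivalent of the forward pass above (sketch only), assuming W is
# stored as (out_features, in_features), which is why transb='T' is passed.
import numpy

x = numpy.random.randn(2, 3).astype(numpy.float32)
W = numpy.random.randn(4, 3).astype(numpy.float32)
b = numpy.zeros(4, dtype=numpy.float32)
y = x.dot(W.T) + b  # dot(x, W, transb='T') followed by the 'linear_bias' kernel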
def _cumean_axis02(x):
    with cuda.using_cumisc():
        if x.shape[2] > 1:
            # cumisc.mean does not support more than two dimensions
            shape = x.shape
            x = x.reshape(shape[0] * shape[1], shape[2])
            x = cuda.cumisc.mean(x, axis=1)
            x = x.reshape(shape[0], shape[1])
        else:
            x = x.reshape(x.shape[:2])
        return cuda.cumisc.mean(x, axis=0)
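# CPU check (illustrative) of what _cumean_axis02 computes: for a 3-D array,
# the two-step reduction is just the mean over axes 0 and 2.
import numpy

x = numpy.random.randn(4, 3, 5).astype(numpy.float32)
assert numpy.allclose(x.mean(axis=2).mean(axis=0), x.mean(axis=(0, 2)))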
def _matmul_gpu(a, b, transa=False, transb=False, transout=False, out=None):
    if transout:
        # (A B)^T = B^T A^T
        a, b, transa, transb = b, a, not transb, not transa
    a = _as_mat(a)
    b = _as_mat(b)
    with cuda.using_cumisc():
        return cuda.culinalg.dot(
            a, b, transa=_as_trans_op(transa), transb=_as_trans_op(transb),
            out=out)
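# CPU check (illustrative) of the identity the transout branch relies on: the
# transposed product is obtained without materializing any transpose by
# swapping the operands and flipping both transpose flags.
import numpy

A = numpy.random.randn(3, 4).astype(numpy.float32)
B = numpy.random.randn(4, 5).astype(numpy.float32)
assert numpy.allclose(A.dot(B).T, B.T.dot(A.T))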
def backward_gpu(self, x, gy):
    out_c, out_h, out_w = gy[0].shape[1:]
    n, c, h, w = x[0].shape

    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        x_desc = cudnn.get_tensor_desc(x[0], h, w)
        gy_desc = cudnn.get_tensor_desc(gy[0], out_h, out_w)
        if self.b is not None:
            libcudnn.cudnnConvolutionBackwardBias(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                1, self.bias_desc.value, cudnn.get_ptr(self.gb))

        libcudnn.cudnnConvolutionBackwardFilter(
            handle, 1, x_desc.value, cudnn.get_ptr(x[0]),
            gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
            1, self.filter_desc.value, cudnn.get_ptr(self.gW))

        gx = cuda.empty_like(x[0])
        libcudnn.cudnnConvolutionBackwardData(
            handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
            gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
            0, x_desc.value, cudnn.get_ptr(gx))
    else:
        handle = cuda.get_cublas_handle()
        if self.gb is not None:
            # TODO(beam2d): Unify kernels
            with cuda.using_cumisc(handle):
                tmp = cuda.cumisc.sum(
                    gy[0].reshape(n * out_c, out_h * out_w), axis=1)
                tmp = cuda.cumisc.sum(tmp.reshape(n, out_c), axis=0)
                self.gb += tmp

        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(n, c * self.kh * self.kw, out_h * out_w)
        gy_mats = gy[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.add_dot(
                gy_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)

        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        gcol = cuda.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, gy_mats[i], transa='T',
                              handle=handle, out=gcol_mats[i])

        gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w)

    return gx,
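# Hedged per-sample NumPy sketch (illustrative) of the non-cuDNN branch above:
# with col the im2col matrix of one sample, the filter gradient and the column
# gradient are plain GEMMs; the name and shapes here are assumptions.
import numpy

def conv_backward_reference(gy_mat, col, W_mat):
    # gy_mat: (out_c, out_h*out_w), col: (c*kh*kw, out_h*out_w),
    # W_mat: (out_c, c*kh*kw)
    gW_mat = gy_mat.dot(col.T)  # add_dot(..., transb='T')
    gcol = W_mat.T.dot(gy_mat)  # dot(..., transa='T'); col2im then yields gx
    return gW_mat, gcol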
def backward_gpu(self, inputs, grad_outputs):
    ngauss = self.y.shape[1]
    gradients = tuple(cuda.empty_like(i) for i in inputs[:-2])  # w/o (x1, x2)
    args = gradients + inputs
    ysum = None
    with cuda.using_cumisc():
        ysum = cuda.cumisc.sum(self.y, 1).reshape((self.y.shape[0], 1))
    cuda.elementwise(
        '''
        const float* r, const float* rsum, const int ngauss,
        float* gw, float* gm1, float* gm2, float* gs1, float* gs2, float* gc,
        const float* w, const float* m1, const float* m2,
        const float* s1, const float* s2, const float* c,
        const float* x1, const float* x2
        ''',
        '''
        const int j = i / ngauss;
        const float z1 = (x1[j] - m1[i]) / s1[i];
        const float z2 = (x2[j] - m2[i]) / s2[i];
        const float z3 = 1.0f / (1.0f - pow(c[i], 2.0f));
        const float z4 = pow(z1 - c[i] * z2, 2.0f);
        const float z5 = - r[i] / rsum[j];
        gw[i] = w[i] + z5;
        gm1[i] = z3 / s1[i] * (z1 - c[i] * z2);
        gm2[i] = z3 / s2[i] * (z2 - c[i] * z1);
        gs1[i] = (x1[j] - m1[i]) * gm1[i] - 1.0f;
        gs2[i] = (x2[j] - m2[i]) * gm2[i] - 1.0f;
        gc[i] = z1 * z2 + c[i] * (1.0f - z3 * z4);
        gm1[i] *= z5;
        gm2[i] *= z5;
        gs1[i] *= z5;
        gs2[i] *= z5;
        gc[i] *= z5;
        ''',
        'gaussian_mixture_2d_bwd')(self.y, ysum, ngauss, *args)
    # tuple(- self.y * g for g in gradients[1:])
    return gradients + (None, None)  # for target signals
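# Hedged NumPy transcription (added for illustration) of the kernel above,
# with r the responsibilities self.y and rsum its row sums; the function name
# and the broadcasting shapes are assumptions.
import numpy

def gmm2d_backward_reference(r, w, m1, m2, s1, s2, c, x1, x2):
    # r, w, m1, m2, s1, s2, c: (n, ngauss); x1, x2: (n, 1) target signals
    rsum = r.sum(axis=1, keepdims=True)
    z1 = (x1 - m1) / s1
    z2 = (x2 - m2) / s2
    z3 = 1.0 / (1.0 - c ** 2)
    z4 = (z1 - c * z2) ** 2
    z5 = -r / rsum
    gw = w + z5
    gm1 = z3 / s1 * (z1 - c * z2)
    gm2 = z3 / s2 * (z2 - c * z1)
    gs1 = z5 * ((x1 - m1) * gm1 - 1.0)  # uses gm1 before it is scaled by z5
    gs2 = z5 * ((x2 - m2) * gm2 - 1.0)
    gc = z5 * (z1 * z2 + c * (1.0 - z3 * z4))
    gm1 *= z5
    gm2 *= z5
    return gw, gm1, gm2, gs1, gs2, gc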
def backward_gpu(self, x, gy):
    ldim = x[0].shape[0]
    cdim = self.W.size
    rdim = x[0].size // (ldim * cdim)

    masked = cuda.empty_like(x[0])
    cuda.elementwise(
        'float* masked, const float* x, const float* gy',
        'masked[i] = x[i] >= 0 ? 0 : x[i] * gy[i]',
        'prelu_masked')(masked, x[0], gy[0])
    with cuda.using_cumisc():
        rsum = cuda.cumisc.sum(masked.reshape(ldim * cdim, rdim), axis=1)
        gW = cuda.cumisc.sum(rsum.reshape(ldim, cdim), axis=0)
        self.gW += gW.reshape(self.gW.shape)
        del rsum, gW

    gx = masked  # reuse buffer
    _fwd_kern()(gx, gy[0], x[0], self.W, cdim, rdim)
    return gx,
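# Hedged NumPy sketch (illustrative) of the PReLU gradients above, assuming x
# has shape (ldim, cdim, rdim) and one learned slope per channel, W: (cdim,).
import numpy

def prelu_backward_reference(x, gy, W):
    neg = x < 0
    gW = numpy.where(neg, x * gy, 0).sum(axis=(0, 2))  # slope gradient
    gx = numpy.where(neg, gy * W[None, :, None], gy)   # input gradient
    return gx, gW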
def _cusum_axis02(x, y=None, expr1='x[I]', expr2='x[I] * x[I]', mean=False):
    with cuda.using_cumisc():
        shape = x.shape
        ret1 = cuda.empty_like(x[0])
        ret2 = cuda.empty_like(x[0])
        if y is None:
            y = x
        alpha = 1.0
        if mean:
            alpha = 1.0 / (shape[0] * shape[2])

        # In most cases shape[0] is constant, so the kernel is compiled only
        # once. If shape[0] is small, the compiler will perform loop unrolling.
        _create_reduction_kernel(shape[0], expr1, expr2)(
            ret1, ret2, x, y, alpha, shape[1] * shape[2])

        if shape[2] != 1:
            ret1 = _partial_reduce(ret1)
            ret2 = _partial_reduce(ret2)
        ret_shape = (1, shape[1], 1)
        return (ret1.reshape(ret_shape), ret2.reshape(ret_shape))
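# NumPy sketch (illustrative; the reduction kernel itself is not shown here)
# of what _cusum_axis02 returns with the default expressions and x of shape
# (N, C, S): per-channel reductions of x and x*x over axes 0 and 2, scaled to
# means when mean=True.
import numpy

def _sum_axis02_reference(x, mean=False):
    op = numpy.mean if mean else numpy.sum
    ret1 = op(x, axis=(0, 2)).reshape(1, -1, 1)
    ret2 = op(x * x, axis=(0, 2)).reshape(1, -1, 1)
    return ret1, ret2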
def backward_gpu(self, x, gy):
    n, out_c, out_h, out_w = x[0].shape
    c, h, w = gy[0].shape[1:]
    gx = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)

    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        gy_desc = cudnn.get_tensor_desc(gy[0], h, w)
        gx_desc = cudnn.get_tensor_desc(gx, out_h, out_w)

        algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, _fwd_pref,
            self.max_workspace_size)
        workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
            handle, gy_desc.value, self.filter_desc.value,
            self.conv_desc.value, gx_desc.value, algo).value
        workspace = cuda.empty(
            (max(workspace_size // 4, 1),), dtype=numpy.float32)

        libcudnn.cudnnConvolutionForward(
            handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
            self.filter_desc.value, cudnn.get_ptr(self.W),
            self.conv_desc.value, algo, cudnn.get_ptr(workspace),
            workspace_size, 0, gx_desc.value, cudnn.get_ptr(gx))

        # bias backward
        if self.b is not None:
            libcudnn.cudnnConvolutionBackwardBias(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                1, self.bias_desc.value, cudnn.get_ptr(self.gb))

        # filter backward
        libcudnn.cudnnConvolutionBackwardFilter(
            handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
            gx_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
            1, self.filter_desc.value, cudnn.get_ptr(self.gW))
    else:
        # Implementation using im2col
        col = conv.im2col_gpu(
            gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

        # TODO(beam2d): Use streams
        handle = cuda.get_cublas_handle()
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        col_mats = col.reshape(n, c * self.kh * self.kw, out_h * out_w)
        gx_mats = gx.reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, col_mats[i], handle=handle,
                              out=gx_mats[i])

        # bias backward
        if self.gb is not None:
            # TODO(beam2d): Unify kernels
            with cuda.using_cumisc(handle):
                tmp = cuda.cumisc.sum(gy[0].reshape(n * c, h * w), axis=1)
                tmp = cuda.cumisc.sum(tmp.reshape(n, c), axis=0)
                self.gb += tmp

        # filter backward
        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        x_mats = x[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.add_dot(
                x_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)

    return gx,
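# Per-sample NumPy sketch (illustrative) of the im2col branch above. The data
# gradient of a deconvolution is an ordinary forward convolution applied to
# gy, which is why gy[0] is passed through im2col_gpu and multiplied by W.
# The name and shapes below are assumptions.
import numpy

def deconv_backward_reference(x_mat, gy_col, W_mat):
    # x_mat: (out_c, out_h*out_w), gy_col: (c*kh*kw, out_h*out_w) = im2col(gy),
    # W_mat: (out_c, c*kh*kw)
    gx_mat = W_mat.dot(gy_col)    # data gradient = 'conv forward' on gy
    gW_mat = x_mat.dot(gy_col.T)  # filter gradient (add_dot with transb='T')
    return gx_mat, gW_mat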
def forward_gpu(self, inputs):
    with cuda.using_cumisc():
        return cuda.cumisc.sum(inputs[0], 1).reshape(
            (inputs[0].shape[0], 1)),
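# NumPy equivalent (illustrative) of the row-sum forward pass above:
import numpy

x = numpy.arange(6, dtype=numpy.float32).reshape(2, 3)
assert numpy.array_equal(x.sum(axis=1, keepdims=True),
                         x.sum(1).reshape((x.shape[0], 1)))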
def backward_gpu(self, x, gy):
    i_len, j_len = array.as_mat(x[0]).shape
    k_len = array.as_mat(x[1]).shape[1]
    l_len = gy[0].shape[1]

    # ij->[ij]
    e1 = array.as_vec(x[0])
    # ik->[ik]
    e2 = array.as_vec(x[1])
    gy, = gy
    # il->[il]
    gy_vec = array.as_vec(gy)
    # jkl->[jkl]
    W_vec = array.as_vec(self.W)

    dgW = cuda.empty((j_len * k_len * l_len,), dtype=numpy.float32)
    # '[ij],[ik],[il]->[jkl]'
    cuda.elementwise(
        '''
        float* y, float* e1, float* e2, float* gy,
        int r, int e1c, int e2c, int gyc
        ''',
        '''
        int J = i / e2c / gyc;
        int K = (i - J * e2c * gyc) / gyc;
        int L = i % gyc;
        float yval = 0;
        for (int I = 0; I < r; ++I) {
            int e1idx = I * e1c + J;
            int e2idx = I * e2c + K;
            int gyidx = I * gyc + L;
            yval += e1[e1idx] * e2[e2idx] * gy[gyidx];
        }
        y[i] = yval;
        ''',
        'sum_of_three_ary_tensor_product')(
            dgW, e1, e2, gy_vec, i_len, j_len, k_len, l_len)
    # [jkl]->jkl
    self.gW += dgW.reshape((j_len, k_len, l_len))

    if not self.nobias:
        e1 = array.as_mat(x[0])
        e2 = array.as_mat(x[1])
        with cuda.using_cumisc():
            # ij,il->jl
            cuda.culinalg.add_dot(e1, gy, self.gV1, transa='T')
            # ik,il->kl
            cuda.culinalg.add_dot(e2, gy, self.gV2, transa='T')
            self.gb += cuda.cumisc.sum(gy, 0)

    ge1 = cuda.empty((i_len * j_len,), dtype=numpy.float32)
    # '[ik],[jkl],[il]->[ij]'
    cuda.elementwise(
        '''
        float* y, float* e, float* W, float* gy,
        int ec, int gyc, int gec
        ''',
        '''
        int I = i / gec;
        int J = i % gec;
        float yval = 0;
        for (int K = 0; K < ec; ++K) {
            for (int L = 0; L < gyc; ++L) {
                int eidx = I * ec + K;
                int Widx = J * ec * gyc + K * gyc + L;
                int gyidx = I * gyc + L;
                yval += e[eidx] * W[Widx] * gy[gyidx];
            }
        }
        y[i] = yval;
        ''',
        'ge_kernel')(ge1, e2, W_vec, gy_vec, k_len, l_len, j_len)
    # [ij]->ij
    ge1 = ge1.reshape(i_len, j_len)

    ge2 = cuda.empty((i_len * k_len,), dtype=numpy.float32)
    # '[ij],[jkl],[il]->[ik]'
    cuda.elementwise(
        '''
        float* y, float* e, float* W, float* gy,
        int ec, int gyc, int gec
        ''',
        '''
        int I = i / gec;
        int K = i % gec;
        float yval = 0;
        for (int J = 0; J < ec; ++J) {
            for (int L = 0; L < gyc; ++L) {
                int eidx = I * ec + J;
                int Widx = J * gec * gyc + K * gyc + L;
                int gyidx = I * gyc + L;
                yval += e[eidx] * W[Widx] * gy[gyidx];
            }
        }
        y[i] = yval;
        ''',
        'ge_kernel2')(ge2, e1, W_vec, gy_vec, j_len, l_len, k_len)
    # [ik]->ik
    ge2 = ge2.reshape(i_len, k_len)

    if not self.nobias:
        with cuda.using_cumisc():
            # il,jl->ij
            cuda.culinalg.add_dot(gy, self.V1, ge1, transb='T')
            # il,kl->ik
            cuda.culinalg.add_dot(gy, self.V2, ge2, transb='T')

    return (ge1.reshape(x[0].shape), ge2.reshape(x[1].shape))
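# Hedged einsum sketch (added for illustration) of the gradients the kernels
# above compute; the reference name is hypothetical and the einsum subscripts
# follow the index comments in the code.
import numpy

def bilinear_backward_reference(e1, e2, gy, W, V1=None, V2=None):
    gW = numpy.einsum('ij,ik,il->jkl', e1, e2, gy)   # sum_of_three_ary_tensor_product
    ge1 = numpy.einsum('ik,jkl,il->ij', e2, W, gy)   # ge_kernel
    ge2 = numpy.einsum('ij,jkl,il->ik', e1, W, gy)   # ge_kernel2
    if V1 is not None:
        gV1 = e1.T.dot(gy)
        gV2 = e2.T.dot(gy)
        gb = gy.sum(axis=0)
        ge1 += gy.dot(V1.T)
        ge2 += gy.dot(V2.T)
        return ge1, ge2, gW, gV1, gV2, gb
    return ge1, ge2, gW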