def forward_gpu(self, x):
    """GPU forward pass of the bilinear function.

    Computes, per row i:
        y[i, l] = sum_{j, k} x0[i, j] * x1[i, k] * W[j, k, l]
    and, unless ``self.nobias`` is set, additionally accumulates the
    linear terms ``x0 @ V1 + x1 @ V2 + b``.

    Args:
        x: two-tuple of GPU arrays.  x[0] flattens to an (i_len, j_len)
            matrix, x[1] to (i_len, k_len).
            # NOTE(review): kernels declare float* and buffers are
            # float32, so inputs are presumably float32 — confirm.

    Returns:
        One-tuple holding the (i_len, l_len) float32 output matrix.
    """
    i_len, j_len = array.as_mat(x[0]).shape
    k_len = array.as_mat(x[1]).shape[1]
    l_len = self.W.shape[2]
    # When indices are enclosed with [], they are 'flatten'
    # (i.e. linealized as 1-D array)
    # ij->[ij]
    e1 = array.as_vec(x[0])
    # ik->[ik]
    e2 = array.as_vec(x[1])
    e1e2 = cuda.empty(i_len * j_len * k_len, dtype=numpy.float32)
    # '[ij],[ik]->[ijk]': per-row outer product of the two inputs.
    # The kernel decodes the flat output index i into (I, J, K) with
    # J varying over e1 columns and K over e2 columns.
    cuda.elementwise(
        'float* y, float* e1, float* e2, int e1c, int e2c',
        '''
           int I = i / e1c / e2c;
           int J = (i - I * e1c * e2c) / e2c;
           int K = i % e2c;
           y[i] = e1[I * e1c + J] * e2[I * e2c + K];
        ''',
        'row_wise_outer_product')(
            e1e2, e1, e2, j_len, k_len)
    # [ijk]->i[jk]
    e1e2 = e1e2.reshape(i_len, j_len * k_len)
    # jkl->[jk]l: flatten W's first two axes so the bilinear
    # contraction becomes a single GEMM.
    W_mat = self.W.reshape(
        self.W.shape[0] * self.W.shape[1], self.W.shape[2])
    y = cuda.empty((i_len, l_len), dtype=numpy.float32)
    with cuda.using_cumisc():
        # 'i[jk],[jk]l->il'
        cuda.culinalg.dot(e1e2, W_mat, out=y)
    if not self.nobias:
        e1 = array.as_mat(x[0])
        e2 = array.as_mat(x[1])
        with cuda.using_cumisc():
            # ij,jl->il
            cuda.culinalg.add_dot(e1, self.V1, y)
            # ik,kl->il
            cuda.culinalg.add_dot(e2, self.V2, y)
        # Broadcast-add bias b over rows (b indexed by output channel).
        cuda.elementwise(
            'float* y, float* b, int n_channel',
            'y[i] += b[i % n_channel]',
            'linear_bias')(y, self.b, self.b.size)
    return y,
def forward_gpu(self, x):
    """GPU implementation of the forward computation.

    Contracts the per-row outer product of the two input matrices with
    the 3-way weight tensor W (a single GEMM after flattening), then,
    when bias is enabled, adds the two linear maps V1, V2 and the bias
    vector b.  Returns a one-tuple with the (rows, out_dim) result.
    """
    rows, dim1 = array.as_mat(x[0]).shape
    dim2 = array.as_mat(x[1]).shape[1]
    dim_out = self.W.shape[2]
    # Bracketed index groups in the comments below denote flattened
    # (linearized 1-D) axes.
    left = array.as_vec(x[0])    # ij->[ij]
    right = array.as_vec(x[1])   # ik->[ik]
    outer = cuda.empty(rows * dim1 * dim2, dtype=numpy.float32)
    # '[ij],[ik]->[ijk]'
    cuda.elementwise(
        'float* y, float* e1, float* e2, int e1c, int e2c',
        '''
           int I = i / e1c / e2c;
           int J = (i - I * e1c * e2c) / e2c;
           int K = i % e2c;
           y[i] = e1[I * e1c + J] * e2[I * e2c + K];
        ''',
        'row_wise_outer_product')(outer, left, right, dim1, dim2)
    outer = outer.reshape(rows, dim1 * dim2)  # [ijk]->i[jk]
    # jkl->[jk]l
    w_flat = self.W.reshape(
        self.W.shape[0] * self.W.shape[1], self.W.shape[2])
    out = cuda.empty((rows, dim_out), dtype=numpy.float32)
    with cuda.using_cumisc():
        # 'i[jk],[jk]l->il'
        cuda.culinalg.dot(outer, w_flat, out=out)
    if not self.nobias:
        mat1 = array.as_mat(x[0])
        mat2 = array.as_mat(x[1])
        with cuda.using_cumisc():
            cuda.culinalg.add_dot(mat1, self.V1, out)  # ij,jl->il
            cuda.culinalg.add_dot(mat2, self.V2, out)  # ik,kl->il
        # Row-broadcast bias addition over output channels.
        cuda.elementwise(
            'float* y, float* b, int n_channel',
            'y[i] += b[i % n_channel]',
            'linear_bias')(out, self.b, self.b.size)
    return out,
def backward_gpu(self, x, gy):
    """GPU backward pass of the bilinear function.

    Accumulates the weight gradient
        gW[j, k, l] += sum_i x0[i, j] * x1[i, k] * gy[i, l]
    (and, when bias is enabled, gV1, gV2 and gb), and returns the input
    gradients
        ge1[i, j] = sum_{k, l} x1[i, k] * W[j, k, l] * gy[i, l]
        ge2[i, k] = sum_{j, l} x0[i, j] * W[j, k, l] * gy[i, l]
    each reshaped back to the corresponding input's original shape.
    """
    i_len, j_len = array.as_mat(x[0]).shape
    k_len = array.as_mat(x[1]).shape[1]
    l_len = gy[0].shape[1]
    # ij->[ij]
    e1 = array.as_vec(x[0])
    # ik->[ik]
    e2 = array.as_vec(x[1])
    gy, = gy
    # il->[il]
    gy_vec = array.as_vec(gy)
    # jkl->[jkl]
    W_vec = array.as_vec(self.W)
    dgW = cuda.empty((j_len * k_len * l_len,), dtype=numpy.float32)
    # '[ij],[ik],[il]->[jkl]': each thread owns one (J, K, L) output
    # cell and serially reduces over the batch index I.
    cuda.elementwise(
        '''
           float* y, float* e1, float* e2, float* gy, int r,
           int e1c, int e2c, int gyc
        ''',
        '''
           int J = i / e2c / gyc;
           int K = (i - J * e2c * gyc) / gyc;
           int L = i % gyc;
           float yval = 0;
           for (int I = 0; I < r; ++I) {
             int e1idx = I * e1c + J;
             int e2idx = I * e2c + K;
             int gyidx = I * gyc + L;
             yval += e1[e1idx] * e2[e2idx] * gy[gyidx];
           }
           y[i] = yval;
        ''',
        'sum_of_three_ary_tensor_product')(
            dgW, e1, e2, gy_vec, i_len, j_len, k_len, l_len)
    # [jkl]->jkl
    self.gW += dgW.reshape((j_len, k_len, l_len))
    if not self.nobias:
        e1 = array.as_mat(x[0])
        e2 = array.as_mat(x[1])
        with cuda.using_cumisc():
            # ij,il->jl
            cuda.culinalg.add_dot(e1, gy, self.gV1, transa='T')
            # ik,il->kl
            cuda.culinalg.add_dot(e2, gy, self.gV2, transa='T')
        # Bias gradient: column-wise sum of gy over the batch axis.
        self.gb += cuda.cumisc.sum(gy, 0)
    ge1 = cuda.empty((i_len * j_len,), dtype=numpy.float32)
    # '[ik],[jkl],[il]->[ij]': note the second input (e2) and its
    # column count (k_len) are passed as e/ec here.
    cuda.elementwise(
        '''
           float* y, float* e, float* W, float* gy,
           int ec, int gyc, int gec
        ''',
        '''
           int I = i / gec;
           int J = i % gec;
           float yval = 0;
           for (int K = 0; K < ec; ++K) {
             for (int L = 0; L < gyc; ++L) {
               int eidx = I * ec + K;
               int Widx = J * ec * gyc + K * gyc + L;
               int gyidx = I * gyc + L;
               yval += e[eidx] * W[Widx] * gy[gyidx];
             }
           }
           y[i] = yval;
        ''',
        'ge_kernel')(ge1, e2, W_vec, gy_vec, k_len, l_len, j_len)
    # [ij]->ij
    ge1 = ge1.reshape(i_len, j_len)
    ge2 = cuda.empty((i_len * k_len,), dtype=numpy.float32)
    # '[ij],[jkl],[il]->[ik]': symmetric to ge_kernel but reduces over
    # J, so the W stride pattern differs (J is W's leading axis).
    cuda.elementwise(
        '''
           float* y, float* e, float* W, float* gy,
           int ec, int gyc, int gec
        ''',
        '''
           int I = i / gec;
           int K = i % gec;
           float yval = 0;
           for (int J = 0; J < ec; ++J) {
             for (int L = 0; L < gyc; ++L) {
               int eidx = I * ec + J;
               int Widx = J * gec * gyc + K * gyc + L;
               int gyidx = I * gyc + L;
               yval += e[eidx] * W[Widx] * gy[gyidx];
             }
           }
           y[i] = yval;
        ''',
        'ge_kernel2')(ge2, e1, W_vec, gy_vec, j_len, l_len, k_len)
    # [ik]->ik
    ge2 = ge2.reshape(i_len, k_len)
    if not self.nobias:
        with cuda.using_cumisc():
            # il,jl->ij
            cuda.culinalg.add_dot(gy, self.V1, ge1, transb='T')
            # il,kl->ik
            cuda.culinalg.add_dot(gy, self.V2, ge2, transb='T')
    return (ge1.reshape(x[0].shape), ge2.reshape(x[1].shape))
def backward_gpu(self, x, gy):
    """GPU backward pass of the bilinear function.

    Accumulates the weight gradient
        gW[j, k, l] += sum_i x0[i, j] * x1[i, k] * gy[i, l]
    (plus gV1, gV2 and gb when bias is enabled), then returns the input
    gradients
        ge1[i, j] = sum_{k, l} x1[i, k] * W[j, k, l] * gy[i, l]
        ge2[i, k] = sum_{j, l} x0[i, j] * W[j, k, l] * gy[i, l]
    each reshaped back to its input's original shape.
    """
    i_len, j_len = array.as_mat(x[0]).shape
    k_len = array.as_mat(x[1]).shape[1]
    l_len = gy[0].shape[1]
    # ij->[ij]
    e1 = array.as_vec(x[0])
    # ik->[ik]
    e2 = array.as_vec(x[1])
    gy, = gy
    # il->[il]
    gy_vec = array.as_vec(gy)
    # jkl->[jkl]
    W_vec = array.as_vec(self.W)
    dgW = cuda.empty((j_len * k_len * l_len, ), dtype=numpy.float32)
    # '[ij],[ik],[il]->[jkl]': one thread per (J, K, L) cell, serial
    # reduction over the batch index I.
    cuda.elementwise(
        '''
           float* y, float* e1, float* e2, float* gy, int r,
           int e1c, int e2c, int gyc
        ''',
        '''
           int J = i / e2c / gyc;
           int K = (i - J * e2c * gyc) / gyc;
           int L = i % gyc;
           float yval = 0;
           for (int I = 0; I < r; ++I) {
             int e1idx = I * e1c + J;
             int e2idx = I * e2c + K;
             int gyidx = I * gyc + L;
             yval += e1[e1idx] * e2[e2idx] * gy[gyidx];
           }
           y[i] = yval;
        ''',
        'sum_of_three_ary_tensor_product')(
            dgW, e1, e2, gy_vec, i_len, j_len, k_len, l_len)
    # [jkl]->jkl
    self.gW += dgW.reshape((j_len, k_len, l_len))
    if not self.nobias:
        e1 = array.as_mat(x[0])
        e2 = array.as_mat(x[1])
        with cuda.using_cumisc():
            # ij,il->jl
            cuda.culinalg.add_dot(e1, gy, self.gV1, transa='T')
            # ik,il->kl
            cuda.culinalg.add_dot(e2, gy, self.gV2, transa='T')
        # Bias gradient: sum gy over the batch (row) axis.
        self.gb += cuda.cumisc.sum(gy, 0)
    ge1 = cuda.empty((i_len * j_len, ), dtype=numpy.float32)
    # '[ik],[jkl],[il]->[ij]': e2/k_len are bound to e/ec here.
    cuda.elementwise(
        '''
           float* y, float* e, float* W, float* gy,
           int ec, int gyc, int gec
        ''',
        '''
           int I = i / gec;
           int J = i % gec;
           float yval = 0;
           for (int K = 0; K < ec; ++K) {
             for (int L = 0; L < gyc; ++L) {
               int eidx = I * ec + K;
               int Widx = J * ec * gyc + K * gyc + L;
               int gyidx = I * gyc + L;
               yval += e[eidx] * W[Widx] * gy[gyidx];
             }
           }
           y[i] = yval;
        ''',
        'ge_kernel')(ge1, e2, W_vec, gy_vec, k_len, l_len, j_len)
    # [ij]->ij
    ge1 = ge1.reshape(i_len, j_len)
    ge2 = cuda.empty((i_len * k_len, ), dtype=numpy.float32)
    # '[ij],[jkl],[il]->[ik]': symmetric kernel, but the reduction runs
    # over J (W's leading axis), so Widx uses a different stride.
    cuda.elementwise(
        '''
           float* y, float* e, float* W, float* gy,
           int ec, int gyc, int gec
        ''',
        '''
           int I = i / gec;
           int K = i % gec;
           float yval = 0;
           for (int J = 0; J < ec; ++J) {
             for (int L = 0; L < gyc; ++L) {
               int eidx = I * ec + J;
               int Widx = J * gec * gyc + K * gyc + L;
               int gyidx = I * gyc + L;
               yval += e[eidx] * W[Widx] * gy[gyidx];
             }
           }
           y[i] = yval;
        ''',
        'ge_kernel2')(ge2, e1, W_vec, gy_vec, j_len, l_len, k_len)
    # [ik]->ik
    ge2 = ge2.reshape(i_len, k_len)
    if not self.nobias:
        with cuda.using_cumisc():
            # il,jl->ij
            cuda.culinalg.add_dot(gy, self.V1, ge1, transb='T')
            # il,kl->ik
            cuda.culinalg.add_dot(gy, self.V2, ge2, transb='T')
    return (ge1.reshape(x[0].shape), ge2.reshape(x[1].shape))