def read(self, images, center_y, center_x, delta, sigma):
    """
    Parameters
    ----------
    images : T.matrix   (shape: batch_size x img_size)
        Batch of images. Internally it will be reshaped to be a
        (batch_size, img_height, img_width)-shaped stack of images.
    center_y : T.vector (shape: batch_size)
    center_x : T.vector (shape: batch_size)
    delta : T.vector    (shape: batch_size)
    sigma : T.vector    (shape: batch_size)

    Returns
    -------
    window : T.matrix   (shape: batch_size x N**2)
    """
    N = self.N
    batch_size = images.shape[0]

    # Reshape input into proper 2d images
    I = images.reshape((batch_size, self.img_height, self.img_width))

    # Get separable filterbank
    FY, FX = self.filterbank_matrices(center_y, center_x, delta, sigma)

    # apply to the batch of images
    W = T.batched_dot(T.batched_dot(FY, I), FX.transpose([0, 2, 1]))

    return W.reshape((batch_size, N*N))
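# --- Added sketch (not from the original source): a NumPy shape check of the
# --- separable read above. T.batched_dot applies one matrix product per batch
# --- element, so the read window is FY @ I @ FX^T for every image. Sizes are
# --- hypothetical (batch of 2 images of 8x10 pixels, N=5 attention grid).
import numpy as np

batch, img_h, img_w, N = 2, 8, 10, 5
FY = np.random.randn(batch, N, img_h)     # vertical filterbank
FX = np.random.randn(batch, N, img_w)     # horizontal filterbank
I = np.random.randn(batch, img_h, img_w)  # images

window = np.einsum('bnh,bhw,bmw->bnm', FY, I, FX)  # batched FY @ I @ FX^T
print(window.shape)  # (2, 5, 5), then flattened to (batch, N*N)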
def compute_psi2(lls, lsf, z, input_means, input_vars):
    ls = T.exp(lls)
    sf = T.exp(lsf)
    b = ls / casting(2.0)
    term_1 = T.prod(T.sqrt(b / (b + input_vars)), 1)

    scale = T.sqrt(4 * (2 * b[None, :] + 0 * input_vars))
    scaled_z = z[None, :, :] / scale[:, None, :]
    scaled_z_minus_m = scaled_z
    r2b = T.sum(scaled_z_minus_m**2, 2)[:, None, :] + T.sum(scaled_z_minus_m**2, 2)[:, :, None] - \
        2 * T.batched_dot(scaled_z_minus_m, np.transpose(scaled_z_minus_m, [0, 2, 1]))
    term_2 = T.exp(-r2b)

    scale = T.sqrt(4 * (2 * b[None, :] + 2 * input_vars))
    scaled_z = z[None, :, :] / scale[:, None, :]
    scaled_m = input_means / scale
    scaled_m = T.tile(scaled_m[:, None, :], [1, z.shape[0], 1])
    scaled_z_minus_m = scaled_z - scaled_m
    r2b = T.sum(scaled_z_minus_m**2, 2)[:, None, :] + T.sum(scaled_z_minus_m**2, 2)[:, :, None] + \
        2 * T.batched_dot(scaled_z_minus_m, np.transpose(scaled_z_minus_m, [0, 2, 1]))
    term_3 = T.exp(-r2b)

    psi2_computed = sf**casting(2.0) * term_1[:, None, None] * term_2 * term_3
    return T.transpose(psi2_computed, [1, 2, 0])
def energy(self):
    rho_x = rho(self.x)
    rho_h = rho(self.h)
    squared_norm = (T.batched_dot(self.x, self.x) + T.batched_dot(self.h, self.h)) / 2
    uni_terms = -T.dot(rho_x, self.bx) - T.dot(rho_h, self.bh)
    bi_terms = -T.batched_dot(T.dot(rho_x, self.W1), rho_h)
    return squared_norm + uni_terms + bi_terms
def fwd(self, x, V, A, L):
    """
    x : signal
    V : eigenvectors
    A : area
    L : eigenvalues
    """
    V = V[:, :self.K]
    L = L[:self.K]
    L = L.dimshuffle('x', 'x', 0)
    rho = T.sqrt(T.sum(A))

    # Q x 1 x K, a window for each input function
    ghat = self.activation_interp(
        T.batched_dot(T.tile(L, [self.nin, 1, 1]), self.Winterp))
    # Q x K x N
    V_ = T.tile(V.dimshuffle('x', 1, 0), [self.nin, 1, 1])
    # Q x K x N
    tmp = (ghat * V).dimshuffle(0, 2, 1)
    # Q x N x N
    transl = rho * T.batched_dot(V_.dimshuffle(0, 2, 1), tmp)
    transl = A.dimshuffle('x', 0, 'x') * transl
    # Q x K x N
    tmp = (V.dimshuffle(0, 'x', 1) * x.dimshuffle(0, 1, 'x')).dimshuffle(1, 2, 0)
    # Q x K x N
    desc = rho * T.batched_dot(tmp, transl)
    desc = T.abs_(desc)
    desc = desc.dimshuffle(2, 0, 'x', 1)  # BC01 format : N x Q x 1 x K
    return self.activation(theano.tensor.nnet.conv.conv2d(desc, self.W).flatten(2) + self.b)
def one_step(self, l, images): ''' l = [n_examples, 5] image = [n_examples, height, width] ''' tol = 1e-4 g_x = self.B * (l[:, 0] + 1) / 2. g_y = self.A * (l[:, 1] + 1) / 2. delta = (max(self.A, self.B) - 1) / (self.N - 1) * T.exp(l[:, 2]) sigma = T.exp(l[:, 3]) mu_x = g_x.dimshuffle([0, 'x']) +\ (self.mu_ind - self.N / 2. + 0.5) * delta.dimshuffle([0, 'x']) mu_y = g_y.dimshuffle([0, 'x']) +\ (self.mu_ind - self.N / 2. + 0.5) * delta.dimshuffle([0, 'x']) F_x = T.exp(-((self.B_ind - mu_x.dimshuffle([0, 1, 'x']))**2) / ( 2 * (sigma.dimshuffle([0, 'x', 'x'])))**2) F_x = F_x / (F_x.sum(axis=-1).dimshuffle(0, 1, 'x') + tol) # Compute Y filter banks## F_y = T.exp(-((self.A_ind - mu_y.dimshuffle([0, 1, 'x']))**2) / ( 2 * (sigma.dimshuffle([0, 'x', 'x'])))**2) F_y = F_y / (F_y.sum(axis=-1).dimshuffle(0, 1, 'x') + tol) read = T.batched_dot(T.batched_dot(F_y, images), F_x.dimshuffle([0, 2, 1])) return read, g_x, g_y, delta, sigma
def propup_given_h_lag(self, vt, h_lag, hbias):
    if h_lag == self.h0:
        x = T.batched_dot(vt, self.W) + T.addbroadcast(
            T.dot(h_lag, self.Wt) + hbias, 0, 1)
    else:
        x = T.batched_dot(vt, self.W) + hbias + T.dot(h_lag, self.Wt)
    return [x, T.nnet.sigmoid(x)]
def get_output_for(self, inputs, **kwargs): # seq_input: (batch_size, seq_size, n_hidden_con) # seq_mask: (batch_size, seq_size) # condition: (batch_size, n_hidden_con) seq_input, seq_mask, condition = inputs if self.gate_covariance: update = T.nnet.sigmoid( T.sum(seq_input * self.w_gate, axis=-1, keepdims=True) + self.b_gate) seq_input *= update length_seq = seq_input.shape[1] if self.covariance_decay: decay = T.arange(1, length_seq+1) decay = (self.covariance_decay + (length_seq-decay) * (1 - self.covariance_decay)) decay = T.sqrt(decay) decay = decay.dimshuffle('x', 0, 'x') seq_input *= decay seq_input *= T.shape_padright(seq_mask) # (batch_size, n_hidden_question, n_hidden_question) covariance = T.batched_dot(seq_input.dimshuffle(0, 2, 1), seq_input) # (batch_size, n_hidden_question), equivalent to the following line: # att = T.sum(covariance * condition.dimshuffle((0, 'x', 1)), axis=2) att = 1000 * T.batched_dot(covariance, condition.dimshuffle((0, 1))) if not self.covariance_decay: att /= T.sum(seq_mask, axis=1, keepdims=True) # norm2_att = T.sum(att * condition, axis=1, keepdims=True) # att = 1000 * att / norm2_att return att
def h_given_h_lag_vt(self, vt, h_lag, hbias):
    if h_lag == self.h0:
        x = T.batched_dot(vt, self.W) + T.addbroadcast(
            T.dot(h_lag, self.Wt) + hbias.dimshuffle('x', 0), 0, 1)
    else:
        x = T.batched_dot(vt, self.W) + \
            T.dot(h_lag, self.Wt) + hbias.dimshuffle('x', 0)
    return [x, T.nnet.sigmoid(x)]
def energy_function():
    squared_norm = (
        T.batched_dot(self.x, self.x) +
        T.batched_dot(self.h, self.h) +
        T.batched_dot(self.y, self.y)
    ) / 2.0
    uni_terms = -T.dot(self.rho_x, self.bx) - T.dot(self.rho_h, self.bh) - T.dot(self.rho_y, self.by)
    bi_terms = -T.batched_dot(T.dot(self.rho_x, self.W1), self.rho_h) - T.batched_dot(T.dot(self.rho_h, self.W2), self.rho_y)
    return squared_norm + uni_terms + bi_terms
def defmodel(self):
    lhs = T.ivector("lhs")
    rhs, nrhs = T.ivectors("rhs", "nrhs")
    lhsemb = self.entembs[lhs, :]
    rhsemb = self.W[rhs, :]
    nrhsemb = self.W[nrhs, :]
    pdot = T.batched_dot(lhsemb, rhsemb)
    ndot = T.batched_dot(lhsemb, nrhsemb)
    return pdot, ndot, [lhs, rhs, nrhs]
def step(self, x, states): h_tild_tm1 = states[0] B_U = states[1] B_W = states[2] if self.consume_less == 'cpu': x_i = x[:, :self.output_dim] x_f = x[:, self.output_dim: 2 * self.output_dim] x_c = x[:, 2 * self.output_dim: 3 * self.output_dim] x_o = x[:, 3 * self.output_dim: 4 * self.output_dim] x_new = x[:, 4 * self.output_dim:] else: x_i = K.dot(x * B_W[0], self.W_i) + self.b_i x_f = K.dot(x * B_W[1], self.W_f) + self.b_f x_c = K.dot(x * B_W[2], self.W_c) + self.b_c x_o = K.dot(x * B_W[3], self.W_o) + self.b_o x_new = x # self.C_tape -> BT, t-1, k # self.H_tape -> BT, t-1, k # x -> BT, k # h_tild_tm1 -> BT, k if self.H_tape is None: self.H_tape = K.zeros_like(h_tild_tm1).dimshuffle((0,'x',1)) self.C_tape = K.zeros_like(h_tild_tm1).dimshuffle((0,'x',1)) # s_t -> BT, t-1, 1 t = K.shape(self.C_tape)[1] sum1 = K.dot(self.H_tape, self.W_h) sum2 = K.dot(K.repeat_elements(x_new.dimshuffle((0,'x',1)),t, axis=1), self.W_x) sum3 = K.dot(K.repeat_elements(h_tild_tm1.dimshuffle((0,'x',1)),t, axis=1), self.W_h_tilde) tanhed_sum = K.tanh(sum1 + sum2 + sum3) a_t = K.dot(tanhed_sum, self.v)[:,:,0] s_t = K.softmax(a_t) h_tilde_t = T.batched_dot(self.H_tape.dimshuffle((0,2,1)), s_t.dimshuffle((0,1,'x')))[:,:,0] c_tilde_t = T.batched_dot(self.C_tape.dimshuffle((0,2,1)), s_t.dimshuffle((0,1,'x')))[:,:,0] i = self.inner_activation(x_i + K.dot(h_tilde_t * B_U[0], self.U_i)) f = self.inner_activation(x_f + K.dot(h_tilde_t * B_U[1], self.U_f)) c_t = f * c_tilde_t + i * self.activation(x_c + K.dot(h_tilde_t * B_U[2], self.U_c)) o = self.inner_activation(x_o + K.dot(h_tilde_t * B_U[3], self.U_o)) h_t = o * self.activation(c_t) # Add to Tape self.C_tape = K.concatenate([self.C_tape, c_t.dimshuffle((0,'x',1))], axis=1) self.H_tape = K.concatenate([self.H_tape, h_t.dimshuffle((0,'x',1))], axis=1) return h_t, [h_tilde_t]
def batched_cos_sim(s):
    """ from (x,y,z)-shaped pair, produce (x,y)-shaped pair that replaces
    the z-vector pairs by their cosine similarities """
    import theano
    import theano.tensor as T
    return theano.scan(
        fn=lambda xm, ym: T.batched_dot(xm, ym) / T.sqrt(T.batched_dot(xm, xm) * T.batched_dot(ym, ym)),
        outputs_info=None, sequences=s, non_sequences=None,
    )[0]
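# --- Added sketch (my own equivalence check, assuming standard Theano scan and
# --- batched_dot semantics): for 2-D arguments, T.batched_dot is a row-wise dot
# --- product, so the scan above yields an (x, y) array of cosine similarities.
import numpy as np

a = np.random.randn(2, 4, 3)
b = np.random.randn(2, 4, 3)
dots = np.einsum('xyz,xyz->xy', a, b)
norms = np.sqrt(np.einsum('xyz,xyz->xy', a, a) * np.einsum('xyz,xyz->xy', b, b))
print((dots / norms).shape)  # (2, 4), each entry a cosine similarity of z-vectors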
def free_energy_given_hid_lag(self, vt, h_lag, hbias, vbias):
    if h_lag == self.h0:
        wx_b = T.batched_dot(vt, self.W) + \
            T.addbroadcast(T.dot(h_lag, self.Wt) + hbias, 0, 1)
        vbias_term = T.batched_dot(vt, vbias)
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=2)
    else:
        wx_b = T.batched_dot(vt, self.W) + T.dot(h_lag, self.Wt) + \
            hbias.dimshuffle('x', 0)
        vbias_term = T.batched_dot(vt, vbias)
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=2)
    return -hidden_term - vbias_term
def MemLayer(incomings, params, linear=0):
    '''
    incomings = ((u, u_shape), (A, A_shape), (C, C_shape))
    '''
    ((u, u_shape), (A, A_shape), (C, C_shape)) = incomings
    p = T.switch(linear, T.batched_dot(A, u), nnet.softmax(T.batched_dot(A, u)))
    p_shape = A_shape[:2]
    # C.shape = (batch_size, num_sen, embed_size), u.shape = (batch_size, embed_size)
    # p.shape = (batch_size, num_sen, 1)
    # return (p, u_shape)
    O = (C * p[:, :, None]).sum(axis=1)
    return ((O, u_shape), (p, p_shape))
def write(self, windows, center_y, center_x, delta, sigma):
    N = self.N
    batch_size = windows.shape[0]

    # Reshape input into proper 2d windows
    W = windows.reshape((batch_size, N, N))

    # Get separable filterbank
    FY, FX = self.filterbank_matrices(center_y, center_x, delta, sigma)

    # apply...
    I = T.batched_dot(T.batched_dot(FY.transpose([0, 2, 1]), W), FX)

    return I.reshape((batch_size, self.img_height*self.img_width))
def factorization(self, batchSize, argsEmbA, argsEmbB, wC, wC1, wC2):
    # l = batchSize
    # k = self.k  # embed size
    # r = self.r  # relation number
    # argEmbedsA = self.A[argsA.flatten()]  # [l,k]
    # argEmbedsB = self.A[argsB.flatten()]  # [l,k]
    # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]])  # [l,r] * [k,k,r] = [l, k, k]
    Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]])  # + self.Cb  # [l, k, k] * [l, k] = [l, k]
    Asecond = T.batched_dot(Afirst, argsEmbB)  # [l, k] * [l, k] = [l]
    # entropy = T.sum(T.log(relationProbs) * relationProbs, axis=1)  # [l,r] * [l,r] = [l]
    spFirst = T.batched_dot(wC1, argsEmbA)
    spSecond = T.batched_dot(wC2, argsEmbB)
    return Asecond + spFirst + spSecond
def _initialize_posterior_distribution(self, RecognitionParams): # Now actually compute the precisions (from their square roots) self.Lambda = T.batched_dot(self.LambdaChol, self.LambdaChol.dimshuffle(0,2,1)) # dynamics matrix & initialize the innovations precision, xDim x xDim self.A = theano.shared(value=RecognitionParams['A'].astype(theano.config.floatX) ,name='A' ) self.QinvChol = theano.shared(value=RecognitionParams['QinvChol'].astype(theano.config.floatX) ,name='QinvChol' ) self.Q0invChol = theano.shared(value=RecognitionParams['Q0invChol'].astype(theano.config.floatX),name='Q0invChol') self.Qinv = T.dot(self.QinvChol,self.QinvChol.T) self.Q0inv = T.dot(self.Q0invChol,self.Q0invChol.T) ################## put together the total precision matrix ###################### AQinvA = T.dot(T.dot(self.A.T, self.Qinv), self.A) # for now we (suboptimally) replicate a bunch of times AQinvrep = Tsla.kron(T.ones([self.Tt-1,1,1]),-T.dot(self.A.T, self.Qinv)) # off-diagonal blocks (upper triangle) AQinvArep = Tsla.kron(T.ones([self.Tt-2,1,1]), AQinvA+self.Qinv) AQinvArepPlusQ = T.concatenate([T.shape_padleft(self.Q0inv + AQinvA), AQinvArep, T.shape_padleft(self.Qinv)]) # This is our inverse covariance matrix: diagonal (AA) and off-diagonal (BB) blocks. self.AA = self.Lambda + AQinvArepPlusQ self.BB = AQinvrep # symbolic recipe for computing the the diagonal (V) and # off-diagonal (VV) blocks of the posterior covariance self.V, self.VV, self.S = compute_sym_blk_tridiag(self.AA, self.BB) # now compute the posterior mean LambdaMu = T.batched_dot(self.Lambda, self.Mu) # scale by precision (no need for transpose; lambda is symmetric) #self.old_postX = compute_sym_blk_tridiag_inv_b(self.S,self.V,LambdaMu) # apply inverse # compute cholesky decomposition self.the_chol = blk_tridag_chol(self.AA, self.BB) # intermediary (mult by R^T) - ib = blk_chol_inv(self.the_chol[0], self.the_chol[1], LambdaMu) # final result (mult by R)- self.postX = blk_chol_inv(self.the_chol[0], self.the_chol[1], ib, lower=False, transpose=True) # The determinant of the covariance is the square of the determinant of the cholesky factor. # Determinant of the Cholesky factor is the product of the diagonal elements of the block-diagonal. def comp_log_det(L): return T.log(T.diag(L)).sum() self.ln_determinant = -2*theano.scan(fn=comp_log_det, sequences=self.the_chol[0])[0].sum()
def attention_decoder_calc(prefix, params, layer_setting, h_e, mask_below, state_below,
                           h_init=None, c_init=None, mask=None, training=True):
    [h_d, c_d] = lstm_calc(prefix+'_lstm', params, layer_setting['_lstm'], state_below,
                           h_init, c_init, mask, training=training)
    alpha = attention_calc(prefix+'_attention', params, layer_setting['_attention'], h_d, h_e)
    context = T.batched_dot(alpha.dimshuffle(1, 0, 2), h_e.dimshuffle(1, 0, 2)).dimshuffle(1, 0, 2)
    h_d2 = feedforward_calc(prefix+'_tanh', params, layer_setting['_tanh'],
                            T.concatenate([h_d, context], axis=2))
    dist = feedforward_calc(prefix+'_softmax', params, layer_setting['_softmax'], h_d2)
    return h_d, c_d, dist, alpha
def __init__(self, input_group, n_in_list, emb_dim):
    print input_group
    self.n_in_list = n_in_list
    self.n_out = emb_dim

    Xs = []
    Ws = []
    bs = []
    outs = []
    self.Ws = Ws
    self.bs = bs
    self.Xs = Xs
    self.outs = outs
    for i, input in enumerate(input_group):
        x = input
        w = theano.shared(value=(numpy.random.rand(n_in_list[i], emb_dim)-0.5), borrow=True)
        b = theano.shared(value=numpy.random.rand(emb_dim), borrow=True)
        Xs.append(x)
        Ws.append(w)
        bs.append(b)
        outs.append(T.dot(x, w) + b)

    #### Active function ####
    # TODO: just support dot(Xs[0], Xs[1]) now.
    if len(Xs) != 2:
        raise Exception('Just support 2 input group now.')
    self.Y = T.batched_dot(outs[0], outs[1])

    # Function Definition.
    self.active = theano.function(Xs, self.Y)
def get_output_for(self, inputs, **kwargs):
    assert len(inputs) == 4
    context, question, c_mask, q_mask = inputs
    batch_size, question_len, emb_size = question.shape
    question = question.reshape((batch_size * question_len, emb_size)) * self.V
    question = question.reshape((batch_size, question_len, emb_size))

    # batch_size x emb_size x context_len
    context = context.dimshuffle(0, 2, 1)

    # batch_size x question_len x context_len
    x = T.batched_dot(question, context)
    x_max = x.max(axis=2).dimshuffle(0, 1, 'x')
    esim = T.exp(x - x_max)
    esim *= c_mask.reshape((batch_size, 1, -1))
    sums = esim.sum(axis=2)
    esim /= sums.dimshuffle(0, 1, 'x')
    esim *= q_mask.reshape((batch_size, -1, 1))

    return esim.sum(axis=1)  # batch_size x context_len
def cosine_similarity(x, y, eps=1e-6):
    z = T.batched_dot(x, y.dimshuffle(0, 2, 1))
    z /= T.sqrt(T.sum(x * x, axis=2).dimshuffle(0, 1, 'x') *
                T.sum(y * y, axis=2).dimshuffle(0, 'x', 1) + eps)
    return z
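# --- Added check (not from the original source, hypothetical shapes): the
# --- broadcasting above compares every row of x with every row of y per batch
# --- element. A NumPy restatement of the same algebra:
import numpy as np

def cosine_similarity_np(x, y, eps=1e-6):
    z = np.einsum('bnd,bmd->bnm', x, y)  # batched x @ y^T
    z /= np.sqrt(np.einsum('bnd,bnd->bn', x, x)[:, :, None] *
                 np.einsum('bmd,bmd->bm', y, y)[:, None, :] + eps)
    return z

print(cosine_similarity_np(np.random.randn(2, 4, 8), np.random.randn(2, 3, 8)).shape)  # (2, 4, 3)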
def rightMostFactorization(self, batchSize, args, wC2):
    l = batchSize
    k = self.k  # embed size
    r = self.r  # relation number
    argEmbeds2 = self.A[args.flatten()]
    Asecond = T.batched_dot(wC2, argEmbeds2)
    return Asecond
def leftMostFactorization(self, batchSize, args, wC1):
    l = batchSize
    k = self.k  # embed size
    r = self.r  # relation number
    argEmbeds = self.A[args.flatten()]
    Afirst = T.batched_dot(wC1, argEmbeds)
    return Afirst
def getDM_score(kb_entities, kb_relations, neg_samples_kb, opts): neg_samples = opts.neg_samples vect_dim = opts.vect_dim num_entities = opts.num_entities num_relations = opts.num_relations l2_reg_entities = opts.l2_entity l2_reg_relations = opts.l2_relation ''' while reading some models are stores with entity embeddings named as entity_embeddings, and some as entity_embeddings_DM ''' entities = Embedding(output_dim=vect_dim, input_dim=num_entities+1, init='normal',name = 'entity_embeddings', W_regularizer=l2(l2_reg_entities)) relations = Embedding(output_dim=vect_dim, input_dim=num_relations, input_length=1,init='normal', name='relation_embeddings', W_regularizer=l2(l2_reg_relations)) entity_vectors = entities(kb_entities) entity_negative_vectors = entities(neg_samples_kb) relation_vectors = Flatten()(relations(kb_relations)) get_cross_1 = get_cross(0, neg_samples) get_cross_2 = get_cross(1, neg_samples) e1_cross_e2_prime = merge([entity_vectors, entity_negative_vectors], mode = get_cross_1, output_shape = (neg_samples, vect_dim)) e1_prime_cross_e2 = merge([entity_vectors, entity_negative_vectors], mode = get_cross_2, output_shape = (neg_samples, vect_dim)) e1_cross_e2 = Lambda(cross_e1_e2, output_shape = (vect_dim,))(entity_vectors) score_DM = merge([relation_vectors, e1_cross_e2], mode = lambda X : T.batched_dot(X[0], X[1]), output_shape=()) score_DM_e2_corrupted = merge([relation_vectors, e1_cross_e2_prime], mode = 'dot', output_shape=(neg_samples,), dot_axes=(1,2)) score_DM_e1_corrupted = merge([relation_vectors, e1_prime_cross_e2], mode = 'dot', output_shape=(neg_samples,), dot_axes=(1,2)) return score_DM, score_DM_e1_corrupted, score_DM_e2_corrupted
def step(self, x, states): h_tm1 = states[0] c_tm1 = states[1] h_tilde = x[:,0,:] L = K.params['xmaxlen'] M = K.tanh(self.precompute_W_y_y + x + K.repeat_elements(K.dot(h_tm1, self.U_r).dimshuffle((0,'x',1)),L, axis=1)) alpha = K.dot(M, self.W) alpha = K.softmax(alpha[:,:,0]) alpha = alpha.dimshuffle((0,'x',1)) output = T.batched_dot(alpha,self.Y) output = output[:,0,:] xt = K.concatenate([h_tilde,output],axis = 1) it = K.sigmoid( K.dot(xt,self.W_i) + K.dot(h_tilde,self.U_i) + self.b_i ) ft = K.sigmoid(K.dot(xt,self.W_f) + K.dot(h_tilde,self.U_f) + self.b_f) ot = K.sigmoid(K.dot(xt,self.W_o) + K.dot(h_tilde,self.U_o) + self.b_o) c_tilde_t = K.dot(xt,self.W_c) + K.dot(h_tilde,self.U_c) + self.b_c c_t = ft * c_tm1 + it*K.tanh( c_tilde_t ) h_t = ot * K.tanh(c_t) return h_t, [h_t,c_t]
def _fprop_step(state_below, state_below_in, state_below_z, state_below_r, state_before, W_recurrent, W_in, b, W_z, U_z, b_z, W_r, U_r, b_r): print "state before 1", state_before, state_before.dtype, state_before.type, state_before.broadcastable #state_before = tensor.unbroadcast(state_before, 0) z = tensor.nnet.sigmoid(state_below_z + tensor.dot(state_before, U_z) + b_z) r = tensor.nnet.sigmoid(state_below_r + tensor.dot(state_before, U_r) + b_r) #print "r dim", r.type.ndim #W_rec = self.project1(W_recurrent, state_below) print "State below step", state_below, state_below.broadcastable, state_below.ndim print "state before 2", state_before, state_before.dtype, state_before.type, state_before.broadcastable W_rec = W_recurrent[state_below] bias = b[state_below] # !!! Move to efficient indexing #shape = (state_below.shape[0], state_below.shape[1], self.dim) pre_h = ( state_below_in + r * tensor.batched_dot(state_before, W_rec)#.reshape(shape) + bias ) print "pre_h dim", pre_h, pre_h.type.ndim #print "W_recurrent[state_below] dim", W_rec, W_rec.ndim # print "W_rec * state before", (state_before* W_rec).ndim new_h = tensor.tanh(pre_h) #print "new_h", new_h h = z * state_before + (1. - z) * new_h print "final h dim", h, h.type, h.broadcastable, h.ndim h = tensor.unbroadcast(h, 0) return h
def step(mask, alpha_pre, s_pre, h_pre, c_pre): score = T.dot(h_pre, params[join(prefix, 'W_h')]) score = state_below + score[None, :, :] score = T.dot(T.tanh(score), params[join( prefix, 'W_f2')]) + params[join(prefix, 'b_f2')] shp = score.shape alpha = softmax_mask(T.reshape(score, [shp[1], shp[0]], ndim=2), inputMask.dimshuffle(1, 0)) context = T.batched_dot(alpha.dimshuffle(0, 'x', 1), reference.dimshuffle(1, 0, 2)).dimshuffle( 0, 2, ) activation = T.dot(h_pre, params[join(prefix, 'U')]) activation += T.dot(context, params[join(prefix, 'W')]) + params[join( prefix, 'b')] activation_i = slice(activation, 0, n_out) activation_f = slice(activation, 1, n_out) activation_c = slice(activation, 2, n_out) activation_o = slice(activation, 3, n_out) i = sigmoid(activation_i) f = sigmoid(activation_f) o = sigmoid(activation_o) c = f * c_pre + i * tanh(activation_c) c = mask[:, None] * c + (1 - mask)[:, None] * c_pre h = o * tanh(c) h = mask[:, None] * h + (1 - mask)[:, None] * h_pre return alpha, context, h, c
def apply(self, doc, query, mask_, batch_size): # batch_size x doc_length x hidden_dim mask = mask_.flatten() att1 = self.image_embed.apply(doc) # y_q_i: the ith token of question # batch_size x feature_dim # r_1: r_m_1 # batch_size x feature_dim # y_d: document # batch_size x doc_length x feature_dim # y_d_m: d-to-m # batch_size x doc_length x hidden_dim # batch_size x hidden_dim # batch_size x hidden_dim y_d = doc att3 = self.word_embed.apply(query) att = att1 + att3.dimshuffle(0, 'x', 1) # batch_size x doc_length x hidden_dim m = T.tanh(att) # batch_size x doc_length x 1 s = self.m_to_s.apply(m) # batch_size x doc_length s = s.reshape((s.shape[0], s.shape[1])) s = self.attention_dist.apply(s) y_d_s = y_d.swapaxes(1, 2) # return batch_size x feature_dim r = T.batched_dot(y_d_s, s) # batch_size x output_dim return r
def quadratic_saturating_loss(mx, Sx, target, Q, *args, **kwargs):
    '''
    Squashing loss penalty function
    c(x) = ( 1 - e^(-0.5*quadratic_loss(x, target)) )
    '''
    if Sx is None:
        if mx.ndim == 1:
            mx = mx[None, :]
        delta = mx - target[None, :]
        deltaQ = delta.dot(Q)
        cost = 1.0 - tt.exp(-0.5 * tt.batched_dot(deltaQ, delta))
        return cost
    else:
        # stochastic case (moment matching)
        delta = mx - target
        SxQ = Sx.dot(Q)
        EyeM = tt.eye(mx.shape[0])
        IpSxQ = EyeM + SxQ
        Ip2SxQ = EyeM + 2 * SxQ
        S1 = tt.dot(Q, matrix_inverse(IpSxQ))
        S2 = tt.dot(Q, matrix_inverse(Ip2SxQ))
        # S1 = solve(IpSxQ.T, Q.T).T
        # S2 = solve(Ip2SxQ.T, Q.T).T
        # mean
        m_cost = -tt.exp(-0.5 * delta.dot(S1).dot(delta)) / tt.sqrt(det(IpSxQ))
        # var
        s_cost = tt.exp(-delta.dot(S2).dot(delta)) / tt.sqrt(det(Ip2SxQ)) - m_cost**2
        return 1.0 + m_cost, s_cost
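# --- Added worked example (deterministic branch only, Sx=None; values are
# --- hypothetical). tt.batched_dot on two matrices is a row-wise dot product,
# --- so the cost saturates towards 1 away from the target and is 0 at it.
import numpy as np

mx = np.array([[0.0, 0.0], [1.0, 1.0]])
target = np.array([1.0, 1.0])
Q = np.eye(2)

delta = mx - target[None, :]
quad = np.einsum('ij,ij->i', delta.dot(Q), delta)  # row-wise delta^T Q delta
print(1.0 - np.exp(-0.5 * quad))                   # ~[0.632, 0.0]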
def forward(self):
    z = self.z0  # sxd
    u = self.u_  # d
    w = self.w_  # d
    b = self.b   # .
    h = self.h   # f
    # h(sxd \dot d + .) = s
    if not self.batched:
        hwz = h(z.dot(w) + b)  # s
        # sxd + (s \outer d) = sxd
        z1 = z + tt.outer(hwz, u)  # sxd
        return z1
    else:
        z = z.swapaxes(0, 1)
        # z bxsxd
        # u bxd
        # w bxd
        b = b.dimshuffle(0, "x")  # b bx-
        hwz = h(tt.batched_dot(z, w) + b)  # bxs
        # bxsxd + (bxsx- * bx-xd) = bxsxd
        hwz = hwz.dimshuffle(0, 1, "x")  # bxsx-
        u = u.dimshuffle(0, "x", 1)  # bx-xd
        z1 = z + hwz * u  # bxsxd
        return z1.swapaxes(0, 1)  # sxbxd
def make_layer(self, n_params, T_u, T_story, T_mask, rng):
    """
    Inputs:
        network params (n_params)
        question vector (T_u)
        story tensor (T_story)
    Outputs:
        output vector (T_o)
    """
    # ------ Encode encoder story data
    T_w2v_out = self.T_w2v[T_story] * T_mask[T_story]
    T_m = T.sum(T_w2v_out, axis=2)
    T_m_norm = T.sqrt(T.sum(T_m**2, axis=2))
    T_m = T_m / (T_m_norm.dimshuffle(0, 1, 'x') + 1e-6)
    T_m = T.dot(T_m, n_params['T_B'])

    # ------ Encode decoder story data
    T_w2v_out = self.T_w2v[T_story] * T_mask[T_story]
    T_c = T.sum(T_w2v_out, axis=2)
    T_c_norm = T.sqrt(T.sum(T_c**2, axis=2))
    T_c = T_c / (T_c_norm.dimshuffle(0, 1, 'x') + 1e-6)
    T_c = T.dot(T_c, n_params['T_B'])

    # ------ Sentence picker: tensor3-matrix product
    T_p = T.nnet.softmax(T.batched_dot(T_m, T_u))

    # ------ Sum over story decoder
    T_p_2 = T_p.dimshuffle(0, 1, 'x')
    T_o = T.sum(T_p_2 * T_c, axis=1)

    # Collect
    return T_o, T_p
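# --- Added shape sketch (hypothetical sizes: 2 stories, 4 sentences, embedding 8).
# --- T.batched_dot of a (batch, sent, emb) tensor with a (batch, emb) matrix gives
# --- per-sentence scores, which the softmax turns into the attention T_p above.
import numpy as np

def softmax(a):
    e = np.exp(a - a.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

T_m = np.random.randn(2, 4, 8)
T_c = np.random.randn(2, 4, 8)
T_u = np.random.randn(2, 8)
T_p = softmax(np.einsum('bse,be->bs', T_m, T_u))  # (2, 4), attention over sentences
T_o = (T_p[:, :, None] * T_c).sum(axis=1)         # (2, 8), attended output
print(T_p.shape, T_o.shape)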
def sample_XY(self, X0data=None, Nsamps=1, Tbins=30, Xdata=None, withInflow=False):
    """
    TODO: Write docstring
    """
    if Xdata is None:
        Xdata = self.lat_ev_model.sample_X(X0data=X0data, Nsamps=Nsamps,
                                           Tbins=Tbins, withInflow=withInflow)
    else:
        Nsamps = Xdata.shape[0]
        Tbins = Xdata.shape[1]

    SigmaChol = T.tile(self.SigmaChol, (Nsamps * Tbins, 1, 1))
    SigmaCholN = T.batched_dot(np.random.randn(Nsamps * Tbins, self.yDim), SigmaChol)
    Musymb = theano.clone(self.MuY, replace={self.X: Xdata})
    Musymb = T.reshape(Musymb, (Nsamps * Tbins, self.yDim))
    Ysymb = SigmaCholN + Musymb
    Ysymb = T.reshape(Ysymb, (Nsamps, Tbins, self.yDim))
    Ydata = Ysymb.eval()

    return Ydata, Xdata
def call(self, input_tensors, mask=None):
    '''
    wbw attention layer:
    :param ctxt (input_tensors[0]) : batch_size x T x ctxt_dim
    :param resp (input_tensors[1]) : batch_size x resp_dim
    '''
    ctxt = input_tensors[0]
    resp = input_tensors[1]
    ctxt_mask = mask[0]

    resp_w = T.dot(resp, self.resp_dense)  # bt_sz x dense_dim
    ctxt_w = T.dot(ctxt, self.ctxt_dense)  # bt_sz x T x dense_dim
    resp_w_rep = resp_w[:, None, :]  # bt_sz x T x dense_dim
    pre_alpha = T.tanh(ctxt_w + resp_w_rep)  # bt_sz x T x dense_dim
    unnorm_alpha = T.dot(pre_alpha, self.alpha_dense).flatten(2)  # bt_sz x T
    if ctxt_mask:
        unnorm_alpha_masked = unnorm_alpha - 1000 * (1. - ctxt_mask)
    else:
        unnorm_alpha_masked = unnorm_alpha
    alpha = T.nnet.softmax(unnorm_alpha_masked)  # bt_sz x T
    attended_ctxt = T.batched_dot(alpha.dimshuffle((0, 'x', 1)), ctxt)[:, 0, :]  # bt_sz x ctxt_dim

    if self.return_att:
        return [attended_ctxt, alpha]
    else:
        return attended_ctxt
def get_summary(self, yy):
    out = {}
    out['xsm'] = numpy.asarray(self.postX.eval({self.Input: yy}), dtype=theano.config.floatX)
    V = T.batched_dot(self.LambdaChol, self.LambdaChol.dimshuffle(0, 2, 1))
    out['Vsm'] = numpy.asarray(V.eval({self.Input: yy}), dtype=theano.config.floatX)
    out['VVsm'] = np.zeros([yy.shape[0]-1, self.xDim, self.xDim]).astype(theano.config.floatX)
    return out
def get_output_for(self, inputs, **kwargs):
    q = self.q
    for i in range(self.hops):
        if self.fixed_query and not i:
            u = T.dot(inputs[0], q)
        else:
            u = T.batched_dot(inputs[0], q)

        # set masked positions to large negative value
        if len(inputs) > 1:
            u = u*inputs[1] - (1-inputs[1])*10000

        # now batch_size x post_length x 1 but need to normalize via softmax
        # normalize over post_length (->large negative values = 0)
        u = T.reshape(u, (inputs[0].shape[0], inputs[0].shape[1]))
        alpha = T.nnet.softmax(u)  # now B x S

        o = T.dot(T.sum(inputs[0] * alpha[:, :, None], axis=1), self.W_r)
        if self.fixed_query:
            q = q + o
        else:
            q = q + o

    return q
def _step(m_, x_, h_, U):
    # preact = tensor.dot(h_, U)
    # h_: n_p * n_samples * n_h
    # U: n_p * n_h * (k/h n_h)
    preact = tensor.batched_dot(h_, U[:, :, :2 * n_h])
    preact += x_[:, :, :2 * n_h]

    z = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
    r = tensor.nnet.sigmoid(_slice(preact, 1, n_h))

    m = tensor.tanh(x_[:, :, 2 * n_h:] + tensor.batched_dot(h_ * r, U[:, :, 2 * n_h:]))

    h = (1. - z) * h_ + z * m
    h = m_[:, :, None] * h + (1. - m_)[:, :, None] * h_

    return h
def forward(self):
    z = self.z0  # sxd
    H = self.H   # dxd
    if self.batched:
        return tt.batched_dot(z.swapaxes(0, 1), H).swapaxes(0, 1)
    else:
        return z.dot(H)
def get_output_for(self, inputs, **kwargs):
    # inputs[0]: B x N x D
    # inputs[1]: B x Q x D
    # self.mask: B x Q

    q_shuf = inputs[1].dimshuffle(0, 2, 1)  # B x D x Q
    return T.batched_dot(inputs[0], q_shuf)  # B x N x Q
def fwd_old(self, x, V, A, L):
    """
    x : signal
    V : eigenvectors
    A : area
    L : eigenvalues
    """
    V = V[:, :self.K]
    L = L[:self.K]

    # ghat is already a linear combination. it is faster than doing a
    # translation and modulation each time; of course, everything is linear
    # and thus it can be done
    ghat = self.sample_ghat(self.taus, L)

    rho = T.sqrt(T.sum(A))
    trasl = rho * T.dot(V, ghat.dimshuffle(0, 'x') * V.T)
    trasl = A.dimshuffle(0, 'x') * trasl

    # size Q x K x N, intermediate N x Q x K
    tmp = (V.dimshuffle(0, 'x', 1) * x.dimshuffle(0, 1, 'x')).dimshuffle(1, 2, 0)
    trasl = T.tile(trasl.dimshuffle('x', 0, 1), [self.nin, 1, 1])

    # size Q x K x N
    desc = rho * T.batched_dot(tmp, trasl)
    desc = T.abs_(desc)
    desc = desc.dimshuffle(2, 0, 'x', 1)  # BC01 format : N x Q x 1 x K

    return self.activation(theano.tensor.nnet.conv.conv2d(desc, self.W).flatten(2) + self.b)
def output_func(self, input):
    # P(Y|X) = softmax(W.X + b)
    q, a = input[0], input[1]
    # dot = T.batched_dot(q, T.batched_dot(a, self.W))
    out = T.batched_dot(q, T.dot(a, self.W.T)).dimshuffle(0, 'x')
    return out
def output_func(self, input):
    # P(Y|X) = softmax(W.X + b)
    q, a = input[0], input[1]
    # dot = T.batched_dot(q, T.batched_dot(a, self.W))
    dot = T.batched_dot(q, T.dot(a, self.W.T))
    out = T.concatenate([dot.dimshuffle(0, 'x'), q, a], axis=1)
    return out
def factorization(self, batchSize, argsEmbA, argsEmbB, wC):
    # first = T.tensordot(relationProbs, self.C, axes=[[1], [2]])  # [l,r] * [k,k,r] = [l, k, k]
    Afirst = T.batched_tensordot(wC, argsEmbA, axes=[[1], [1]])  # [l, k, k] * [l, k] = [l, k]
    Asecond = T.batched_dot(Afirst, argsEmbB)  # [l, k] * [l, k] = [l]
    # entropy = T.sum(T.log(relationProbs) * relationProbs, axis=1)  # [l,r] * [l,r] = [l]
    return Asecond
def L_op(self, inputs, outputs, output_grads):
    # Gradients computed by Op
    assert self.compute_grad and len(outputs) == 2
    gradients = outputs[1]
    assert gradients is not None

    # Gradients of original function, to compose chain rule
    grad_op = output_grads[0]
    grad_shuffle = GpuDimShuffle(
        input_broadcastable=(False, False, False),
        new_order=(1, 0, 2),
    )(gradients)
    grad_bdot = tt.batched_dot(grad_op, grad_shuffle)
    grad_shuffle_reverse = GpuDimShuffle(
        input_broadcastable=(False, False, False),
        new_order=(1, 0, 2),
    )(grad_bdot)
    return [
        grad_shuffle_reverse,
        grad_undefined(self, 1, inputs[1]),
        grad_undefined(self, 2, inputs[2]),
    ]
def getDM_score_joint(kb_entities, kb_relations, neg_samples_kb, relations, opts): neg_samples = opts.neg_samples vect_dim = opts.vect_dim num_entities = opts.num_entities num_relations = opts.num_relations l2_reg_entities = opts.l2_entity # +1 for the OOV embedding. entities = Embedding(output_dim=vect_dim, input_dim=num_entities+1, init='normal',name = 'entity_embeddings_DM', W_regularizer=l2(l2_reg_entities)) entity_vectors = entities(kb_entities) entity_negative_vectors = entities(neg_samples_kb) relation_vectors = Flatten()(relations(kb_relations)) get_cross_1 = get_cross(0, neg_samples) e1_cross_e2_prime = merge([entity_vectors, entity_negative_vectors], mode = get_cross_1, output_shape = (neg_samples, vect_dim)) e1_cross_e2 = Lambda(cross_e1_e2, output_shape = (vect_dim,))(entity_vectors) score_DM = merge([relation_vectors, e1_cross_e2], mode = lambda X : T.batched_dot(X[0], X[1]), output_shape=()) score_DM_e2_corrupted = merge([relation_vectors, e1_cross_e2_prime], mode = 'dot', output_shape=(neg_samples,), dot_axes=(1,2)) if opts.add_loss: get_cross_2 = get_cross(1, neg_samples) e1_prime_cross_e2 = merge([entity_vectors, entity_negative_vectors], mode = get_cross_2, output_shape = (neg_samples, vect_dim)) score_DM_e1_corrupted = merge([relation_vectors, e1_prime_cross_e2], mode = 'dot', output_shape=(neg_samples,), dot_axes=(1,2)) else: score_DM_e1_corrupted = None return score_DM, score_DM_e1_corrupted, score_DM_e2_corrupted
def apply(self, idx, inp):
    '''
    :param idx: vector of indices, one per sample
    :param inp: matrix (nb_samples, dims)
    :return:
    '''
    return T.batched_dot(inp, self.W[idx-self.idxoffset])
def __call__(self, inputs, mask, h, encoder_outputs): """ decoder using gru layer :param inputs: input word indices, (batch_size, 1) :param mask: mask for inputs, (batch_size, 1) :param h: final state, (batch_size, hidden_size) :param encoder_outputs: output of encoder, (batch_size, max_length, hidden_size) :return: """ embedded = self.embedding[inputs.flatten()].reshape( (-1, self.hidden_size)) # batch*hidden_size attn_weights = T.nnet.softmax( self.linear_func( T.concatenate([embedded, h], 1), self.attn_W, self.attn_b)) # batch*(hidden_size*2)-> batch * max_length attn_weights = attn_weights.reshape((-1, 1, self.max_length)) attn_applied = T.batched_dot( attn_weights, encoder_outputs ) # batch*1*max_length * batch*max_length*hidden_size -> batch*1*hidden_size output = T.concatenate([embedded, attn_applied[:, 0, :]], 1) # b*(hidden_size*2) output = self.linear_func(output, self.attn_combine_W, self.attn_combine_b) # b*hidden_size output = output.reshape((-1, 1, self.hidden_size)) for i in xrange(self.num_layers): output = ReLU(output) output, h = self.gru_layer(output, mask, h) output = T.tensordot(output, self.linear, axes=[2, 0]) return output, h, attn_weights # b*1*vocab_size(unscaled), b*hidden_size, b*max_length
def get_output_for(self, inputs, **kwargs):
    M = inputs[0]
    u = inputs[1]
    output = T.batched_dot(M, u)
    if self.nonlinearity is not None:
        output = self.nonlinearity(output)
    return output
def mem_focus(memory, key, strength):
    """
    mem_focus(memory, key, strength) -> weighting (batchsize x M)
    produces a weighting over memory positions based on a key

    @param memory: a batchsize x N x M 3-tensor
    @param key: a batchsize x 1 x N 3-tensor. mem_focus() is expected to
        output a weighting for each batch element
    @param strength: a batchsize x 1 matrix, sharpens the weighting
    """
    # dot -> batchsize x 1 x M
    dot = T.batched_dot(key, memory)

    # memory_magnitude -> batchsize x M
    memory_magnitude = T.sqrt(T.sum(memory ** 2, axis=1))

    # key_magnitude -> batchsize x 1
    key_magnitude = T.addbroadcast(T.sqrt(T.sum(key ** 2, axis=2)), 1)

    # multiplied_magnitude -> batchsize x 1 x M
    multiplied_magnitude = (memory_magnitude * key_magnitude).dimshuffle([0, 'x', 1])

    # cosine_similarity -> batchsize x 1 x M
    cosine_similarity = dot / (multiplied_magnitude + SMALL_CONSTANT)

    # strengthened_cosine_similarity -> batchsize x 1 x M
    strengthened_cosine_similarity = cosine_similarity * strength.dimshuffle([0, 1, 'x'])

    # weighting -> batchsize x M
    weighting = T.nnet.softmax(T.flatten(strengthened_cosine_similarity, outdim=2))

    return weighting
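# --- Added shape sketch (hypothetical batchsize=2, N=8, M=5): the content
# --- addressing above is a batched cosine similarity between the key and each
# --- memory column, sharpened by the strength and normalised with a softmax.
import numpy as np

def softmax(a):
    e = np.exp(a - a.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

memory = np.random.randn(2, 8, 5)             # batchsize x N x M
key = np.random.randn(2, 1, 8)                # batchsize x 1 x N
strength = np.ones((2, 1))

dot = np.einsum('bin,bnm->bim', key, memory)  # batchsize x 1 x M
mem_mag = np.sqrt((memory ** 2).sum(axis=1))  # batchsize x M
key_mag = np.sqrt((key ** 2).sum(axis=2))     # batchsize x 1
cos = dot / (mem_mag[:, None, :] * key_mag[:, :, None] + 1e-6)
weighting = softmax((cos * strength[:, :, None]).reshape(2, -1))
print(weighting.shape, weighting.sum(axis=1))  # (2, 5), rows sum to 1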
def __init__(self, x, y, l, window, opt, lr, init_emb, dim_emb, dim_hidden, n_vocab, L2_reg, unit, sim='cos', n_layers=1, activation=tanh): self.tr_inputs = [x, y, l] self.pr_inputs = [x, y, l] self.x = x # 1D: batch_size * l * 2, 2D: window; elem=word_id self.y = y # 1D: batch_size; elem=label self.l = l # scalar: elem=sentence length batch_size = y.shape[0] n_cands = x.shape[0] / batch_size / l self.pad = build_shared_zeros((1, dim_emb)) if init_emb is None: self.emb = theano.shared(sample_weights(n_vocab - 1, dim_emb)) else: self.emb = theano.shared(init_emb) self.E = T.concatenate([self.pad, self.emb], 0) self.W_out = theano.shared(sample_weights(dim_hidden, dim_hidden)) self.params = [self.emb, self.W_out] """ Input Layer """ e = self.E[x] # e: 1D: batch_size * l * 2, 2D: window, 3D: dim_emb x_in = e.reshape((batch_size * n_cands, l, -1)) """ Intermediate Layer """ # h: 1D: n_batch * n_cands, 2D: dim_emb h, params = cnn.layers(x_in, window, dim_emb, dim_hidden, n_layers, activation) self.params.extend(params) """ Output Layer """ h = h.reshape((batch_size, n_cands, -1)) h_1 = h[T.arange(batch_size), 0] h_2 = h[T.arange(batch_size), 1:] if sim == 'cos': y_score = cosign_similarity(h_1, h_2) else: y_score = T.batched_dot(T.dot(h_1, self.W_out), h_2.dimshuffle(0, 2, 1)) y_score_hat = T.max(y_score, 1) """ Objective Function """ self.nll = max_margin_loss(y_score_hat, y_score[T.arange(batch_size), y]) self.L2_sqr = regularization(self.params) self.cost = self.nll + L2_reg * self.L2_sqr / 2. """ Optimization """ if opt == 'adagrad': self.update = ada_grad(cost=self.cost, params=self.params, lr=lr) elif opt == 'ada_delta': self.update = ada_delta(cost=self.cost, params=self.params) elif opt == 'adam': self.update = adam(cost=self.cost, params=self.params, lr=lr) else: self.update = sgd(cost=self.cost, params=self.params, lr=lr) """ Predicts """ y_hat = T.argmax(y_score, 1) """ Check Accuracies """ self.correct = T.eq(y_hat, y)
def get_output_for(self, inputs, **kwargs): #input[0]:(BS,max_senlen,emb_size),input[1]:(BS,1,emb_size),input[2]:(BS,max_sentlen) # activation0=(T.dot(inputs[0],self.W_h)).reshape([self.batch_size,self.max_sentlen])+self.b_h.repeat(self.batch_size,0).repeat(self.max_sentlen,1) # activation1=T.dot(inputs[1],self.W_q).reshape([self.batch_size]).dimshuffle(0,'x') # activation2=T.batched_dot(T.dot(inputs[0],self.W_o),inputs[1].reshape([self.batch_size,self.embedding_size,1])).reshape([self.batch_size,self.max_sentlen]) activation2=T.batched_dot(inputs[0],inputs[1].reshape([self.batch_size,self.embedding_size,1])).reshape([self.batch_size,self.max_sentlen]) norm2=T.sqrt(T.sum(T.mul(inputs[0],inputs[0]),axis=2))+0.0000001 activation2=activation2/norm2 # activation=(self.nonlinearity(activation0)+self.nonlinearity(activation1)+activation2).reshape([self.batch_size,self.max_sentlen])#.dimshuffle(0,'x',2)#.repeat(self.max_sentlen,axis=1) activation2=(activation2).reshape([self.batch_size,self.max_sentlen])#.dimshuffle(0,'x',2)#.repeat(self.max_sentlen,axis=1) # final=T.dot(activation,self.W_o) #(BS,max_sentlen) activation3=T.batched_dot(inputs[0],inputs[1].reshape([self.batch_size,self.embedding_size,1])).reshape([self.batch_size,self.max_sentlen]) # if inputs[2] is not None: # final=inputs[2]*final-(1-inputs[2])*1000000 alpha=lasagne.nonlinearities.softmax(activation2) #(BS,max_sentlen) return alpha
def tf_update_state_batch(self, t_state_mat, t_obs_mat, t_act_mat): t_ofeat_mat = self._f_obs(t_obs_mat) t_afeat_mat = self._f_act(t_act_mat) K = self._feat_dim N = t_state_mat.shape[0] # Obtain extended state UU_efa = self._t_UU_efa C_ex = T.reshape(T.dot(t_state_mat, self._t_W_s2ex),(N, K.exfut_obs, K.exfut_act)) C_ex.name='tf_update_state::C_ex' # Condition on action B = T.reshape(T.dot(t_afeat_mat, UU_efa.T), (N, K.fut_act, K.exfut_act)).transpose(0,2,1) B.name = 'tf_update_state::B' #import pdb; pdb.set_trace() C_efo_fa = T.batched_dot(C_ex, B) C_efo_fa.name='tf_update_state::C_efo_fa' # Obtain v = C_oo\o_feat C_oo_prj = T.batched_dot(T.reshape(T.dot(t_state_mat,self._t_W_s2oo), (N, K.oo, K.act)), t_afeat_mat) C_oo_prj.name = 'tf_update_state::Cooprj' C_oo = T.reshape(T.dot(C_oo_prj, self._t_U_oo.T), (N, K.obs, K.obs)) C_oo.name='tf_update_state::C_oo' v = self._solve_batch(C_oo, t_ofeat_mat, self._lambda['filter']) v.name = 'tf_update_state::v' # Multply by v to condition on observation UU = self._t_UU_efo vproj = T.dot(v, UU) vproj.name ='tf_update_state::vproj' A = T.reshape(vproj,(N, K.exfut_obs, K.fut_obs)).transpose(0,2,1) A.name = 'tf_update_state::A' ss = T.batched_dot(A, C_efo_fa).reshape([N,-1]) ss.name = 'tf_update_state::ss_Cefodot' ss = T.dot(ss, self._t_UT_st.T) ss.name = 'tf_update_state::Uss_dot' ss = self._norm_method(ss) ss = self._smooth(ss, t_state_mat) self._dbg_batch = lambda : None self._dbg_batch.out = C_ex, C_oo, B, A, ss # Adding the sum of parameters fixes a Theano bug. return ss + sum(T.sum(p)*1e-30 for p in self.params)
def batched_batched_dot(s):
    """ from (x,y,z)-shaped pair, produce (x,y)-shaped pair that replaces
    the z-vector pairs by their dot-products """
    import theano
    import theano.tensor as T
    return theano.scan(fn=lambda xm, ym: T.batched_dot(xm, ym),
                       outputs_info=None, sequences=s, non_sequences=None)[0]
def _normalize_attention((att, mat)):
    if transpose:
        att = att.dimshuffle((0, 2, 1))
    # 3d softmax
    e = K.exp(att - K.max(att, axis=-1, keepdims=True))
    s = K.sum(e, axis=-1, keepdims=True)
    sm_att = e / s
    return T.batched_dot(sm_att, mat)
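# --- Added sketch (not from the original source, hypothetical shapes): the
# --- helper above is a softmax over the last axis followed by a batched matrix
# --- product that applies the attention weights to mat.
import numpy as np

att = np.random.randn(2, 4, 6)  # batch x queries x keys
mat = np.random.randn(2, 6, 8)  # batch x keys x dim

e = np.exp(att - att.max(axis=-1, keepdims=True))
sm_att = e / e.sum(axis=-1, keepdims=True)    # attention rows sum to 1
out = np.einsum('bqk,bkd->bqd', sm_att, mat)  # what T.batched_dot computes
print(out.shape)                              # (2, 4, 8)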