def _predict_k_star(self, k_star, x_star):
    """
    Predict one test sample using algorithm (3.4) from GPML and
    assuming shared covariance matrix among all latent functions.

    Parameters
    ----------
    k_star : array, shape (n,)
        Kernel evaluations k(x_star, X_train) between the test point
        and the training inputs.
    x_star : array
        The test input itself (used only to compute k(x_star, x_star)).

    Returns
    -------
    array, shape (C,)
        Mean class probabilities, estimated by Monte-Carlo sampling of
        the latent posterior (``self.n_samples`` draws).
    """
    # shortcuts
    C = self._n_outputs

    # posterior mean of the latent values: mu_c = (y - pi)_c^T k_star
    mu = (self._y - self.pi_).T.dot(k_star)

    # posterior covariance between the C latent functions
    k_star_star = self._kernel(x_star, x_star)
    # hoisted out of the loop: the denominator does not depend on c_;
    # the scalar 1e-8 broadcasts, no need for np.ones_like
    denom = np.maximum(sum(self._e), 1e-8)
    Sigma = []
    for c_ in range(C):
        b = self._e[c_] * k_star
        # diagonal approximation of the solve; exact triangular solves
        # kept for reference:
        # _t = solve_triangular(self._M, b)
        # _t2 = solve_triangular(self._M, _t, trans='T')
        _t2 = b / denom
        c = self._e[c_] * _t2
        sigma_row = [c.dot(k_star)] * C
        sigma_row[c_] += (k_star_star - b.dot(k_star))
        Sigma.append(sigma_row)
    Sigma = np.asarray(Sigma)

    # Monte-Carlo estimate of the predictive class probabilities
    f_star = self._rng.multivariate_normal(mu, Sigma, size=self.n_samples)
    pi_star = softmax(f_star)
    return np.mean(pi_star, axis=0)
def rnn_cell_forward(Xt, h_prev, parameters):
    """Run a single forward step of a vanilla RNN cell.

    Parameters
    ----------
    Xt : ndarray, shape (N, D)
        Input batch at timestep "t".
    h_prev : ndarray, shape (N, H)
        Hidden state carried over from timestep "t-1".
    parameters : dict
        "Wx" (D, H) input weights, "Wh" (H, H) recurrent weights,
        "Wy" (H, M) output weights, "bh" (1, H) and "by" (1, M) biases.

    Returns
    -------
    h_next : ndarray, shape (N, H)
        Updated hidden state.
    yt_pred : ndarray, shape (N, M)
        Softmax prediction at timestep "t".
    cache : tuple
        (h_next, h_prev, Xt, parameters), kept for back-propagation.
    """
    p = parameters

    # h_t = tanh(x_t . Wx + h_{t-1} . Wh + bh)
    pre_activation = np.dot(Xt, p["Wx"]) + np.dot(h_prev, p["Wh"]) + p["bh"]
    h_next = tanh(pre_activation)

    # y_t = softmax(h_t . Wy + by)
    yt_pred = softmax(np.dot(h_next, p["Wy"]) + p["by"])

    return h_next, yt_pred, (h_next, h_prev, Xt, parameters)
def __init__(self):
    # Core model state: ordered layer list, per-epoch metrics, and the
    # cost function (selected later, at compile time).
    self.layers = []
    self.history = {"loss": []}
    self.cost = None

    # Name -> instantiated activation function ("linear" maps onto the
    # identity activation).
    self.activation_funcs = {
        name: getattr(activation, attr)()
        for name, attr in (
            ("relu", "relu"),
            ("softmax", "softmax"),
            ("sigmoid", "sigmoid"),
            ("linear", "identity"),
            ("tanh", "tanh"),
            ("swish", "swish"),
            ("lrelu", "lrelu"),
        )
    }

    # Name -> instantiated cost function.
    self.cost_funcs = {
        "squared loss": error.SquaredError(),
        "cross entropy": error.CrossEntropy(),
    }

    # Name -> layer class (instantiated on demand).
    self.layer_types = {
        "dense": layers.Dense,
    }
def build_decoder(self, query_tokens, query_token_embed, query_token_embed_mask, mask):
    """Build the two decoding-time Theano functions for the masked-vocab
    model variant.

    Compiles ``self.decoder_func_init`` (one-off encoding of the query)
    and ``self.decoder_func_next_step`` (a single decoder timestep that
    returns the next state/cell plus rule, gen-action, vocabulary and
    copy probability distributions).  ``mask`` zeroes out probabilities
    of masked vocabulary entries.
    """
    logging.info('building decoder ...')
    # mask = ndim_itensor(2, 'mask')

    # (batch_size, decoder_state_dim)
    decoder_prev_state = ndim_tensor(2, name='decoder_prev_state')
    # (batch_size, decoder_state_dim)
    decoder_prev_cell = ndim_tensor(2, name='decoder_prev_cell')
    # (batch_size, n_timestep, decoder_state_dim)
    hist_h = ndim_tensor(3, name='hist_h')
    # (batch_size, decoder_state_dim)
    prev_action_embed = ndim_tensor(2, name='prev_action_embed')
    # (batch_size)
    node_id = T.ivector(name='node_id')
    # (batch_size, node_embed_dim)
    node_embed = self.node_embedding[node_id]
    # (batch_size)
    par_rule_id = T.ivector(name='par_rule_id')
    # (batch_size, decoder_state_dim)
    # a negative parent rule id means "no parent rule": feed zeros instead
    par_rule_embed = T.switch(par_rule_id[:, None] < 0,
                              T.alloc(0., 1, config.rule_embed_dim),
                              self.rule_embedding_W[par_rule_id])
    # ([time_step])
    time_steps = T.ivector(name='time_steps')
    # (batch_size)
    parent_t = T.ivector(name='parent_t')
    # (batch_size, 1)
    parent_t_reshaped = T.shape_padright(parent_t)
    # mask = ndim_itensor(2, 'mask')

    query_embed = self.query_encoder_lstm(query_token_embed,
                                          mask=query_token_embed_mask,
                                          dropout=config.dropout,
                                          train=False)

    # (batch_size, 1, decoder_state_dim)
    prev_action_embed_reshaped = prev_action_embed.dimshuffle((0, 'x', 1))
    # (batch_size, 1, node_embed_dim)
    node_embed_reshaped = node_embed.dimshuffle((0, 'x', 1))
    # (batch_size, 1, node_embed_dim)
    par_rule_embed_reshaped = par_rule_embed.dimshuffle((0, 'x', 1))

    # optionally disable the frontier-node / parent-action input feeds
    if not config.frontier_node_type_feed:
        node_embed_reshaped *= 0.
    if not config.parent_action_feed:
        par_rule_embed_reshaped *= 0.

    decoder_input = T.concatenate([
        prev_action_embed_reshaped, node_embed_reshaped,
        par_rule_embed_reshaped
    ], axis=-1)

    # (batch_size, 1, decoder_state_dim)
    # (batch_size, 1, decoder_state_dim)
    # (batch_size, 1, field_token_encode_dim)
    decoder_next_state_dim3, decoder_next_cell_dim3, ctx_vectors = self.decoder_lstm(
        decoder_input,
        init_state=decoder_prev_state,
        init_cell=decoder_prev_cell,
        hist_h=hist_h,
        context=query_embed,
        context_mask=query_token_embed_mask,
        parent_t_seq=parent_t_reshaped,
        dropout=config.dropout,
        train=False,
        time_steps=time_steps)

    decoder_next_state = decoder_next_state_dim3.flatten(2)
    # decoder_output = decoder_next_state * (1 - DECODER_DROPOUT)

    decoder_next_cell = decoder_next_cell_dim3.flatten(2)

    decoder_next_state_trans_rule = self.decoder_hidden_state_W_rule(
        decoder_next_state)
    decoder_next_state_trans_token = self.decoder_hidden_state_W_token(
        T.concatenate([decoder_next_state,
                       ctx_vectors.flatten(2)], axis=-1))

    rule_prob = softmax(
        T.dot(decoder_next_state_trans_rule,
              T.transpose(self.rule_embedding_W)) + self.rule_embedding_b)

    gen_action_prob = self.terminal_gen_softmax(decoder_next_state)
    # vocab_prob = softmax(T.dot(decoder_next_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)
    logits = T.dot(decoder_next_state_trans_token, T.transpose(
        self.vocab_embedding_W)) + self.vocab_embedding_b
    # vocab_predict = softmax(T.dot(decoder_hidden_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)
    # masked-out vocabulary entries are pushed below the row minimum so
    # the softmax assigns them (near-)zero probability
    test = T.dot((T.min(logits, axis=1, keepdims=True) - 1),
                 (1 - mask).reshape((1, mask.shape[1])))
    vocab_prob = softmax(logits * mask + test)
    # vocab_prob = softmax(
    #     logits.transpose(1, 0, 2) * mask + (T.min(logits.transpose(1, 0, 2), axis=1, keepdims=True) - 1) * (
    #         1 - mask)).transpose(1, 0, 2)

    ptr_net_decoder_state = T.concatenate(
        [decoder_next_state_dim3, ctx_vectors], axis=-1)

    copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask,
                                 ptr_net_decoder_state)

    copy_prob = copy_prob.flatten(2)

    inputs = [query_tokens]
    outputs = [query_embed, query_token_embed_mask]

    self.decoder_func_init = theano.function(inputs, outputs)

    inputs = [
        time_steps, decoder_prev_state, decoder_prev_cell, hist_h,
        prev_action_embed, node_id, par_rule_id, parent_t, query_embed,
        query_token_embed_mask, mask
    ]

    outputs = [
        decoder_next_state, decoder_next_cell, rule_prob, gen_action_prob,
        vocab_prob, copy_prob
    ]

    self.decoder_func_next_step = theano.function(inputs, outputs)
def build(self):
    """Assemble the training graph for the masked-vocab model variant,
    compile ``self.train_func``, and delegate to ``build_decoder`` for
    the decoding-time functions.

    The training loss is the negative masked log-likelihood of the
    target action sequences (rule applications, vocabulary generations
    and copies).
    """
    # (batch_size, max_example_action_num, action_type)
    tgt_action_seq = ndim_itensor(3, 'tgt_action_seq')
    # (batch_size, max_example_action_num, action_type)
    tgt_action_seq_type = ndim_itensor(3, 'tgt_action_seq_type')
    # (batch_size, max_example_action_num)
    tgt_node_seq = ndim_itensor(2, 'tgt_node_seq')
    # (batch_size, max_example_action_num)
    tgt_par_rule_seq = ndim_itensor(2, 'tgt_par_rule_seq')
    # (batch_size, max_example_action_num)
    tgt_par_t_seq = ndim_itensor(2, 'tgt_par_t_seq')
    # (batch_size, max_example_action_num, symbol_embed_dim)
    # tgt_node_embed = self.node_embedding(tgt_node_seq, mask_zero=False)
    tgt_node_embed = self.node_embedding[tgt_node_seq]
    # (batch_size, max_query_length)
    query_tokens = ndim_itensor(2, 'query_tokens')
    # vocabulary mask, broadcastable over the batch dimension
    mask = T.TensorType(dtype='int32', name='mask', broadcastable=(True, False))()

    # (batch_size, max_query_length, query_token_embed_dim)
    # (batch_size, max_query_length)
    query_token_embed, query_token_embed_mask = self.query_embedding(
        query_tokens, mask_zero=True)

    # if WORD_DROPOUT > 0:
    #     logging.info('used word dropout for source, p = %f', WORD_DROPOUT)
    #     query_token_embed, query_token_embed_intact = WordDropout(WORD_DROPOUT, self.srng)(query_token_embed, False)

    batch_size = tgt_action_seq.shape[0]
    max_example_action_num = tgt_action_seq.shape[1]

    # previous action embeddings
    # (batch_size, max_example_action_num, action_embed_dim)
    tgt_action_seq_embed = T.switch(
        T.shape_padright(tgt_action_seq[:, :, 0] > 0),
        self.rule_embedding_W[tgt_action_seq[:, :, 0]],
        self.vocab_embedding_W[tgt_action_seq[:, :, 1]])

    tgt_action_seq_embed_tm1 = tensor_right_shift(tgt_action_seq_embed)

    # parent rule application embeddings
    tgt_par_rule_embed = T.switch(tgt_par_rule_seq[:, :, None] < 0,
                                  T.alloc(0., 1, config.rule_embed_dim),
                                  self.rule_embedding_W[tgt_par_rule_seq])

    # optionally disable the frontier-node / parent-action input feeds
    if not config.frontier_node_type_feed:
        tgt_node_embed *= 0.
    if not config.parent_action_feed:
        tgt_par_rule_embed *= 0.

    # (batch_size, max_example_action_num, action_embed_dim + symbol_embed_dim + action_embed_dim)
    decoder_input = T.concatenate(
        [tgt_action_seq_embed_tm1, tgt_node_embed, tgt_par_rule_embed],
        axis=-1)

    # (batch_size, max_query_length, query_embed_dim)
    query_embed = self.query_encoder_lstm(query_token_embed,
                                          mask=query_token_embed_mask,
                                          dropout=config.dropout,
                                          srng=self.srng)

    # (batch_size, max_example_action_num)
    tgt_action_seq_mask = T.any(tgt_action_seq_type, axis=-1)

    # decoder_hidden_states: (batch_size, max_example_action_num, lstm_hidden_state)
    # ctx_vectors: (batch_size, max_example_action_num, encoder_hidden_dim)
    decoder_hidden_states, _, ctx_vectors = self.decoder_lstm(
        decoder_input,
        context=query_embed,
        context_mask=query_token_embed_mask,
        mask=tgt_action_seq_mask,
        parent_t_seq=tgt_par_t_seq,
        dropout=config.dropout,
        srng=self.srng)

    # if DECODER_DROPOUT > 0:
    #     logging.info('used dropout for decoder output, p = %f', DECODER_DROPOUT)
    #     decoder_hidden_states = Dropout(DECODER_DROPOUT, self.srng)(decoder_hidden_states)

    # ====================================================
    # apply additional non-linearity transformation before
    # predicting actions
    # ====================================================
    decoder_hidden_state_trans_rule = self.decoder_hidden_state_W_rule(
        decoder_hidden_states)
    decoder_hidden_state_trans_token = self.decoder_hidden_state_W_token(
        T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1))

    # (batch_size, max_example_action_num, rule_num)
    rule_predict = softmax(
        T.dot(decoder_hidden_state_trans_rule,
              T.transpose(self.rule_embedding_W)) + self.rule_embedding_b)

    # (batch_size, max_example_action_num, 2)
    terminal_gen_action_prob = self.terminal_gen_softmax(
        decoder_hidden_states)

    # (batch_size, max_example_action_num, target_vocab_size)
    logits = T.dot(decoder_hidden_state_trans_token, T.transpose(
        self.vocab_embedding_W)) + self.vocab_embedding_b
    # vocab_predict = softmax(T.dot(decoder_hidden_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)
    # masked-out vocabulary entries are pushed below the per-row minimum
    # so the softmax assigns them (near-)zero probability
    vocab_predict = softmax(
        logits.transpose(1, 0, 2) * mask +
        (T.min(logits.transpose(1, 0, 2), axis=1, keepdims=True) - 1) *
        (1 - mask)).transpose(1, 0, 2)

    # (batch_size, max_example_action_num, lstm_hidden_state + encoder_hidden_dim)
    ptr_net_decoder_state = T.concatenate(
        [decoder_hidden_states, ctx_vectors], axis=-1)

    # (batch_size, max_example_action_num, max_query_length)
    copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask,
                                 ptr_net_decoder_state)

    # gather the probability assigned to each gold target id
    # (batch_size, max_example_action_num)
    rule_tgt_prob = rule_predict[
        T.shape_padright(T.arange(batch_size)),
        T.shape_padleft(T.arange(max_example_action_num)),
        tgt_action_seq[:, :, 0]]

    # (batch_size, max_example_action_num)
    vocab_tgt_prob = vocab_predict[
        T.shape_padright(T.arange(batch_size)),
        T.shape_padleft(T.arange(max_example_action_num)),
        tgt_action_seq[:, :, 1]]

    # (batch_size, max_example_action_num)
    copy_tgt_prob = copy_prob[
        T.shape_padright(T.arange(batch_size)),
        T.shape_padleft(T.arange(max_example_action_num)),
        tgt_action_seq[:, :, 2]]

    # mix the three action types according to the gold action type flags
    # (batch_size, max_example_action_num)
    tgt_prob = tgt_action_seq_type[:, :, 0] * rule_tgt_prob + \
        tgt_action_seq_type[:, :, 1] * terminal_gen_action_prob[:, :, 0] * vocab_tgt_prob + \
        tgt_action_seq_type[:, :, 2] * terminal_gen_action_prob[:, :, 1] * copy_tgt_prob

    # the 1e-7 epsilon keeps log() finite on padded (masked) positions
    likelihood = T.log(tgt_prob + 1.e-7 * (1 - tgt_action_seq_mask))
    loss = -(likelihood * tgt_action_seq_mask).sum(
        axis=-1)  # / tgt_action_seq_mask.sum(axis=-1)
    loss = T.mean(loss)

    # let's build the function!
    train_inputs = [
        query_tokens, tgt_action_seq, tgt_action_seq_type, tgt_node_seq,
        tgt_par_rule_seq, tgt_par_t_seq, mask
    ]
    optimizer = optimizers.get(config.optimizer)
    optimizer.clip_grad = config.clip_grad
    updates, grads = optimizer.get_updates(self.params, loss)
    self.train_func = theano.function(
        train_inputs,
        [loss],
        # [loss, tgt_action_seq_type, tgt_action_seq,
        #  rule_tgt_prob, vocab_tgt_prob, copy_tgt_prob,
        #  copy_prob, terminal_gen_action_prob],
        updates=updates)

    # if WORD_DROPOUT > 0:
    #     self.build_decoder(query_tokens, query_token_embed_intact, query_token_embed_mask)
    # else:
    #     self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask)

    self.build_decoder(query_tokens, query_token_embed,
                       query_token_embed_mask, mask)
def build_decoder(self, query_tokens, query_token_embed, query_token_embed_mask, query_tokens_phrase, query_tokens_pos, query_tokens_canon_id): logging.info('building decoder ...') # (batch_size, decoder_state_dim) decoder_prev_state = ndim_tensor(2, name='decoder_prev_state') # (batch_size, decoder_state_dim) decoder_prev_cell = ndim_tensor(2, name='decoder_prev_cell') # (batch_size, n_timestep, decoder_state_dim) hist_h = ndim_tensor(3, name='hist_h') # (batch_size, decoder_state_dim) prev_action_embed = ndim_tensor(2, name='prev_action_embed') # (batch_size) node_id = T.ivector(name='node_id') # (batch_size, node_embed_dim) node_embed = self.node_embedding[node_id] # (batch_size) par_rule_id = T.ivector(name='par_rule_id') # (batch_size, decoder_state_dim) par_rule_embed = T.switch(par_rule_id[:, None] < 0, T.alloc(0., 1, config.rule_embed_dim), self.rule_embedding_W[par_rule_id]) # ([time_step]) time_steps = T.ivector(name='time_steps') # (batch_size) parent_t = T.ivector(name='parent_t') # (batch_size, 1) parent_t_reshaped = T.shape_padright(parent_t) # concatenate query_token_embed with query_tokens_phrase and query_tokens_pos # (batch_size, max_query_length, query_embed_dim + 2) new_query_token_embed = self.concatenate(query_token_embed, query_tokens_phrase, query_tokens_pos, query_tokens_canon_id) query_embed = self.query_encoder_lstm(new_query_token_embed, mask=query_token_embed_mask, dropout=config.dropout, train=False) # (batch_size, 1, decoder_state_dim) prev_action_embed_reshaped = prev_action_embed.dimshuffle((0, 'x', 1)) # (batch_size, 1, node_embed_dim) node_embed_reshaped = node_embed.dimshuffle((0, 'x', 1)) # (batch_size, 1, node_embed_dim) par_rule_embed_reshaped = par_rule_embed.dimshuffle((0, 'x', 1)) if not config.frontier_node_type_feed: node_embed_reshaped *= 0. if not config.parent_action_feed: par_rule_embed_reshaped *= 0. 
decoder_input = T.concatenate([ prev_action_embed_reshaped, node_embed_reshaped, par_rule_embed_reshaped ], axis=-1) # (batch_size, 1, decoder_state_dim) # (batch_size, 1, decoder_state_dim) # (batch_size, 1, field_token_encode_dim) decoder_next_state_dim3, decoder_next_cell_dim3, ctx_vectors = self.decoder_lstm( decoder_input, init_state=decoder_prev_state, init_cell=decoder_prev_cell, hist_h=hist_h, context=query_embed, context_mask=query_token_embed_mask, parent_t_seq=parent_t_reshaped, dropout=config.dropout, train=False, time_steps=time_steps) decoder_next_state = decoder_next_state_dim3.flatten(2) # decoder_output = decoder_next_state * (1 - DECODER_DROPOUT) decoder_next_cell = decoder_next_cell_dim3.flatten(2) decoder_next_state_trans_rule = self.decoder_hidden_state_W_rule( decoder_next_state) decoder_next_state_trans_token = self.decoder_hidden_state_W_token( T.concatenate([decoder_next_state, ctx_vectors.flatten(2)], axis=-1)) rule_prob = softmax( T.dot(decoder_next_state_trans_rule, T.transpose(self.rule_embedding_W)) + self.rule_embedding_b) gen_action_prob = self.terminal_gen_softmax(decoder_next_state) vocab_prob = softmax( T.dot(decoder_next_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b) ptr_net_decoder_state = T.concatenate( [decoder_next_state_dim3, ctx_vectors], axis=-1) copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask, ptr_net_decoder_state) copy_prob = copy_prob.flatten(2) inputs = [ query_tokens, query_tokens_phrase, query_tokens_pos, query_tokens_canon_id ] outputs = [query_embed, query_token_embed_mask] self.decoder_func_init = theano.function(inputs, outputs, allow_input_downcast=True, on_unused_input='ignore') inputs = [ time_steps, decoder_prev_state, decoder_prev_cell, hist_h, prev_action_embed, node_id, par_rule_id, parent_t, query_embed, query_token_embed_mask ] outputs = [ decoder_next_state, decoder_next_cell, rule_prob, gen_action_prob, vocab_prob, copy_prob ] 
self.decoder_func_next_step = theano.function(inputs, outputs)
def build(self):
    """Assemble the training graph for the plain (unmasked-vocab) model
    variant, compile ``self.train_func``, and delegate to
    ``build_decoder`` for the decoding-time functions.

    The training loss is the negative masked log-likelihood of the
    target action sequences (rule applications, vocabulary generations
    and copies).
    """
    # (batch_size, max_example_action_num, action_type)
    tgt_action_seq = ndim_itensor(3, 'tgt_action_seq')
    # (batch_size, max_example_action_num, action_type)
    tgt_action_seq_type = ndim_itensor(3, 'tgt_action_seq_type')
    # (batch_size, max_example_action_num)
    tgt_node_seq = ndim_itensor(2, 'tgt_node_seq')
    # (batch_size, max_example_action_num)
    tgt_par_rule_seq = ndim_itensor(2, 'tgt_par_rule_seq')
    # (batch_size, max_example_action_num)
    tgt_par_t_seq = ndim_itensor(2, 'tgt_par_t_seq')
    # (batch_size, max_example_action_num, symbol_embed_dim)
    # tgt_node_embed = self.node_embedding(tgt_node_seq, mask_zero=False)
    tgt_node_embed = self.node_embedding[tgt_node_seq]
    # (batch_size, max_query_length)
    query_tokens = ndim_itensor(2, 'query_tokens')

    # (batch_size, max_query_length, query_token_embed_dim)
    # (batch_size, max_query_length)
    query_token_embed, query_token_embed_mask = self.query_embedding(query_tokens, mask_zero=True)

    # if WORD_DROPOUT > 0:
    #     logging.info('used word dropout for source, p = %f', WORD_DROPOUT)
    #     query_token_embed, query_token_embed_intact = WordDropout(WORD_DROPOUT, self.srng)(query_token_embed, False)

    batch_size = tgt_action_seq.shape[0]
    max_example_action_num = tgt_action_seq.shape[1]

    # previous action embeddings
    # (batch_size, max_example_action_num, action_embed_dim)
    tgt_action_seq_embed = T.switch(T.shape_padright(tgt_action_seq[:, :, 0] > 0),
                                    self.rule_embedding_W[tgt_action_seq[:, :, 0]],
                                    self.vocab_embedding_W[tgt_action_seq[:, :, 1]])

    tgt_action_seq_embed_tm1 = tensor_right_shift(tgt_action_seq_embed)

    # parent rule application embeddings
    tgt_par_rule_embed = T.switch(tgt_par_rule_seq[:, :, None] < 0,
                                  T.alloc(0., 1, config.rule_embed_dim),
                                  self.rule_embedding_W[tgt_par_rule_seq])

    # optionally disable the frontier-node / parent-action input feeds
    if not config.frontier_node_type_feed:
        tgt_node_embed *= 0.
    if not config.parent_action_feed:
        tgt_par_rule_embed *= 0.

    # (batch_size, max_example_action_num, action_embed_dim + symbol_embed_dim + action_embed_dim)
    decoder_input = T.concatenate([tgt_action_seq_embed_tm1, tgt_node_embed, tgt_par_rule_embed], axis=-1)

    # (batch_size, max_query_length, query_embed_dim)
    query_embed = self.query_encoder_lstm(query_token_embed,
                                          mask=query_token_embed_mask,
                                          dropout=config.dropout,
                                          srng=self.srng)

    # (batch_size, max_example_action_num)
    tgt_action_seq_mask = T.any(tgt_action_seq_type, axis=-1)

    # decoder_hidden_states: (batch_size, max_example_action_num, lstm_hidden_state)
    # ctx_vectors: (batch_size, max_example_action_num, encoder_hidden_dim)
    decoder_hidden_states, _, ctx_vectors = self.decoder_lstm(decoder_input,
                                                              context=query_embed,
                                                              context_mask=query_token_embed_mask,
                                                              mask=tgt_action_seq_mask,
                                                              parent_t_seq=tgt_par_t_seq,
                                                              dropout=config.dropout,
                                                              srng=self.srng)

    # if DECODER_DROPOUT > 0:
    #     logging.info('used dropout for decoder output, p = %f', DECODER_DROPOUT)
    #     decoder_hidden_states = Dropout(DECODER_DROPOUT, self.srng)(decoder_hidden_states)

    # ====================================================
    # apply additional non-linearity transformation before
    # predicting actions
    # ====================================================
    decoder_hidden_state_trans_rule = self.decoder_hidden_state_W_rule(decoder_hidden_states)
    decoder_hidden_state_trans_token = self.decoder_hidden_state_W_token(T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1))

    # (batch_size, max_example_action_num, rule_num)
    rule_predict = softmax(T.dot(decoder_hidden_state_trans_rule, T.transpose(self.rule_embedding_W)) + self.rule_embedding_b)

    # (batch_size, max_example_action_num, 2)
    terminal_gen_action_prob = self.terminal_gen_softmax(decoder_hidden_states)

    # (batch_size, max_example_action_num, target_vocab_size)
    vocab_predict = softmax(T.dot(decoder_hidden_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)

    # (batch_size, max_example_action_num, lstm_hidden_state + encoder_hidden_dim)
    ptr_net_decoder_state = T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1)

    # (batch_size, max_example_action_num, max_query_length)
    copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask, ptr_net_decoder_state)

    # gather the probability assigned to each gold target id
    # (batch_size, max_example_action_num)
    rule_tgt_prob = rule_predict[T.shape_padright(T.arange(batch_size)),
                                 T.shape_padleft(T.arange(max_example_action_num)),
                                 tgt_action_seq[:, :, 0]]

    # (batch_size, max_example_action_num)
    vocab_tgt_prob = vocab_predict[T.shape_padright(T.arange(batch_size)),
                                   T.shape_padleft(T.arange(max_example_action_num)),
                                   tgt_action_seq[:, :, 1]]

    # (batch_size, max_example_action_num)
    copy_tgt_prob = copy_prob[T.shape_padright(T.arange(batch_size)),
                              T.shape_padleft(T.arange(max_example_action_num)),
                              tgt_action_seq[:, :, 2]]

    # mix the three action types according to the gold action type flags
    # (batch_size, max_example_action_num)
    tgt_prob = tgt_action_seq_type[:, :, 0] * rule_tgt_prob + \
        tgt_action_seq_type[:, :, 1] * terminal_gen_action_prob[:, :, 0] * vocab_tgt_prob + \
        tgt_action_seq_type[:, :, 2] * terminal_gen_action_prob[:, :, 1] * copy_tgt_prob

    # the 1e-7 epsilon keeps log() finite on padded (masked) positions
    likelihood = T.log(tgt_prob + 1.e-7 * (1 - tgt_action_seq_mask))
    loss = - (likelihood * tgt_action_seq_mask).sum(axis=-1)  # / tgt_action_seq_mask.sum(axis=-1)
    loss = T.mean(loss)

    # let's build the function!
    train_inputs = [query_tokens, tgt_action_seq, tgt_action_seq_type,
                    tgt_node_seq, tgt_par_rule_seq, tgt_par_t_seq]
    optimizer = optimizers.get(config.optimizer)
    optimizer.clip_grad = config.clip_grad
    updates, grads = optimizer.get_updates(self.params, loss)
    self.train_func = theano.function(train_inputs, [loss],
                                      # [loss, tgt_action_seq_type, tgt_action_seq,
                                      #  rule_tgt_prob, vocab_tgt_prob, copy_tgt_prob,
                                      #  copy_prob, terminal_gen_action_prob],
                                      updates=updates)

    # if WORD_DROPOUT > 0:
    #     self.build_decoder(query_tokens, query_token_embed_intact, query_token_embed_mask)
    # else:
    #     self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask)

    self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask)
def build_decoder(self, query_tokens, query_token_embed, query_token_embed_mask): logging.info('building decoder ...') # (batch_size, decoder_state_dim) decoder_prev_state = ndim_tensor(2, name='decoder_prev_state') # (batch_size, decoder_state_dim) decoder_prev_cell = ndim_tensor(2, name='decoder_prev_cell') # (batch_size, n_timestep, decoder_state_dim) hist_h = ndim_tensor(3, name='hist_h') # (batch_size, decoder_state_dim) prev_action_embed = ndim_tensor(2, name='prev_action_embed') # (batch_size) node_id = T.ivector(name='node_id') # (batch_size, node_embed_dim) node_embed = self.node_embedding[node_id] # (batch_size) par_rule_id = T.ivector(name='par_rule_id') # (batch_size, decoder_state_dim) par_rule_embed = T.switch(par_rule_id[:, None] < 0, T.alloc(0., 1, config.rule_embed_dim), self.rule_embedding_W[par_rule_id]) # ([time_step]) time_steps = T.ivector(name='time_steps') # (batch_size) parent_t = T.ivector(name='parent_t') # (batch_size, 1) parent_t_reshaped = T.shape_padright(parent_t) query_embed = self.query_encoder_lstm(query_token_embed, mask=query_token_embed_mask, dropout=config.dropout, train=False) # (batch_size, 1, decoder_state_dim) prev_action_embed_reshaped = prev_action_embed.dimshuffle((0, 'x', 1)) # (batch_size, 1, node_embed_dim) node_embed_reshaped = node_embed.dimshuffle((0, 'x', 1)) # (batch_size, 1, node_embed_dim) par_rule_embed_reshaped = par_rule_embed.dimshuffle((0, 'x', 1)) if not config.frontier_node_type_feed: node_embed_reshaped *= 0. if not config.parent_action_feed: par_rule_embed_reshaped *= 0. 
decoder_input = T.concatenate([prev_action_embed_reshaped, node_embed_reshaped, par_rule_embed_reshaped], axis=-1) # (batch_size, 1, decoder_state_dim) # (batch_size, 1, decoder_state_dim) # (batch_size, 1, field_token_encode_dim) decoder_next_state_dim3, decoder_next_cell_dim3, ctx_vectors = self.decoder_lstm(decoder_input, init_state=decoder_prev_state, init_cell=decoder_prev_cell, hist_h=hist_h, context=query_embed, context_mask=query_token_embed_mask, parent_t_seq=parent_t_reshaped, dropout=config.dropout, train=False, time_steps=time_steps) decoder_next_state = decoder_next_state_dim3.flatten(2) # decoder_output = decoder_next_state * (1 - DECODER_DROPOUT) decoder_next_cell = decoder_next_cell_dim3.flatten(2) decoder_next_state_trans_rule = self.decoder_hidden_state_W_rule(decoder_next_state) decoder_next_state_trans_token = self.decoder_hidden_state_W_token(T.concatenate([decoder_next_state, ctx_vectors.flatten(2)], axis=-1)) rule_prob = softmax(T.dot(decoder_next_state_trans_rule, T.transpose(self.rule_embedding_W)) + self.rule_embedding_b) gen_action_prob = self.terminal_gen_softmax(decoder_next_state) vocab_prob = softmax(T.dot(decoder_next_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b) ptr_net_decoder_state = T.concatenate([decoder_next_state_dim3, ctx_vectors], axis=-1) copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask, ptr_net_decoder_state) copy_prob = copy_prob.flatten(2) inputs = [query_tokens] outputs = [query_embed, query_token_embed_mask] self.decoder_func_init = theano.function(inputs, outputs) inputs = [time_steps, decoder_prev_state, decoder_prev_cell, hist_h, prev_action_embed, node_id, par_rule_id, parent_t, query_embed, query_token_embed_mask] outputs = [decoder_next_state, decoder_next_cell, rule_prob, gen_action_prob, vocab_prob, copy_prob] self.decoder_func_next_step = theano.function(inputs, outputs)
def _fit(self, X, y):
    """
    Compute mode of approximation of the posterior using algorithm (3.3)
    from GPML with shared covariance matrix among all latent functions.

    Side effects: sets ``self.K_`` (regularized kernel matrix),
    ``self.f_`` (latent values at the mode), ``self.pi_``
    (softmax of ``f_``), ``self._e`` (per-class E_c vectors reused at
    prediction time) and ``self.lml_`` (approximate log marginal
    likelihood).  Returns early, without setting ``lml_``, if
    ``max_iter`` iterations pass without convergence.
    """
    # accept integer labels or a column vector and one-hot encode them
    if len(y.shape) == 1 or y.shape[1] == 1:
        y = one_hot(y)
    self._check_X_y(X, y)
    y = y.astype(np.float32)
    self._kernel = get_kernel(self.kernel, **self.kernel_params)

    # shortcuts
    C = self._n_outputs
    n = self._n_samples

    # construct covariance matrix [if needed]
    # if self.K_ is None:
    self.K_ = self._kernel(X, X)
    self.K_ += self.sigma_n**2 * np.eye(n)  # observation-noise jitter
    self.K_ = self.K_.astype(np.float32)

    # init latent function values
    self.f_ = np.zeros_like(y)

    lmls = []
    n_iter = 0  # renamed from `iter`, which shadowed the builtin
    while True:
        n_iter += 1
        if n_iter > self.max_iter:
            print('convergence is not reached')
            return

        self.pi_ = softmax(self.f_)
        z = []
        self._e = []
        for c_ in range(C):
            # compute E_c = sqrt(D_c) (I + sqrt(D_c) K sqrt(D_c))^-1 sqrt(D_c)
            sqrt_d_c = np.sqrt(self.pi_[:, c_])
            _T = np.eye(
                self._n_samples) + (sqrt_d_c * self.K_.T).T * sqrt_d_c
            if self.algorithm == 'exact':
                L = cholesky(_T, lower=True, overwrite_a=True)
                _T2 = solve_triangular(L, sqrt_d_c)
                e_c = sqrt_d_c * solve_triangular(L, _T2, trans='T')
            elif self.algorithm == 'cg':
                # conjugate-gradient approximation of the solve
                _t, _ = cg(_T,
                           sqrt_d_c,
                           tol=self.cg_tol,
                           maxiter=self.cg_max_iter)
                _t = _t.astype(np.float32)
                e_c = sqrt_d_c * _t
            self._e.append(e_c)

            # compute z_c = sum(log diag(L)); only available with the
            # exact (Cholesky) algorithm
            if self.algorithm == 'exact':
                z_c = sum(np.log(L.diagonal()))
                z.append(z_c)

        # compute b (diagonal approximation of (D - Pi Pi^T) f + y - pi)
        # b = (D - Pi.dot(Pi.T)).dot(self.f_.T.reshape((C * n,)))
        # b = b.reshape((n, C))
        b = (1. - self.pi_) * self.pi_ * self.f_
        b = b + y - self.pi_

        # compute c; np.hstack requires a sequence — passing a generator
        # raises TypeError on NumPy >= 1.16
        c = np.hstack([(self._e[c_] * self.K_.dot(b[:, c_]))[:, np.newaxis]
                       for c_ in range(C)])

        # compute a
        # self._M = cholesky(np.diag(sum(self._e)), lower=True, overwrite_a=True)
        # _t = np.sum(c, axis=1)
        # _t2 = solve_triangular(self._M, _t)
        # _t3 = solve_triangular(self._M, _t2, trans='T')
        # the scalar 1e-8 broadcasts; no need to allocate np.ones_like
        _t3 = np.sum(c, axis=1) / np.maximum(sum(self._e), 1e-8)
        _t4 = np.hstack([(self._e[c_] * _t3)[:, np.newaxis]
                         for c_ in range(C)])
        a = b - c + _t4
        a = a.astype(np.float32)

        # compute f
        self.f_ = self.K_.dot(a)

        # compute approx. LML
        lml = -0.5 * sum(a[:, _c].dot(self.f_[:, _c])
                         for _c in range(C))  # -0.5a^Tf
        lml += sum(y[:, _c].dot(self.f_[:, _c]) for _c in range(C))  # y^Tf
        lml -= sum(log_sum_exp(f) for f in self.f_)
        lml -= sum(z)
        lmls.append(lml)

        # stop once consecutive LML values agree to within tol * max(K)
        if len(lmls) >= 2 and np.abs(lmls[-1] -
                                     lmls[-2]) < self.tol * self.K_.max():
            break

    self.lml_ = lmls[-1]
def lstm_cell_forward(Xt, h_prev, c_prev, parameters):
    """Run one forward step of an LSTM cell.

    Parameters
    ----------
    Xt : ndarray, shape (N, D)
        Input batch at timestep "t" (N samples, D features each).
    h_prev : ndarray, shape (N, H)
        Hidden state from timestep "t-1" (H hidden units).
    c_prev : ndarray, shape (N, H)
        Memory (cell) state from timestep "t-1".
    parameters : dict
        Gate weights "Wf"/"Wi"/"Wo"/"Wc" of shape (H+D, H), output
        weights "Wy" of shape (H, M), gate biases "bf"/"bi"/"bo"/"bc"
        of shape (1, H) and output bias "by" of shape (1, M).

    Returns
    -------
    h_next : ndarray, shape (N, H)
        Next hidden state.
    c_next : ndarray, shape (N, H)
        Next memory state.
    yt_pred : ndarray, shape (N, M)
        Softmax prediction at timestep "t".
    cache : tuple
        (h_next, c_next, h_prev, c_prev, ft, it, cct, ot, Xt, parameters)
        for the backward pass.  ft/it/ot are the forget/update/output
        gates, cct the candidate value (c tilde).
    """
    p = parameters

    # Stack [h_prev, Xt] so each gate is a single matrix multiply.
    concat = np.concatenate((h_prev, Xt), axis=1)

    # Gate activations: forget (ft), update (it) and output (ot).
    ft = sigmoid(np.dot(concat, p["Wf"]) + p["bf"])
    it = sigmoid(np.dot(concat, p["Wi"]) + p["bi"])
    ot = sigmoid(np.dot(concat, p["Wo"]) + p["bo"])

    # Candidate memory and the gated state updates.
    cct = np.tanh(np.dot(concat, p["Wc"]) + p["bc"])
    c_next = ft * c_prev + it * cct
    h_next = ot * np.tanh(c_next)

    # Project the new hidden state onto class probabilities.
    yt_pred = softmax(np.dot(h_next, p["Wy"]) + p["by"])

    cache = (h_next, c_next, h_prev, c_prev, ft, it, cct, ot, Xt, parameters)
    return h_next, c_next, yt_pred, cache