def decomp_attend(self, vecsA, vecsB):
    """Decomposable-attention similarity score between two vector sequences.

    Follows the attend / compare / aggregate structure of Parikh et al.
    (2016): pairwise scores e_ij = exp(dot(Fq_i, Fc_j)), soft alignments
    beta (Fc attended for each Fq_i) and alpha (Fq attended for each Fc_j),
    a compare step (here a logistic over the concatenation, a stand-in for
    the feed-forward G), and a final aggregate dot product.

    Args:
        vecsA: list of DyNet expressions for the first sequence (query side).
        vecsB: list of DyNet expressions for the second sequence (context side).
    Returns:
        A scalar expression: dot(v1, v2) of the aggregated compare vectors.
    """
    # TODO: the attend step should first project the vectors through a
    # feed-forward F (Eq. 1 of the paper); raw word vectors are used here.
    # TODO: expedite with native matrix/tensor multiplication.
    Fq = vecsA
    Fc = vecsB

    # Unnormalized attention weights e_ij = exp(Fq_i . Fc_j).
    expE = []
    for fq in Fq:
        expE.append([dt.exp(dt.dot_product(fq, fc)) for fc in Fc])

    # Row-wise and column-wise normalizers, stored as reciprocals so the
    # softmax below is a plain multiplication.
    invSumExpEi = [dt.pow(dt.esum(expE[i]), dt.scalarInput(-1))
                   for i in xrange(len(Fq))]
    invSumExpEj = [dt.pow(dt.esum([expE[i][j] for i in xrange(len(Fq))]),
                          dt.scalarInput(-1))
                   for j in xrange(len(Fc))]

    # beta_i: Fc soft-aligned to Fq_i (softmax over j).
    beta = []
    for i in xrange(len(Fq)):
        s = dt.esum([Fc[j] * expE[i][j] for j in xrange(len(Fc))])
        beta.append(s * invSumExpEi[i])

    # alpha_j: Fq soft-aligned to Fc_j (softmax over i).
    # BUG FIX: the weighted sum must range over Fq[i], not Fc[j]; with
    # Fc[j] (constant w.r.t. i) the softmax weights cancel and alpha_j
    # degenerates to exactly Fc[j], making the alignment a no-op.
    alpha = []
    for j in xrange(len(Fc)):
        s = dt.esum([Fq[i] * expE[i][j] for i in xrange(len(Fq))])
        alpha.append(s * invSumExpEj[j])

    # Compare step (G in Eq. 3 of the paper).
    v1i = [dt.logistic(dt.concatenate([Fq[i], beta[i]]))
           for i in xrange(len(Fq))]
    v2j = [dt.logistic(dt.concatenate([Fc[j], alpha[j]]))
           for j in xrange(len(Fc))]

    # Aggregate each side by summation, then score with a dot product.
    v1 = dt.esum(v1i)
    v2 = dt.esum(v2j)
    return dt.dot_product(v1, v2)
def intra_sent_attend(self, vecs):
    """Distance-biased intra-sentence attention over `vecs`.

    Each input vector is projected, scored against every other projected
    vector with a learned distance bias, and replaced by a transform of
    itself concatenated with its attention-weighted context.
    """
    # Center of the distance-bias table: index by (i - j + offset).
    offset = int(config.d["DIST_BIAS_DIM"] / 2)

    # Project every vector before scoring (the F step).
    projected = [dt.tanh(self.SelIntraFW * vec) for vec in vecs]

    # e_ij = exp(dot(f_i, f_j) + bias for the token distance i - j).
    exp_scores = [
        [dt.exp(dt.dot_product(f_i, f_j) + self.SelIntraBias[i - j + offset])
         for j, f_j in enumerate(projected)]
        for i, f_i in enumerate(projected)
    ]

    # Reciprocal of each row sum -> softmax by a single multiplication.
    inv_row_sums = [dt.pow(dt.esum(row), dt.scalarInput(-1))
                    for row in exp_scores]

    # alpha_i = sum_j softmax(e_i)_j * v_j
    attended = []
    for row, inv_sum in zip(exp_scores, inv_row_sums):
        weighted = dt.esum([vec * w for vec, w in zip(vecs, row)])
        attended.append(weighted * inv_sum)

    # Combine each vector with its attended context through a tanh layer.
    return [dt.tanh(self.SelIntraHW * dt.concatenate([vec, att]))
            for vec, att in zip(vecs, attended)]
def calc_loss(sent):
    """KL + reconstruction losses for one (src, trg) pair of a variational seq2seq.

    Returns:
        (kl_loss, softmax_loss): the KL term of the encoder posterior and
        the summed per-word negative log-likelihood of the target.
    """
    dy.renew_cg()
    src = sent[0]
    trg = sent[1]

    # Encode the source with the LSTM; keep only the final output vector.
    enc_state = LSTM_SRC_BUILDER.initial_state()
    enc_out = enc_state.add_inputs([LOOKUP_SRC[w] for w in src])[-1].output()

    # Encoder heads producing the posterior mean and diagonal log-variance.
    W_mean = dy.parameter(W_mean_p)
    V_mean = dy.parameter(V_mean_p)
    b_mean = dy.parameter(b_mean_p)
    W_var = dy.parameter(W_var_p)
    V_var = dy.parameter(V_var_p)
    b_var = dy.parameter(b_var_p)
    mu = mlp(enc_out, W_mean, V_mean, b_mean)
    # Log-variance is easier to work with than the variance itself.
    log_var = mlp(enc_out, W_var, V_var, b_var)

    # KL[N(mu, sigma) || N(0, I)] = -0.5 * sum(1 + log sigma^2 - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(
        1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    # Sample z with the reparameterization trick, then decode the target
    # with teacher forcing.
    z = reparameterize(mu, log_var)
    state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)])
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    word_losses = []
    for prev_word, next_word in zip(trg, trg[1:]):
        state = state.add_input(LOOKUP_TRG[prev_word])
        scores = dy.affine_transform([b_sm, W_sm, state.output()])
        word_losses.append(dy.pickneglogsoftmax(scores, next_word))
    return kl_loss, dy.esum(word_losses)
def loss_function(recon_x, x, mu, logvar):
    """VAE objective: reconstruction loss plus KL divergence.

    See Appendix B of Kingma & Welling, "Auto-Encoding Variational Bayes"
    (ICLR 2014, https://arxiv.org/abs/1312.6114):
    KL = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2).
    """
    kl_term = -0.5 * dy.sum_elems(
        1 + logvar - dy.pow(mu, dy.scalarInput(2)) - dy.exp(logvar))
    # Equivalent to torch.nn.functional.binary_cross_entropy(?, ?,
    # size_average=False).
    recon_term = dy.binary_log_loss(recon_x, x)
    return recon_term + kl_term
def loss_function(recon_x, x, mu, logvar):
    """Sum of binary log loss and KL(N(mu, sigma) || N(0, I)) for a VAE."""
    # Reconstruction term; cf. torch binary_cross_entropy with
    # size_average=False.
    bce = dy.binary_log_loss(recon_x, x)
    # Kingma & Welling (ICLR 2014), Appendix B, https://arxiv.org/abs/1312.6114:
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2), negated below.
    mu_sq = dy.pow(mu, dy.scalarInput(2))
    kld = -0.5 * dy.sum_elems(1 + logvar - mu_sq - dy.exp(logvar))
    return bce + kld
def learn(self, batch_size):
    """One DQN learning step: sample a batch, build TD targets, update the net.

    Supports prioritized replay (importance weights + priority refresh),
    a dueling head (separate state-value stream `v`), and periodic target
    network syncing when double DQN is enabled.

    Args:
        batch_size: number of transitions to sample from replay memory.
    Returns:
        The batch loss as a numpy value, or -np.inf when prioritized
        memory is not yet full enough to sample.
    """
    if self.prioritized:
        # Prioritized replay requires a full buffer before sampling.
        if not self.memory.is_full(): return -np.inf
        indices, exps, weights = self.memory.sample(batch_size, self.beta)
    else:
        exps = self.memory.sample(batch_size)
    obss, actions, rewards, obs_nexts, dones = self._process(exps)

    # First graph: score next states to build the (constant) TD targets.
    dy.renew_cg()
    # NOTE(review): with use_double_dqn the target network scores next
    # states; otherwise the online network bootstraps from itself.
    target_network = self.target_network if self.use_double_dqn else self.network
    if self.dueling:
        # Dueling head returns (per-action stream, state value); combine
        # them before taking the max.
        target_values, v = target_network(obs_nexts, batched=True)
        target_values = target_values.npvalue() + v.npvalue()
    else:
        target_values = target_network(obs_nexts, batched=True)
        target_values = target_values.npvalue()
    # Greedy action value per sample.
    target_values = np.max(target_values, axis=0)
    # TD target: r + gamma * max_a Q(s', a), zeroed at terminal states.
    target_values = rewards + self.reward_decay * (target_values * (1 - dones))

    # Second, fresh graph for the online forward/backward pass.
    dy.renew_cg()
    if self.dueling:
        all_values_expr, v = self.network(obss, batched=True)
    else:
        all_values_expr = self.network(obss, batched=True)
    # Q-values of the actions actually taken in the batch.
    picked_values = dy.pick_batch(all_values_expr, actions)
    diff = (picked_values + v if self.dueling else
            picked_values) - dy.inputTensor(target_values, batched=True)
    if self.prioritized:
        # Refresh priorities with the absolute TD errors.
        self.memory.update(indices, np.transpose(np.abs(diff.npvalue())))
    # Squared TD error per batch element.
    losses = dy.pow(diff, dy.constant(1, 2))
    if self.prioritized:
        # Importance-sampling correction for the biased replay sampling.
        losses = dy.cmult(losses, dy.inputTensor(weights, batched=True))
    loss = dy.sum_batches(losses)
    loss_value = loss.npvalue()
    loss.backward()
    self.trainer.update()

    # Anneal exploration epsilon down and (for prioritized) beta up to 1.
    self.epsilon = max(self.epsilon - self.epsilon_decrease, self.epsilon_lower)
    if self.prioritized:
        self.beta = min(self.beta + self.beta_increase, 1.)
    self.learn_step += 1
    # Periodically copy online weights into the target network.
    if self.use_double_dqn and self.learn_step % self.n_replace_target == 0:
        self.target_network.update(self.network)
    return loss_value
def calc_loss_basic(self, frames, label):
    """KL + reconstruction loss for one (frames, label) example.

    Encodes the frame sequence with an LSTM, maps the final output to a
    Gaussian posterior (mu, log_var), samples z by reparameterization, and
    scores the label from z.

    Args:
        frames: sequence of frame feature vectors (fed to dy.inputTensor).
        label: integer class index for the reconstruction target.
    Returns:
        (kl_loss, recons_loss) expression pair.
    """
    # Renew the computation graph.
    dy.renew_cg()

    # Initialize the LSTM.
    init_state_src = self.lstm_builder.initial_state()

    # Instantiate the params.
    W_mean = dy.parameter(self.W_mean_p)
    V_mean = dy.parameter(self.V_mean_p)
    b_mean = dy.parameter(self.b_mean_p)
    W_var = dy.parameter(self.W_var_p)
    V_var = dy.parameter(self.V_var_p)
    b_var = dy.parameter(self.b_var_p)

    input_frames = dy.inputTensor(frames)

    # Get the LSTM embedding of the frame sequence (final output only).
    src_output = init_state_src.add_inputs(
        [frame for frame in input_frames])[-1].output()

    # Mean and diagonal log covariance from the encoder.
    mu = self.mlp(src_output, W_mean, V_mean, b_mean)
    # BUG FIX: the log-variance head must use the *_var parameters; the
    # original reused W_mean/V_mean/b_mean, leaving W_var/V_var/b_var
    # unused and tying log_var to mu.
    log_var = self.mlp(src_output, W_var, V_var, b_var)

    # KL[N(mu, sigma) || N(0, I)] = -0.5 * sum(1 + log sigma^2 - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(
        1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    # Reparameterize to sample z.
    z = self.reparameterize(mu, log_var)

    W_sm = dy.parameter(self.W_sm_p)
    b_sm = dy.parameter(self.b_sm_p)

    # Reconstruction loss: softmax over labels scored from z.
    # (Unused `output_label` / `label_embedding` locals removed.)
    pred = dy.affine_transform([b_sm, W_sm, z])
    recons_loss = dy.pickneglogsoftmax(pred, label)
    return kl_loss, recons_loss
def learn(self, src, dst):
    """Accumulate word-prediction and embedding-cosine losses for one pair.

    Appends to self.losses: a negative log-likelihood term per target word,
    an optional cosine-similarity term when a pretrained embedding exists,
    and a final EOS prediction term.
    """
    softmax_list, aux_list = self._predict(
        src, dst=dst, num_predictions=len(dst) + 1, runtime=False)
    word2int = self.output_encodings.word2int
    for softmax, aux, entry in zip(softmax_list, aux_list, dst):
        surface = entry.word.decode('utf-8')
        key = surface.lower()
        # Fall back to the <UNK> index for out-of-vocabulary words.
        if key in word2int:
            target_index = word2int[key]
        else:
            target_index = word2int["<UNK>"]
        w_emb, found = self.dst_we.get_word_embeddings(surface)
        # Negative log-likelihood of the gold word.
        self.losses.append(-dy.log(dy.pick(softmax, target_index)))
        if found:
            # Pull the auxiliary vector toward the pretrained embedding by
            # penalizing (cosine - 1)^2.
            predicted = aux
            reference = dy.inputVector(w_emb)
            inv_norms = dy.pow(dy.l2_norm(predicted) * dy.l2_norm(reference),
                               dy.scalarInput(-1))
            cosine = dy.dot_product(predicted, reference) * inv_norms
            self.losses.append(
                dy.squared_distance(cosine, dy.scalarInput(1.0)))
    # Terminal EOS prediction loss.
    self.losses.append(-dy.log(dy.pick(softmax_list[-1], self.EOS)))
def calc_loss_basic(self, embedding, label):
    """KL + reconstruction loss for one (embedding, label) example.

    Encodes the input embedding with a feed-forward network, maps it to a
    Gaussian posterior (mu, log_var), samples z by reparameterization, and
    scores the label from z.

    Args:
        embedding: input feature vector (fed to dy.inputTensor).
        label: integer class index for the reconstruction target.
    Returns:
        (kl_loss, recons_loss) expression pair.
    """
    # Renew the computation graph.
    dy.renew_cg()

    # Instantiate the params.
    W_mean = dy.parameter(self.W_mean_p)
    V_mean = dy.parameter(self.V_mean_p)
    b_mean = dy.parameter(self.b_mean_p)
    W_var = dy.parameter(self.W_var_p)
    V_var = dy.parameter(self.V_var_p)
    b_var = dy.parameter(self.b_var_p)

    input_embedding = dy.inputTensor(embedding)

    # Encode the input with the feed-forward network.
    src_output = self.dnn.predict(input_embedding)

    # Mean and diagonal log covariance from the encoder.
    mu = self.mlp(src_output, W_mean, V_mean, b_mean)
    # BUG FIX: the log-variance head must use the *_var parameters; the
    # original reused W_mean/V_mean/b_mean, leaving W_var/V_var/b_var
    # unused and tying log_var to mu.
    log_var = self.mlp(src_output, W_var, V_var, b_var)

    # KL[N(mu, sigma) || N(0, I)] = -0.5 * sum(1 + log sigma^2 - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(
        1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    # Reparameterize to sample z.
    z = self.reparameterize(mu, log_var)

    W_sm = dy.parameter(self.W_sm_p)
    b_sm = dy.parameter(self.b_sm_p)

    # Reconstruction loss: selu-activated scores, softmax over labels.
    # (Unused `output_label` / `label_embedding` locals removed.)
    pred = dy.selu(dy.affine_transform([b_sm, W_sm, z]))
    recons_loss = dy.pickneglogsoftmax(pred, label)
    return kl_loss, recons_loss
def get_regularization(self, batch_preds, word2int):
    """L2 regularization over parameters, batch embeddings, and BiLSTM weights.

    Sums squared entries of: every non-bias, non-lookup parameter; the
    embedding of every token appearing in the batch; and the first two
    parameter expressions of each of the four BiLSTM builders.
    """

    def sq_sum(expr):
        # Sum of squared entries of a single expression.
        return dy.sum_elems(dy.pow(expr, dy.scalarInput(2)))

    reg = 0
    # Model parameters (biases skipped; the lookup table handled per-token).
    for key, value in self.params.iteritems():
        if key != "b" and key != "lookup":
            reg += sq_sum(dy.parameter(value))
        if key == "lookup":
            # Regularize only the embeddings actually used by this batch.
            for example in batch_preds:
                premise = example[0]
                hypothesis = example[1]
                for token in premise:
                    reg += sq_sum(value[word2int.get(token)])
                for token in hypothesis:
                    reg += sq_sum(value[word2int.get(token)])

    # BiLSTM weight matrices: first two parameter expressions per builder.
    builders = (self.fw_premise_builder, self.bw_premise_builder,
                self.fw_hypo_builder, self.bw_hypo_builder)
    for i in range(2):
        for builder in builders:
            reg += sq_sum(builder.get_parameter_expressions()[0][i])
    return reg
def MRT_batch_update(batch, epoch):
    """Minimum-risk-training update over one batch (Python 2 dialect).

    For each sample: scores the gold action sequence, draws up to
    `sample_size` deduplicated samples from the transducer, renormalizes
    their probabilities with temperature `alpha`, and accumulates the
    expected reward as the loss. A single backward/update runs over the
    whole batch.

    NOTE(review): references `self`, `alpha_p`, `sample_size`, `verbose`,
    and `compute_reward` from an enclosing scope — presumably a closure
    defined inside a trainer method; confirm against the caller.
    """
    dy.renew_cg()
    # Temperature for renormalizing sample probabilities (q ~ P^alpha).
    alpha = dy.scalarInput(alpha_p)
    batch_loss = []
    rewards = []
    for sample in batch:
        lemma = sample.lemma
        word = sample.word
        word_str = sample.word_str
        feats = sample.pos, sample.feats
        actions = sample.actions
        # ORACLE PREDICTION: log-probability of the gold action sequence.
        gold_loss, _, _ = \
            self.transducer.transduce(lemma, feats, actions, external_cg=True)
        gold_loss = dy.esum(gold_loss)
        # The gold sequence seeds the sample set; -1 is a sentinel reward.
        sample_rewards = [-1.]
        sample_losses = [gold_loss]
        predictions = [word_str]
        seen_predicted_acts = {tuple(actions)}
        # SAMPLING-BASED PREDICTION (action sequences deduplicated).
        for _ in range(sample_size):
            loss, prediction, predicted_actions = \
                self.transducer.transduce(lemma, feats, sampling=True,
                                          external_cg=True)
            predicted_actions = tuple(predicted_actions)
            if predicted_actions in seen_predicted_acts:
                # Already sampled this action sequence.
                continue
            loss = dy.esum(loss)
            if loss.scalar_value() < -20:
                # Skip vanishingly unlikely samples (these are log P).
                continue
            else:
                seen_predicted_acts.add(predicted_actions)
                # COMPUTE REWARDS for the sampled prediction.
                reward = compute_reward(word, word_str, prediction)
                sample_rewards.append(reward)
                sample_losses.append(loss)
                predictions.append(prediction)
        # SCALE & RENORMALIZE (these are log P).
        if len(sample_rewards) == 1 and sample_rewards[0] == -1.:
            # Only the gold sentinel survived -- nothing to learn from.
            if verbose: print 'Nothing to update with.'
            continue
        else:
            sample_losses = dy.concatenate(sample_losses)
            sample_rewards = dy.inputVector(sample_rewards)
            # q proportional to P(sample)^alpha, renormalized to sum to 1.
            q_unnorm = dy.pow(dy.exp(sample_losses), alpha)
            q = dy.cdiv(q_unnorm, dy.sum_elems(q_unnorm))
            if verbose:
                print 'q', q.npvalue()
                print 'sample_rewards', sample_rewards.npvalue()
                print 'word', word_str
                print 'predictions: ', u', '.join(predictions)
            # Expected reward under q (the risk term for this sample).
            batch_loss.append(dy.dot_product(q, sample_rewards))
    if batch_loss:
        batch_loss = dy.esum(batch_loss)
        loss = batch_loss.scalar_value()  # forward pass
        try:
            batch_loss.backward()
            self.trainer.update()
        except Exception, e:
            # Dump the state that produced the failing backward pass.
            print 'Batch loss: ', loss
            print 'q', q.npvalue()
            print 'q_unnorm', q_unnorm.npvalue()
            print 'gold_loss', gold_loss.scalar_value()
            print 'sample_rewards', sample_rewards.npvalue()
            print 'word', word_str
            print 'predictions: ', u', '.join(predictions)
            raise e
        if verbose: print 'Batch loss: ', loss
def __call__(self, x):
    """GELU activation, tanh approximation (Hendrycks & Gimpel, 2016):

    0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    """
    # Consistency/bug fix: every other dy.pow call in this file wraps the
    # exponent in a DyNet expression (dy.scalarInput); a bare Python int
    # is not an Expression and does not match dy.pow's signature.
    cubed = dy.pow(x, dy.scalarInput(3))
    return 0.5 * x * (
        1 + dy.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * cubed)))
def hallucinate_tags(tweet, sample=False, print_loss=False): dy.renew_cg() # Transduce all batch elements with an LSTM src = tweet # initialize the LSTM init_state_src = lstm_encode.initial_state() # get the output of the first LSTM src_output = init_state_src.add_inputs([embed[x] for x in src])[-1].output() # Now compute mean and standard deviation of source hidden state. W_mu_tweet = dy.parameter(W_mu_tweet_p) V_mu_tweet = dy.parameter(V_mu_tweet_p) b_mu_tweet = dy.parameter(b_mu_tweet_p) W_sig_tweet = dy.parameter(W_sig_tweet_p) V_sig_tweet = dy.parameter(V_sig_tweet_p) b_sig_tweet = dy.parameter(b_sig_tweet_p) # Compute tweet encoding mu_tweet = mlp(src_output, W_mu_tweet, V_mu_tweet, b_mu_tweet) log_var_tweet = mlp(src_output, W_sig_tweet, V_sig_tweet, b_sig_tweet) #W_mu_tag = dy.parameter(W_mu_tag_p) #V_mu_tag = dy.parameter(V_mu_tag_p) #b_mu_tag = dy.parameter(b_mu_tag_p) #W_sig_tag = dy.parameter(W_sig_tag_p) #V_sig_tag = dy.parameter(V_sig_tag_p) #b_sig_tag = dy.parameter(b_sig_tag_p) # Compute tag encoding #tags_tensor = dy.sparse_inputTensor([tags], np.ones((len(tags),)), (NUM_TAGS,)) #mu_tag = dy.dropout(mlp(tags_tensor, W_mu_tag, V_mu_tag, b_mu_tag), DROPOUT) #log_var_tag = dy.dropout(mlp(tags_tensor, W_sig_tag, V_sig_tag, b_sig_tag), DROPOUT) # Combine encodings for mean and diagonal covariance W_mu = dy.parameter(W_mu_p) b_mu = dy.parameter(b_mu_p) W_sig = dy.parameter(W_sig_p) b_sig = dy.parameter(b_sig_p) mu_tag = dy.zeros(HIDDEN_DIM) log_var_tag = dy.zeros(HIDDEN_DIM) mu = dy.affine_transform([b_mu, W_mu, dy.concatenate([mu_tweet, mu_tag])]) log_var = dy.affine_transform( [b_sig, W_sig, dy.concatenate([log_var_tweet, log_var_tag])]) # KL-Divergence loss computation kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var)) if print_loss: print("kl loss/word={:.4f}".format(kl_loss.value() / len(tweet))) z = reparameterize(mu, log_var) # now step through the output sentence # all_losses = [] #current_state = 
lstm_decode.initial_state().set_s([z, dy.tanh(z)]) #prev_word = src[0] #W_sm = dy.parameter(W_tweet_softmax_p) #b_sm = dy.parameter(b_tweet_softmax_p) #for next_word in src[1:]: # # feed the current state into the # current_state = current_state.add_input(embed[prev_word]) # output_embedding = current_state.output() # s = dy.affine_transform([b_sm, W_sm, output_embedding]) # all_losses.append(dy.pickneglogsoftmax(s, next_word)) # prev_word = next_word #softmax_loss = dy.esum(all_losses) W_hidden = dy.parameter(W_hidden_p) b_hidden = dy.parameter(b_hidden_p) W_out = dy.parameter(W_tag_output_p) b_out = dy.parameter(b_tag_output_p) h = dy.tanh(b_hidden + W_hidden * z) o = dy.logistic(b_out + W_out * h) tag_ranks = o.value() # Sample from tags if sample: print('Sampling') gen_tags = [] for i, p in enumerate(tag_ranks): if random.random() < p: gen_tags.append(i) return gen_tags else: return tag_ranks
def calc_loss(sent, epsilon=0.0):
    """Joint VAE losses for one (tweet, tags) pair.

    Encodes the tweet (LSTM) and the tag set (sparse MLP) into a combined
    Gaussian posterior, samples z, reconstructs the tweet with an LSTM
    decoder and the tags with a sigmoid head.

    `epsilon` schedules two curricula: with probability epsilon one input
    modality's encoding is zero-masked (pushing the model to work from a
    single modality), and during decoding the gold previous word is replaced
    by a sampled one (phasing out teacher forcing).

    Args:
        sent: (word-id sequence, tag-index sequence) pair.
        epsilon: schedule probability in [0, 1].
    Returns:
        (kl_loss, softmax_loss, crossentropy_loss) expression triple.
    NOTE(review): renew_cg is commented out -- presumably the caller renews
    the graph; confirm before reuse.
    """
    #dy.renew_cg()
    # Transduce all batch elements with an LSTM.
    src = sent[0]
    tags = sent[1]
    # Initialize the encoder LSTM; keep the final output vector.
    init_state_src = lstm_encode.initial_state()
    src_output = init_state_src.add_inputs([embed[x] for x in src])[-1].output()
    # Tweet-side posterior heads (mean and standard deviation).
    W_mu_tweet = dy.parameter(W_mu_tweet_p)
    V_mu_tweet = dy.parameter(V_mu_tweet_p)
    b_mu_tweet = dy.parameter(b_mu_tweet_p)
    W_sig_tweet = dy.parameter(W_sig_tweet_p)
    V_sig_tweet = dy.parameter(V_sig_tweet_p)
    b_sig_tweet = dy.parameter(b_sig_tweet_p)
    # Compute tweet encoding (with dropout).
    mu_tweet = dy.dropout(mlp(src_output, W_mu_tweet, V_mu_tweet, b_mu_tweet),
                          DROPOUT)
    log_var_tweet = dy.dropout(
        mlp(src_output, W_sig_tweet, V_sig_tweet, b_sig_tweet), DROPOUT)
    # Tag-side posterior heads.
    W_mu_tag = dy.parameter(W_mu_tag_p)
    V_mu_tag = dy.parameter(V_mu_tag_p)
    b_mu_tag = dy.parameter(b_mu_tag_p)
    W_sig_tag = dy.parameter(W_sig_tag_p)
    V_sig_tag = dy.parameter(V_sig_tag_p)
    b_sig_tag = dy.parameter(b_sig_tag_p)
    # Compute tag encoding from a sparse multi-hot tag vector.
    tags_tensor = dy.sparse_inputTensor([tags], np.ones((len(tags), )),
                                        (NUM_TAGS, ))
    mu_tag = dy.dropout(mlp(tags_tensor, W_mu_tag, V_mu_tag, b_mu_tag),
                        DROPOUT)
    log_var_tag = dy.dropout(mlp(tags_tensor, W_sig_tag, V_sig_tag, b_sig_tag),
                             DROPOUT)
    # Combine encodings for mean and diagonal covariance.
    W_mu = dy.parameter(W_mu_p)
    b_mu = dy.parameter(b_mu_p)
    W_sig = dy.parameter(W_sig_p)
    b_sig = dy.parameter(b_sig_p)
    # Slowly phase out getting both inputs: with prob epsilon zero-mask one
    # randomly chosen modality's encoding.
    if random.random() < epsilon:
        mask = dy.zeros(HIDDEN_DIM)
    else:
        mask = dy.ones(HIDDEN_DIM)
    if random.random() < 0.5:
        mu_tweet = dy.cmult(mu_tweet, mask)
        log_var_tweet = dy.cmult(log_var_tweet, mask)
    else:
        mu_tag = dy.cmult(mu_tag, mask)
        log_var_tag = dy.cmult(log_var_tag, mask)
    mu = dy.affine_transform([b_mu, W_mu, dy.concatenate([mu_tweet, mu_tag])])
    log_var = dy.affine_transform(
        [b_sig, W_sig, dy.concatenate([log_var_tweet, log_var_tag])])
    # KL-Divergence loss computation:
    # KL[N(mu, sigma) || N(0, I)] = -0.5 * sum(1 + log sigma^2 - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(
        1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))
    z = reparameterize(mu, log_var)
    # Step through the output sentence with the decoder LSTM.
    all_losses = []
    current_state = lstm_decode.initial_state().set_s([z, dy.tanh(z)])
    prev_word = src[0]
    W_sm = dy.parameter(W_tweet_softmax_p)
    b_sm = dy.parameter(b_tweet_softmax_p)
    for next_word in src[1:]:
        # Feed the previous word's embedding into the decoder.
        current_state = current_state.add_input(embed[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))
        # Slowly phase out teacher forcing (this may be slow??): with prob
        # epsilon feed a sampled word instead of the gold one.
        if random.random() < epsilon:
            p = dy.softmax(s).npvalue()
            prev_word = np.random.choice(VOCAB_SIZE, p=p / p.sum())
        else:
            prev_word = next_word
    softmax_loss = dy.esum(all_losses)
    # Tag reconstruction head: hidden tanh layer, sigmoid output per tag.
    W_hidden = dy.parameter(W_hidden_p)
    b_hidden = dy.parameter(b_hidden_p)
    W_out = dy.parameter(W_tag_output_p)
    b_out = dy.parameter(b_tag_output_p)
    h = dy.dropout(dy.tanh(b_hidden + W_hidden * z), DROPOUT)
    o = dy.logistic(b_out + W_out * h)
    crossentropy_loss = dy.binary_log_loss(o, tags_tensor)
    return kl_loss, softmax_loss, crossentropy_loss
def hallucinate_tweet(given_tags):
    """Generate a tweet (word-id sequence) conditioned on a set of tags.

    The tweet-side encoder is disabled here: its (mu, log_var) half is
    zeroed, so the latent z is driven by the tag encoding alone. Decoding
    samples from the softmax at each step, capped at 20 tokens.

    Args:
        given_tags: sequence of tag indices.
    Returns:
        A list of sampled word ids.
    """
    dy.renew_cg()
    tags = given_tags
    # (Tweet-encoder branch deliberately disabled -- mu_tweet/log_var_tweet
    # are zeroed below so only the tags drive the posterior.)
    # Tag-side posterior heads (mean and standard deviation).
    W_mu_tag = dy.parameter(W_mu_tag_p)
    V_mu_tag = dy.parameter(V_mu_tag_p)
    b_mu_tag = dy.parameter(b_mu_tag_p)
    W_sig_tag = dy.parameter(W_sig_tag_p)
    V_sig_tag = dy.parameter(V_sig_tag_p)
    b_sig_tag = dy.parameter(b_sig_tag_p)
    # Compute tag encoding from a sparse multi-hot tag vector.
    tags_tensor = dy.sparse_inputTensor([tags], np.ones((len(tags), )),
                                        (NUM_TAGS, ))
    mu_tag = dy.dropout(mlp(tags_tensor, W_mu_tag, V_mu_tag, b_mu_tag),
                        DROPOUT)
    log_var_tag = dy.dropout(mlp(tags_tensor, W_sig_tag, V_sig_tag, b_sig_tag),
                             DROPOUT)
    # Combine encodings for mean and diagonal covariance.
    W_mu = dy.parameter(W_mu_p)
    b_mu = dy.parameter(b_mu_p)
    W_sig = dy.parameter(W_sig_p)
    b_sig = dy.parameter(b_sig_p)
    mu_tweet = dy.zeros(HIDDEN_DIM)
    log_var_tweet = dy.zeros(HIDDEN_DIM)
    mu = dy.affine_transform([b_mu, W_mu, dy.concatenate([mu_tweet, mu_tag])])
    log_var = dy.affine_transform(
        [b_sig, W_sig, dy.concatenate([log_var_tweet, log_var_tag])])
    # KL-Divergence computation (computed but unused in this function).
    kl_loss = -0.5 * dy.sum_elems(
        1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))
    z = reparameterize(mu, log_var)
    # Step through the output sentence, sampling one word at a time.
    all_losses = []
    current_state = lstm_decode.initial_state().set_s([z, dy.tanh(z)])
    prev_word = vocab[START]
    W_sm = dy.parameter(W_tweet_softmax_p)
    b_sm = dy.parameter(b_tweet_softmax_p)
    gen_tweet = []
    for i in range(20):
        # Feed the previous word's embedding into the decoder.
        current_state = current_state.add_input(embed[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        # Sample the next word from the (renormalized) softmax.
        p = dy.softmax(s).npvalue()
        next_word = np.random.choice(VOCAB_SIZE, p=p / p.sum())
        gen_tweet.append(next_word)
        prev_word = next_word
    return gen_tweet
def MRT_batch_update(batch, epoch):
    """Minimum-risk-training update over one batch (Python 3 dialect).

    For each sample: scores the gold action sequence, draws up to
    `sample_size` deduplicated samples from the transducer, renormalizes
    their probabilities with temperature `alpha`, and accumulates the
    expected reward as the loss. When `action_penalty` is set, a min-max
    scaled edit cost of the action sequences is mixed into the rewards.
    A single backward/update runs over the whole batch.

    NOTE(review): references `self`, `alpha_p`, `sample_size`, `verbose`,
    `action_penalty`, `cost_actions`, and `compute_reward` from an
    enclosing scope — presumably a closure defined inside a trainer
    method; confirm against the caller.
    """
    dy.renew_cg()
    # Temperature for renormalizing sample probabilities (q ~ P^alpha).
    alpha = dy.scalarInput(alpha_p)
    batch_loss = []
    rewards = []
    for sample in batch:
        lemma = sample.lemma
        word = sample.word
        word_str = sample.word_str
        feats = sample.pos, sample.feats
        actions = sample.actions
        # ORACLE PREDICTION: log-probability of the gold action sequence.
        gold_loss, _, _ = \
            self.transducer.transduce(lemma, feats, actions, external_cg=True)
        gold_loss = dy.esum(gold_loss)
        if action_penalty:
            # We will add edit cost to penalize long and intuitively
            # wasteful action sequences.
            sample_actions = [cost_actions(actions)]
        # The gold sequence seeds the sample set; -1 is a sentinel reward.
        sample_rewards = [-1.]
        sample_losses = [gold_loss]
        predictions = [word_str]
        seen_predicted_acts = {tuple(actions)}
        # SAMPLING-BASED PREDICTION (action sequences deduplicated).
        for _ in range(sample_size):
            loss, prediction, predicted_actions = \
                self.transducer.transduce(lemma, feats, sampling=True,
                                          external_cg=True)
            predicted_actions = tuple(predicted_actions)
            if predicted_actions in seen_predicted_acts:
                # Already sampled this action sequence.
                continue
            loss = dy.esum(loss)
            if loss.scalar_value() < -20:
                # Skip vanishingly unlikely samples (these are log P).
                continue
            else:
                seen_predicted_acts.add(predicted_actions)
                # COMPUTE REWARDS for the sampled prediction.
                reward = compute_reward(word, word_str, prediction)
                sample_rewards.append(reward)
                sample_losses.append(loss)
                predictions.append(prediction)
                if action_penalty:
                    sample_actions.append(cost_actions(predicted_actions))
        # SCALE & RENORMALIZE (these are log P).
        if len(sample_rewards) == 1 and sample_rewards[0] == -1.:
            # Only the gold sentinel survived -- nothing to learn from.
            if verbose: print('Nothing to update with.')
            continue
        else:
            if action_penalty:
                # Min-max scaling of the action costs to [0, 1]:
                # X_std = (X - X.min()) / (X.max() - X.min()).
                if len(set(sample_actions)) == 1:
                    sample_actions = np.zeros_like(sample_actions)
                else:
                    sample_actions = np.array(sample_actions)
                    min_score = np.min(sample_actions)
                    max_score = np.max(sample_actions)
                    sample_actions = (sample_actions - min_score) / (
                        max_score - min_score)
                # Mix scaled action costs into the rewards.
                sample_rewards = (1 - action_penalty) * np.array(
                    sample_rewards) + action_penalty * sample_actions
            sample_losses = dy.concatenate(sample_losses)
            sample_rewards = dy.inputVector(sample_rewards)
            # q proportional to P(sample)^alpha, renormalized to sum to 1.
            q_unnorm = dy.pow(dy.exp(sample_losses), alpha)
            q = dy.cdiv(q_unnorm, dy.sum_elems(q_unnorm))
            if verbose:
                print('q', q.npvalue())
                print('sample_rewards', sample_rewards.npvalue())
                print('word', word_str)
                print('predictions: ', u', '.join(predictions))
            # Expected reward under q (the risk term for this sample).
            batch_loss.append(dy.dot_product(q, sample_rewards))
    if batch_loss:
        batch_loss = dy.esum(batch_loss)
        loss = batch_loss.scalar_value()  # forward pass
        try:
            batch_loss.backward()
            self.trainer.update()
        except Exception as e:
            # Dump the state that produced the failing backward pass.
            print('Batch loss: ', loss)
            print('q', q.npvalue())
            print('q_unnorm', q_unnorm.npvalue())
            print('gold_loss', gold_loss.scalar_value())
            print('sample_rewards', sample_rewards.npvalue())
            print('word', word_str)
            print('predictions: ', u', '.join(predictions))
            raise e
        if verbose: print('Batch loss: ', loss)
    else:
        if verbose: print('Batch loss is zero.')
        loss = 0.
    return loss