def calc_score_of_history(words):
    # Lookup the embeddings and concatenate them
    emb = dy.concatenate([W_emb[x] for x in words])
    # Create the hidden layer
    h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
    # Calculate the score and return
    return dy.affine_transform([b_sm, W_sm, h])
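# --- Hedged sketch (not from the original code) ---
# The snippet above uses global parameters (W_emb, W_h, b_h, W_sm, b_sm) that are
# created elsewhere. One plausible, minimal setup; the sizes and the history length N
# are illustrative assumptions, not values from the original code.
import dynet as dy

N, EMB_SIZE, HID_SIZE, nwords = 2, 64, 128, 10000
model = dy.ParameterCollection()
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # word embeddings
W_h = model.add_parameters((HID_SIZE, EMB_SIZE * N))     # hidden-layer weights
b_h = model.add_parameters(HID_SIZE)                     # hidden-layer bias
W_sm = model.add_parameters((nwords, HID_SIZE))          # output (softmax) weights
b_sm = model.add_parameters(nwords)                      # output (softmax) bias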
def __call__(self, obs, batched=False):
    out = self.network(obs, batched)
    W, b = dy.parameter(self.W), dy.parameter(self.b)
    As = dy.affine_transform([b, W, out])
    if self.dueling:
        W_extra, b_extra = dy.parameter(self.W_extra), dy.parameter(self.b_extra)
        V = dy.affine_transform([b_extra, W_extra, out])
        return As, V
    return As
def calc_score_of_history(words, dropout=0.0):
    # Lookup the embeddings and concatenate them
    emb = dy.concatenate([W_emb[x] for x in words])
    # Create the hidden layer
    h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
    # CHANGE 2: perform dropout
    if dropout != 0.0:
        h = dy.dropout(h, dropout)
    # Calculate the score and return
    return dy.affine_transform([b_sm, W_sm, h])
def calc_score_of_histories(words, dropout=0.0):
    # This will change from a list of histories to a list of words in each history position
    words = np.transpose(words)
    # Lookup the embeddings and concatenate them
    emb = dy.concatenate([dy.lookup_batch(W_emb, x) for x in words])
    # Create the hidden layer
    h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
    # Perform dropout
    if dropout != 0.0:
        h = dy.dropout(h, dropout)
    # Calculate the score and return
    return dy.affine_transform([b_sm, W_sm, h])
def generate(sent):
    dy.renew_cg()
    # Transduce all batch elements with an LSTM
    src = sent
    # get the output of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()])
                   for x, y in LSTM_SRC.add_inputs([LOOKUP_SRC[word] for word in src])]
    src_output = src_outputs[-1]
    # gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix
    # generate until an eos tag or the maximum length is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = sos_trg
    trg_sent = []
    attention_matrix = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)
    for i in range(MAX_SENT_SIZE):
        # feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        att_output, alignment = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        attention_matrix.append(alignment)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        # pick the word with the smallest negative log probability, i.e. the most likely word
        neg_log_probs = (-dy.log_softmax(s)).value()
        next_word = np.argmin(neg_log_probs)
        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent, dy.concatenate_cols(attention_matrix).value()
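# --- Hedged sketch (not from the original code) ---
# generate() above calls calc_attention(), which is defined elsewhere. One plausible
# MLP-attention implementation consistent with how it is called here; the parameter
# names w1_att_tgt_p and w_att_p are assumptions.
def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w_att = dy.parameter(w_att_p)
    # score each source position against the current decoder state
    a_t = dy.transpose(dy.tanh(dy.colwise_add(fixed_attentional_component,
                                              w1_att_tgt * tgt_output_embedding))) * w_att
    alignment = dy.softmax(a_t)
    # attention-weighted average of the source encodings
    att_output = src_output_matrix * alignment
    return att_output, alignment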
def generate(sent):
    dy.renew_cg()
    src = sent
    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()
    # generate until an eos tag or the maximum length is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = sos_trg
    trg_sent = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    for i in range(MAX_SENT_SIZE):
        # feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        # pick the word with the smallest negative log probability, i.e. the most likely word
        neg_log_probs = (-dy.log_softmax(s)).value()
        next_word = np.argmin(neg_log_probs)
        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent
def calc_loss(sent):
    dy.renew_cg()
    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]
    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()
    # now step through the output sentence
    all_losses = []
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    for next_word in trg[1:]:
        # feed the previous word into the decoder LSTM and score the next word
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))
        prev_word = next_word
    return dy.esum(all_losses)
def calc_loss(sents):
    dy.renew_cg()
    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []
    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0
    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])
    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()
    # now decode
    all_losses = []
    # Decoder
    # need to mask padding at the end of each sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []
    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the previous words into the decoder LSTM and score the next words
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = dy.pickneglogsoftmax_batch(s, next_words)
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
def calc_reinforce_loss(words, tags, delta):
    dy.renew_cg()
    # Transduce all batch elements with an LSTM
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])
    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)
    # calculate the probability distribution over tags at each position
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]
    losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)]
    probs = [dy.softmax(score).npvalue() for score in scores]
    # then take samples from the probability distribution
    samples = [np.random.choice(range(len(x)), p=x) for x in probs]
    # calculate accuracy = reward
    correct = [sample == tag for sample, tag in zip(samples, tags)]
    r_i = float(sum(correct)) / len(correct)
    r = dy.constant((1), r_i)
    # Reward baseline for each word
    W_bl = dy.parameter(W_bl_p)
    b_bl = dy.parameter(b_bl_p)
    r_b = [dy.affine_transform([b_bl, W_bl, dy.nobackprop(x)]) for x in word_reps]
    # we need to break the computation graph here, as the baseline is trained
    # separately and is not backpropagated through when computing the overall score
    rewards_over_baseline = [(r - dy.nobackprop(x)) for x in r_b]
    # the scores for training the baseline
    baseline_scores = [dy.square(r - x) for x in r_b]
    # then calculate the scores using REINFORCE
    reinforce_scores = [r_s * score for r_s, score in zip(rewards_over_baseline, scores)]
    # for MIXER we want the first len(sent)-delta scores from cross-entropy,
    # then delta scores from REINFORCE
    if len(scores) > delta:
        mixer_scores = scores[:len(scores) - delta] + reinforce_scores[delta - 1:]
    else:
        mixer_scores = reinforce_scores
    return dy.esum(mixer_scores), dy.esum(baseline_scores)
def calc_loss(sent):
    dy.renew_cg()
    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]
    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()
    # Now compute the mean and standard deviation of the source hidden state.
    W_mean = dy.parameter(W_mean_p)
    V_mean = dy.parameter(V_mean_p)
    b_mean = dy.parameter(b_mean_p)
    W_var = dy.parameter(W_var_p)
    V_var = dy.parameter(V_var_p)
    b_var = dy.parameter(b_var_p)
    # The mean vector from the encoder.
    mu = mlp(src_output, W_mean, V_mean, b_mean)
    # This is the diagonal of the log-covariance matrix from the encoder
    # (treating it as the log variance makes the implementation easier).
    log_var = mlp(src_output, W_var, V_var, b_var)
    # Compute KL[N(mu(x), sigma(x)) || N(0, I)]
    # = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))
    z = reparameterize(mu, log_var)
    # now step through the output sentence
    all_losses = []
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    for next_word in trg[1:]:
        # feed the previous word into the decoder LSTM and score the next word
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))
        prev_word = next_word
    softmax_loss = dy.esum(all_losses)
    return kl_loss, softmax_loss
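# --- Hedged sketch (not from the original code) ---
# calc_loss() above relies on mlp() and reparameterize() helpers that are not shown.
# A minimal sketch of what they typically look like, assuming a two-layer MLP and the
# standard Gaussian reparameterization trick.
def mlp(x, W, V, b):
    # V * tanh(W * x + b)
    return V * dy.tanh(dy.affine_transform([b, W, x]))

def reparameterize(mu, log_var):
    # z = mu + sigma * eps, with eps ~ N(0, I)
    std = dy.exp(log_var * 0.5)
    eps = dy.random_normal(mu.dim()[0])
    return mu + dy.cmult(std, eps)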
def calc_scores(words):
    dy.renew_cg()
    # Transduce all batch elements with an LSTM
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])
    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]
    return scores
def cal_scores(self, src_encodings):
    src_len = len(src_encodings)
    src_encodings = dy.concatenate_cols(src_encodings)  # src_ctx_dim, src_len, batch_size

    W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head)
    b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head)
    W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep)
    b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep)

    W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head)
    b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head)
    W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep)
    b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep)

    U_arc_1 = dy.parameter(self.U_arc_1)
    u_arc_2 = dy.parameter(self.u_arc_2)

    U_label_1 = [dy.parameter(x) for x in self.U_label_1]
    u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1]
    u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2]
    b_label = [dy.parameter(x) for x in self.b_label]

    h_arc_head = dy.rectify(dy.affine_transform([b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings]))  # n_arc_ml_units, src_len, bs
    h_arc_dep = dy.rectify(dy.affine_transform([b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings]))
    h_label_head = dy.rectify(dy.affine_transform([b_label_hidden_to_head, W_label_hidden_to_head, src_encodings]))
    h_label_dep = dy.rectify(dy.affine_transform([b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings]))

    h_arc_head_transpose = dy.transpose(h_arc_head)
    h_label_head_transpose = dy.transpose(h_label_head)

    s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep, u_arc_2)

    s_label = []
    for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2, b_label):
        e1 = h_label_head_transpose * U_1 * h_label_dep
        e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len))
        e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
        s_label.append(e1 + e2 + e3 + b)
    return s_arc, s_label
def sent_lm_loss(self, sent):
    rnn_cur = self.rnn.initial_state()
    losses = []
    prev_word = self.start
    for word in sent:
        x_t = self.embeddings[prev_word]
        rnn_cur = rnn_cur.add_input(x_t)
        logits = dy.affine_transform([self.lb, self.h2l, rnn_cur.output()])
        losses.append(dy.pickneglogsoftmax(logits, word))
        prev_word = word
    return dy.esum(losses)
def __call__(self, obs, batched=False):
    out = obs if isinstance(obs, dy.Expression) else dy.inputTensor(obs, batched=batched)
    for i in range(self.n_layers):
        b, W = dy.parameter(self.bs[i]), dy.parameter(self.Ws[i])
        out = dy.affine_transform([b, W, out])
        if self.layer_norm and i != self.n_layers - 1:
            out = dy.layer_norm(out, self.ln_gs[i], self.ln_bs[i])
        if self.specified_activation:
            if self.activation[i] is not None:
                out = self.activation[i](out)
        else:
            out = self.activation(out)
    return out
def build_graph(self, x, is_train):
    # dy.renew_cg()
    if is_train:
        embeddings = [dy.dropout(self.word_embedding[w], self.dropout) for w in x]
    else:
        embeddings = [self.word_embedding[w] for w in x]
    lstm_out = self.bilstm.transduce(embeddings)
    features = [dy.affine_transform([self.linear_bias, self.linear_w, rep]) for rep in lstm_out]
    return features
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    src = src.as_tensor()

    # convolutional layer
    src = padding(src, src.dim()[0][0], src.dim()[0][1], self.filter_width, self.stride, src.dim()[1])
    l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filter_conv), stride=[self.stride, self.stride], is_valid=True))
    timestep = l1.dim()[0][1]
    features = l1.dim()[0][2]
    batch_size = l1.dim()[1]
    # transpose l1 to be (timestep, dim), but keep the batch_size.
    rhn_in = dy.reshape(l1, (timestep, features), batch_size=batch_size)
    rhn_in = [dy.pick(rhn_in, i) for i in range(timestep)]

    for l in range(self.rhn_num_hidden_layers):
        rhn_out = []
        # initialize a random vector for the first state vector, keep the same batch size.
        prev_state = dy.parameter(self.init[l])
        # begin recurrent highway network
        for t in range(timestep):
            for m in range(0, self.rhn_microsteps):
                H = dy.affine_transform([dy.parameter(self.recur[l][m][1]), dy.parameter(self.recur[l][m][0]), prev_state])
                T = dy.affine_transform([dy.parameter(self.recur[l][m][3]), dy.parameter(self.recur[l][m][2]), prev_state])
                if m == 0:
                    H += dy.parameter(self.linear[l][0]) * rhn_in[t]
                    T += dy.parameter(self.linear[l][1]) * rhn_in[t]
                H = dy.tanh(H)
                T = dy.logistic(T)
                prev_state = dy.cmult(1 - T, prev_state) + dy.cmult(T, H)  # ((1024,), batch_size)
            rhn_out.append(prev_state)
        if self.residual and l > 0:
            rhn_out = [sum(x) for x in zip(rhn_out, rhn_in)]
        rhn_in = rhn_out

    # Compute the attention-weighted average of the activations
    rhn_in = dy.concatenate_cols(rhn_in)
    scores = dy.transpose(dy.parameter(self.attention[0][1])) * dy.tanh(dy.parameter(self.attention[0][0]) * rhn_in)  # ((1, 510), batch_size)
    scores = dy.reshape(scores, (scores.dim()[0][1],), batch_size=scores.dim()[1])
    attn_out = rhn_in * dy.softmax(scores)  # rhn_in is ((1024, 510), batch_size), softmax is ((510,), batch_size)
    return ExpressionSequence(expr_tensor=attn_out)
def calc_scores_with_previous_tag(words, referent_tags=None):
    """
    Calculate scores using previous tag as input.
    If the referent tags are provided, then we will sample from previous referent tag
    or previous system prediction.
    :param words:
    :param referent_tags:
    :return:
    """
    dy.renew_cg()
    word_embs = [LOOKUP[x] for x in words]
    # Transduce all batch elements for the backward LSTM, using the original word embeddings.
    bwd_init = bwdLSTM.initial_state()
    bwd_word_reps = bwd_init.transduce(reversed(word_embs))
    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)
    scores = []
    # Transduce one by one for the forward LSTM
    fwd_init = fwdLSTM.initial_state()
    s_fwd = fwd_init
    prev_tag = start_tag
    index = 0
    for word, bwd_word_rep in zip(word_embs, reversed(bwd_word_reps)):
        # Concatenate word and tag representation just as in training.
        fwd_input = dy.concatenate([word, TAG_LOOKUP[prev_tag]])
        s_fwd = s_fwd.add_input(fwd_input)
        combined_rep = dy.concatenate([s_fwd.output(), bwd_word_rep])
        score = dy.affine_transform([b, W, combined_rep])
        prediction = np.argmax(score.npvalue())
        if referent_tags:
            if sampler.sample_true():
                prev_tag = referent_tags[index]
            else:
                prev_tag = prediction
            index += 1
        else:
            prev_tag = prediction
        scores.append(score)
    return scores
def batch_loss(self, batch, train=True):
    # load the parameters
    W_hid = dy.parameter(self.W_hid)
    b_hid = dy.parameter(self.b_hid)
    W_out = dy.parameter(self.W_out)

    losses = []
    for _, sent in batch:
        for i in range(self.n_word_context, len(sent)):
            prev_word_ix = sent[i - 1]
            curr_word_ix = sent[i]
            ct1 = dy.lookup(self.embed, prev_word_ix)
            ctx = ct1
            if self.n_word_context == 2:
                prev_prev_word_ix = sent[i - 2]
                ct2 = dy.lookup(self.embed, prev_prev_word_ix)
                ctx = dy.concatenate([ct1, ct2])
            elif self.n_word_context == 3:
                prev_prev_prev_word_ix = sent[i - 3]
                ct3 = dy.lookup(self.embed, prev_prev_prev_word_ix)
                prev_prev_word_ix = sent[i - 2]
                ct2 = dy.lookup(self.embed, prev_prev_word_ix)
                ctx = dy.concatenate([ct1, ct2, ct3])

            # hid is the hidden layer output, size=hidden_size
            # compute b_hid + W_hid * ctx, but faster
            hid = dy.affine_transform([b_hid, W_hid, ctx])
            hid = dy.tanh(hid)

            # out is the prediction of the next word, size=vocab_size
            out = W_out * hid

            # Interpretation: the model estimates that
            #   log P(curr_word=k | prev_word) ~ out[k]
            # in other words,
            #   P(curr_word=k | prev_word) = exp(out[k]) / sum_j exp(out[j])
            #                              = softmax(out)[k]
            # We want to maximize the probability of the correct word
            # (equivalently, minimize the negative log-probability).
            loss = dy.pickneglogsoftmax(out, curr_word_ix)
            losses.append(loss)

    # esum simply adds up the expressions in the list
    return dy.esum(losses)
def beam_search_generate(self, src_seq, beam_n=5):
    dynet.renew_cg()
    embedded = self.embed_seq(src_seq)
    input_vectors = self.encode_seq(embedded)

    w = dynet.parameter(self.decoder_w)
    b = dynet.parameter(self.decoder_b)

    s = self.dec_lstm.initial_state()
    s = s.add_input(input_vectors[-1])
    beams = [{"state": s, "out": [], "err": 0}]
    completed_beams = []
    while len(completed_beams) < beam_n:
        potential_beams = []
        for beam in beams:
            if len(beam["out"]) > 0:
                embed_vector = self.tgt_lookup[beam["out"][-1].i]
                s = beam["state"].add_input(embed_vector)
            out_vector = dynet.affine_transform([b, w, s.output()])
            probs = dynet.softmax(out_vector)
            probs = probs.vec_value()
            for potential_next_i in range(len(probs)):
                potential_beams.append({
                    "state": s,
                    "out": beam["out"] + [self.tgt_vocab[potential_next_i]],
                    "err": beam["err"] - math.log(probs[potential_next_i])
                })
        potential_beams.sort(key=lambda x: x["err"])
        beams = potential_beams[:beam_n - len(completed_beams)]
        completed_beams = completed_beams + [
            beam for beam in beams
            if beam["out"][-1] == self.tgt_vocab.END_TOK or len(beam["out"]) > 5 * len(src_seq)
        ]
        beams = [
            beam for beam in beams
            if beam["out"][-1] != self.tgt_vocab.END_TOK and len(beam["out"]) <= 5 * len(src_seq)
        ]
    completed_beams.sort(key=lambda x: x["err"])
    return [beam["out"] for beam in completed_beams]
def get_decode_loss(self, src_encodings, decoder_init, tgt_sents):
    W_s = dy.parameter(self.W_s)
    b_s = dy.parameter(self.b_s)
    W_h = dy.parameter(self.W_h)
    b_h = dy.parameter(self.b_h)
    W_y = dy.parameter(self.W_y)
    b_y = dy.parameter(self.b_y)

    tgt_words, tgt_masks = input_transpose(tgt_sents)
    batch_size = len(tgt_sents)

    decoder_init_cell = W_s * decoder_init + b_s
    s = self.dec_builder.initial_state([decoder_init_cell, dy.tanh(decoder_init_cell)])
    ctx_tm1 = dy.vecInput(self.args.hidden_size * 2)
    losses = []

    # start from <S>, until y_{T-1}
    for t, (y_ref_t, mask_t) in enumerate(zip(tgt_words[1:], tgt_masks[1:]), start=1):
        y_tm1_embed = dy.lookup_batch(self.tgt_lookup, tgt_words[t - 1])
        x = dy.concatenate([y_tm1_embed, ctx_tm1])
        s = s.add_input(x)
        h_t = s.output()
        ctx_t, alpha_t = self.attention(src_encodings, h_t, batch_size)

        # read_out = dy.tanh(W_h * dy.concatenate([h_t, ctx_t]) + b_h)
        read_out = dy.tanh(dy.affine_transform([b_h, W_h, dy.concatenate([h_t, ctx_t])]))
        if args.dropout > 0.:
            read_out = dy.dropout(read_out, args.dropout)
        y_t = W_y * read_out + b_y
        loss_t = dy.pickneglogsoftmax_batch(y_t, y_ref_t)

        if 0 in mask_t:
            mask_expr = dy.inputVector(mask_t)
            mask_expr = dy.reshape(mask_expr, (1,), batch_size)
            loss_t = loss_t * mask_expr

        losses.append(loss_t)
        ctx_tm1 = ctx_t

    loss = dy.esum(losses)
    loss = dy.sum_batches(loss) / batch_size
    return loss
def get_attention_state(self, main, states):
    if not states:
        return main
    affinities = [self.get_affinity(main, state) for state in states]
    states_exp = dy.transpose(dy.concatenate_cols(states))
    scores_exp = dy.softmax(dy.concatenate(affinities))
    context = None
    if len(states) == 1:
        context = dy.transpose(states_exp)
    else:
        context = dy.transpose(states_exp) * scores_exp
    Wa = dy.parameter(self.Wa)
    b = dy.parameter(self.b)
    hidden_state = dy.affine_transform([b, Wa, dy.concatenate([main, context])])
    return hidden_state
def calc_loss_basic(self, frames, label):
    # Renew the computation graph
    dy.renew_cg()

    # Initialize LSTM
    init_state_src = self.lstm_builder.initial_state()

    # Instantiate the params
    W_mean = dy.parameter(self.W_mean_p)
    V_mean = dy.parameter(self.V_mean_p)
    b_mean = dy.parameter(self.b_mean_p)
    W_var = dy.parameter(self.W_var_p)
    V_var = dy.parameter(self.V_var_p)
    b_var = dy.parameter(self.b_var_p)

    input_frames = dy.inputTensor(frames)
    output_label = label

    # Get the LSTM embeddings
    src_output = init_state_src.add_inputs([frame for frame in input_frames])[-1].output()

    # Get the mean and diagonal log covariance from the encoder
    mu = self.mlp(src_output, W_mean, V_mean, b_mean)
    log_var = self.mlp(src_output, W_var, V_var, b_var)

    # Compute the KL divergence loss
    kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    # Reparameterize
    z = self.reparameterize(mu, log_var)

    W_sm = dy.parameter(self.W_sm_p)
    b_sm = dy.parameter(self.b_sm_p)

    # Calculate the reconstruction loss
    pred = dy.affine_transform([b_sm, W_sm, z])
    label_embedding = self.lookup[label]
    recons_loss = dy.pickneglogsoftmax(pred, label)

    return kl_loss, recons_loss
def train_example(self, example):
    """
    Parameters
    ----------
    example : tuple
        A single training example of form (encoder_inputs, ground_labels)
        with initial padding to offset the encoder inputs.

    Output
    ------
    Loss for example.
    """
    encoder_input = example[0]
    ground_labels = example[1]

    context_outputs = self.encoding(encoder_input)

    R = dy.parameter(self.R)
    b = dy.parameter(self.b)

    # Decoding:
    losses = []
    for (context_output, ground_label) in zip(context_outputs, ground_labels):
        # context_output : state from a single timestep of context_encoder
        # ground_label : ground truth labels for the given sentence (for teacher forcing)
        decoder_input = [self.vocab.index("<START>")] + ground_label
        decoder_target = ground_label + [self.vocab.index("<END>")]

        embedded_decoder_input = [self.embeddings[word] for word in decoder_input]
        decoder_initial_state = self.output_decoder.initial_state(vecs=[context_output, context_output])
        decoder_output = decoder_initial_state.transduce(embedded_decoder_input)
        log_probs_char = [dy.affine_transform([b, R, h_t]) for h_t in decoder_output]

        for (log_prob, target) in zip(log_probs_char, decoder_target):
            losses.append(dy.pickneglogsoftmax(log_prob, target))

    loss = dy.esum(losses)
    return loss
def linear_layer(exp, weights, biases=None):
    """
    Linear layer with weights and biases.

    Inputs:
        exp (dy.Expression): A DyNet tensor.
        weights (dy.Parameters): DyNet parameters representing the weights (a matrix).
        biases (dy.Parameters, optional): DyNet parameters representing the biases (a vector).

    Returns:
        dy.Expression representing exp * weights + biases
    """
    if biases:
        return dy.affine_transform([add_dim(biases), add_dim(exp) if is_vector(exp) else exp, weights])
    else:
        return linear_transform(exp, weights)
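# --- Hedged sketch (not from the original code) ---
# linear_layer() above relies on add_dim(), is_vector() and linear_transform() helpers
# defined elsewhere. Plausible minimal versions, assuming row-vector inputs:
def is_vector(exp):
    # an expression with a single (non-batch) dimension
    return len(exp.dim()[0]) == 1

def add_dim(exp):
    # reshape a length-d vector into a 1 x d matrix so it can left-multiply the weights
    return dy.reshape(exp, (1, exp.dim()[0][0]))

def linear_transform(exp, weights):
    # plain matrix multiplication, no bias term
    return (add_dim(exp) if is_vector(exp) else exp) * weights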
def step(self, x, hx, cx):
    if not self.test:
        if self.dropout_x > 0:
            x = dy.cmult(self.dropout_mask_x, x)
        if self.dropout_h > 0:
            hx = dy.cmult(self.dropout_mask_h, hx)
    gates = dy.affine_transform([self.bias, self.weight_ih, x, self.weight_hh, hx])
    i = dy.pickrange(gates, 0, self.n_hidden)
    f = dy.pickrange(gates, self.n_hidden, self.n_hidden * 2)
    g = dy.pickrange(gates, self.n_hidden * 2, self.n_hidden * 3)
    o = dy.pickrange(gates, self.n_hidden * 3, self.n_hidden * 4)
    i, f, g, o = dy.logistic(i), dy.logistic(f), dy.tanh(g), dy.logistic(o)
    cy = dy.cmult(f, cx) + dy.cmult(i, g)
    hy = dy.cmult(o, dy.tanh(cy))
    return hy, cy
def initialize(self, src_encodings, training=True):
    if training and self.rnn_dropout > 0.:
        self.decoder.set_dropout(self.rnn_dropout)
    else:
        self.decoder.disable_dropout()

    with parameters(self.src_enc_trans_W, self.src_enc_trans_b,
                    trainable=training) as (src_enc_trans_W, src_enc_trans_b):
        last_enc_state = src_encodings[-1]
        decoder_init_state = dy.tanh(dy.affine_transform([src_enc_trans_b, src_enc_trans_W, last_enc_state]))
        state = self.decoder.initial_state([decoder_init_state, decoder_init_state])
    return state
def loss(self, observation, instance):
    #trans = instance.transformation
    #if trans not in self.known_transformations:
    #k  newtrans = list(self.param_dict.keys())[0][0]  ### SUPER ARBITRARY
    #k  tqdm.write("WARNING: unknown transformtion picked for instance {}; using transformation {}".format(trans, newtrans))
    #k  trans = newtrans
    trans = 'lul'
    b = dy.parameter(self.param_dict[(trans, 'b')])
    W = dy.parameter(self.param_dict[(trans, 'W')])
    features, label = observation
    prediction = dy.softmax(dy.affine_transform([b, W, dy.inputVector(features)]))
    loss = -dy.log(dy.pick(prediction, label))
    return prediction, loss
def generate(sent):
    dy.renew_cg()
    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]
    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()
    # generate until an eos tag or the maximum length is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = sos_trg
    trg_sent = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    for i in range(MAX_SENT_SIZE):
        # feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        # pick the word with the smallest negative log probability, i.e. the most likely word
        neg_log_probs = (-dy.log_softmax(s)).value()
        next_word = np.argmin(neg_log_probs)
        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent
def decode_batch(self, encoding, output_batch):
    """
    Batch decoding function
    :param encoding: last hidden state from encoder
    :param output_batch: list of output sentences in format [word1, word2..]
    :return: loss
    """
    w = dynet.parameter(self.decoder_w)
    b = dynet.parameter(self.decoder_b)

    s = self.dec_lstm.initial_state().add_input(encoding)
    losses = []

    maxSentLength = max([len(sent) for sent in output_batch])
    wids = []
    masks = []
    for j in range(maxSentLength):
        wids.append([(self.tgt_vocab[sent[j]].i if len(sent) > j else self.tgt_vocab.END_TOK.i)
                     for sent in output_batch])
        mask = [(1 if len(sent) > j else 0) for sent in output_batch]
        masks.append(mask)

    for wid, mask in zip(wids, masks):
        # apply dropout
        y = s.output()
        if args.dropout:
            y = dynet.dropout(y, self.args.dropout)
        # calculate the softmax and loss
        score = dynet.affine_transform([b, w, y])
        loss = dynet.pickneglogsoftmax_batch(score, wid)
        # mask the loss if at least one sentence is shorter than maxSentLength
        if 0 in mask:
            mask_expr = dynet.inputVector(mask)
            mask_expr = dynet.reshape(mask_expr, (1,), len(mask))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        embed_vector = dynet.lookup_batch(self.tgt_lookup, wid)
        s = s.add_input(embed_vector)

    return dynet.sum_batches(dynet.esum(losses))
def _compute(self, input):
    if not self.bias:
        output = self.W * input
    else:
        output = dy.affine_transform([self.b, self.W, input])
    if self.act == 'linear':
        return output
    elif self.act == 'sigmoid':
        return sigmoid(output)
    elif self.act == 'tanh':
        return tanh(output)
    elif self.act == 'ptanh':
        return penalized_tanh(output)
    elif self.act == 'relu':
        return relu(output)
    elif self.act == 'elu':
        return elu(output)
    raise ValueError('Unknown activation function: ' + self.act)
def generate(self, src, sampled=False):
    embedding = self.embed_seq(src)
    encoding = self.encode_seq(embedding)[-1]

    w = dynet.parameter(self.decoder_w)
    b = dynet.parameter(self.decoder_b)

    s = self.dec_lstm.initial_state().add_input(encoding)

    out = []
    for _ in range(5 * len(src)):
        out_vector = dynet.affine_transform([b, w, s.output()])
        probs = dynet.softmax(out_vector)
        selection = np.argmax(probs.value())
        out.append(self.tgt_vocab[selection])
        if out[-1].s == self.tgt_vocab.END_TOK:
            break
        embed_vector = self.tgt_lookup[selection]
        s = s.add_input(embed_vector)
    return out
def BuildLMGraph(self, sents):
    dy.renew_cg()
    # initialize the RNN
    init_state = self.builder.initial_state()
    # parameters -> expressions
    R = dy.parameter(self.R)
    bias = dy.parameter(self.bias)

    S = vocab.w2i["<s>"]
    # get the cids and masks for each step
    tot_chars = 0
    cids = []
    masks = []
    for i in range(len(sents[0])):
        cids.append([(vocab.w2i[sent[i]] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_chars += sum(mask)

    # start the rnn with "<s>"
    init_ids = cids[0]
    s = init_state.add_input(dy.lookup_batch(self.lookup, init_ids))

    losses = []
    # feed char vectors into the RNN and predict the next char
    for cid, mask in zip(cids[1:], masks[1:]):
        score = dy.affine_transform([bias, R, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, cid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        cemb = dy.lookup_batch(self.lookup, cid)
        s = s.add_input(cemb)

    return dy.sum_batches(dy.esum(losses)), tot_chars
def transduce(self, embed_sent):
    src = embed_sent.as_tensor()

    W = dy.parameter(self.pW)
    b = dy.parameter(self.pb)

    l1 = dy.affine_transform([b, W, src])
    output = l1
    if self.nonlinearity == 'linear':
        output = l1
    elif self.nonlinearity == 'sigmoid':
        output = dy.logistic(l1)
    elif self.nonlinearity == 'tanh':
        output = 2 * dy.logistic(l1) - 1
    elif self.nonlinearity == 'relu':
        output = dy.rectify(l1)

    output_seq = ExpressionSequence(expr_tensor=output)
    self._final_states = [FinalTransducerState(output_seq[-1])]
    return output_seq
def decode_batch(self, encoding, output_batch):
    w = dynet.parameter(self.decoder_w)
    b = dynet.parameter(self.decoder_b)

    s = self.dec_lstm.initial_state()
    losses = []

    maxSentLength = max([len(sent) for sent in output_batch])
    wids = []
    masks = []
    for j in range(maxSentLength):
        wids.append([(self.tgt_vocab[sent[j]].i if len(sent) > j else self.tgt_vocab.END_TOK.i)
                     for sent in output_batch])
        mask = [(1 if len(sent) > j else 0) for sent in output_batch]
        masks.append(mask)

    s = s.add_input(dynet.concatenate([encoding[-1], dynet.vecInput(self.args.hidden_dim * 2)]))
    for wid, mask in zip(wids, masks):
        # calculate the softmax and loss
        score = dynet.affine_transform([b, w, s.output()])
        loss = dynet.pickneglogsoftmax_batch(score, wid)
        # mask the loss if at least one sentence is shorter
        if 0 in mask:
            mask_expr = dynet.inputVector(mask)
            mask_expr = dynet.reshape(mask_expr, (1,), len(output_batch))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        embed_vector = dynet.lookup_batch(self.tgt_lookup, wid)
        attn_vector = self.attend(encoding, s)
        inp = dynet.concatenate([embed_vector, attn_vector])
        s = s.add_input(inp)

    return dynet.sum_batches(dynet.esum(losses))
def get_score(self, tree_encoder):
    seq_len = len(tree_encoder.all_states)
    encode_repr = dy.concatenate_cols([item.output for item in tree_encoder.all_states])
    # [dy.inputTensor(np.zeros(self.hidden_dim, 1), batch=False)] +
    W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(self.mlp_dep_b)
    W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(self.mlp_head_b)
    dep_rel = leaky_relu(dy.affine_transform([b_dep, W_dep, encode_repr]))
    head_rel = leaky_relu(dy.affine_transform([b_head, W_head, encode_repr]))
    # if self._train_flag:
    #     dep_rel, head_rel = dy.dropout_dim(dep_rel, 1, self.dropout_mlp), dy.dropout_dim(head_rel, 1, self.dropout_mlp)
    W_rel = dy.parameter(self.rel_W)
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size, seq_len, 1,
                          num_outputs=len(self.rels), bias_x=True, bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    rel_logits = dy.transpose(rel_logits, dims=[0, 2, 1])
    return rel_logits, rel_logits.value()
def calc_lm_loss(sents):
    dy.renew_cg()
    # parameters -> expressions
    #W_exp = dy.parameter(W_sm)
    #b_exp = dy.parameter(b_sm)

    # initialize the RNN
    f_init = RNN.initial_state()

    # get the wids and masks for each step
    tot_words = 0
    wids = []
    masks = []
    for i in range(len(sents[0])):
        wids.append([(sent[i] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_words += sum(mask)

    # start the rnn by inputting "<s>"
    init_ids = [S] * len(sents)
    s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP, init_ids))

    # feed word vectors into the RNN and predict the next word
    losses = []
    for wid, mask in zip(wids, masks):
        # calculate the softmax and loss
        score = dy.affine_transform([b_sm, W_sm, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, wid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        wemb = dy.lookup_batch(WORDS_LOOKUP, wid)
        s = s.add_input(wemb)

    return dy.sum_batches(dy.esum(losses)), tot_words
def px(self, state, utterance):
    """
    Calculate the probability of utterance given state.
    """
    R = dy.parameter(self.R)
    b = dy.parameter(self.b)

    decoder_input = [self.vocab.index("<START>")] + utterance
    embedded_decoder_input = [dy.concatenate([self.embeddings[word], state]) for word in decoder_input]
    decoder_initial_state = self.output_decoder.initial_state()
    decoder_output = decoder_initial_state.transduce(embedded_decoder_input)
    log_probs_char = [dy.affine_transform([b, R, h_t]) for h_t in decoder_output]
    return log_probs_char
def calc_loss_basic(self, embedding, label):
    # Renew the computation graph
    dy.renew_cg()

    # Instantiate the params
    W_mean = dy.parameter(self.W_mean_p)
    V_mean = dy.parameter(self.V_mean_p)
    b_mean = dy.parameter(self.b_mean_p)
    W_var = dy.parameter(self.W_var_p)
    V_var = dy.parameter(self.V_var_p)
    b_var = dy.parameter(self.b_var_p)

    input_embedding = dy.inputTensor(embedding)
    output_label = label

    # Get the DNN embeddings
    src_output = self.dnn.predict(input_embedding)

    # Get the mean and diagonal log covariance from the encoder
    mu = self.mlp(src_output, W_mean, V_mean, b_mean)
    log_var = self.mlp(src_output, W_var, V_var, b_var)

    # Compute the KL divergence loss
    kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    # Reparameterize
    z = self.reparameterize(mu, log_var)

    W_sm = dy.parameter(self.W_sm_p)
    b_sm = dy.parameter(self.b_sm_p)

    # Calculate the reconstruction loss
    pred = dy.selu(dy.affine_transform([b_sm, W_sm, z]))
    label_embedding = self.lookup[label]
    recons_loss = dy.pickneglogsoftmax(pred, label)

    return kl_loss, recons_loss
def get_sentence_level_bilstm_outputs(self, combined_word_representations,
                                      which_layer_to_use_for_morpho_disamb):
    """
    This function produces the context representations at each level given the word
    representations for each word, and returns the last layer's output together with
    the specific layer output which we want to use for morphological disambiguation.
    :param combined_word_representations:
    :param which_layer_to_use_for_morpho_disamb:
    :type which_layer_to_use_for_morpho_disamb: int
    :return: two outputs: 1) layer output to be used for NER loss,
                          2) layer output to be used for MD loss
    """
    last_layer_context_representations, multilayered_context_representations = \
        self.sentence_level_bilstm_layer.transduce(combined_word_representations)
    last_layer_context_representations = \
        [dynet.tanh(dynet.affine_transform([self.tanh_layer_b.expr(),
                                            self.tanh_layer_W.expr(),
                                            context]))
         for context in last_layer_context_representations]
    return last_layer_context_representations, \
        multilayered_context_representations[which_layer_to_use_for_morpho_disamb - 1]
def init_params(self, src_encH, batch_size, train):
    if train:
        self.lstmbuilder.set_dropout(self.dropout)
    else:
        self.lstmbuilder.disable_dropout()

    self.Wp = self.Wp_p.expr()
    self.bp = self.bp_p.expr()
    self.Wo = self.Wo_p.expr()
    self.bo = self.bo_p.expr()
    self.E = self.E_p.expr()
    self.b = self.b_p.expr(False)
    self.batch_size = batch_size

    last_enc = dy.pick(src_encH, index=src_encH.dim()[0][-1] - 1, dim=1)
    init_state = dy.affine_transform([self.bp, self.Wp, last_enc])
    init_state = [init_state, dy.zeros((self.hidden_dims,), batch_size=batch_size)]
    self.dec_state = self.lstmbuilder.initial_state(init_state)
    if train:
        self.lstmbuilder.set_dropout_masks(batch_size)
def init(self, H, y, usr, test=True, update=True):
    bs = len(y[0])
    if not test:
        self.lstm.set_dropout(self.dr)
    else:
        self.lstm.disable_dropout()
    # Initialize first state of the decoder with the last state of the encoder
    self.Wp = self.Wp_p.expr(update)
    self.bp = self.bp_p.expr(update)
    last_enc = dy.pick(H, index=H.dim()[0][-1] - 1, dim=1)
    init_state = dy.affine_transform([self.bp, self.Wp, last_enc])
    init_state = [dy.zeros(self.dh, batch_size=bs), init_state]
    self.ds = self.lstm.initial_state(init_state, update=update)
    # Initialize dropout masks
    if not test:
        self.lstm.set_dropout_masks(bs)
    self.Wo = dy.contract3d_1d_bias(self.To_p.expr(update), usr, self.Wo_p.expr(update))
    self.bo = self.bo_p.expr(update)
    self.E = self.E_p.expr(update)
    self.b = self.b_p.expr(False)
def translate(self, bisents, beam_size, max_output_len, length_norm_alpha,
              output_file, relative, absolute, local, candidate):
    avg_fan_outs = []
    total_fan_outs = []
    with open(output_file, 'w') as output:
        for i in range(len(bisents)):
            print("Translating sentence", i)
            src_sent = bisents[i][0]
            dy.renew_cg()
            self.encode([src_sent])
            self.decoder.init(
                dy.affine_transform([
                    dy.parameter(self.b_bridge),
                    dy.parameter(self.W_bridge),
                    self.encoder.final_state()
                ]))
            beam_search = BeamSearch(beam_size, max_output_len, length_norm_alpha)
            beam_search.set_pruning_strategy(relative, absolute, local, candidate)
            k_best_output, avg_fan_out, total_fan_out, num_pruned = beam_search.search(self)
            print("pruned:", num_pruned)
            print("avg fan out:", avg_fan_out)
            print("total fan out:", total_fan_out)
            # remove start and end symbols
            words = k_best_output[1:-1] if k_best_output[-1] == self.tgt_vocab.eos else k_best_output[1:]
            output_sent = [self.tgt_vocab.i2w[word] for word in words]
            avg_fan_outs.append(avg_fan_out)
            total_fan_outs.append(total_fan_out)
            output.write(" ".join(output_sent) + '\n')
            if (i + 1) % 100 == 0:
                output.flush()
    print("avg avg fan out:", sum(avg_fan_outs) / len(avg_fan_outs))
    print("avg total fan out:", sum(total_fan_outs) / len(total_fan_outs))
def expr_for_tree(self, tree, decorate=False, training=True):
    if tree.isleaf():
        raise RuntimeError('Tree structure error: meet with leaves')
    if len(tree.children) == 1:
        if not tree.children[0].isleaf():
            raise RuntimeError('Tree structure error: tree nodes with one child should be a leaf')
        emb = self.E[self.w2i.get(tree.children[0].label, 0)]
        Wi, Wo, Wu = [dy.parameter(w) for w in self.WS]
        bi, bo, bu, _ = [dy.parameter(b) for b in self.BS]
        i = dy.logistic(dy.affine_transform([bi, Wi, emb]))
        o = dy.logistic(dy.affine_transform([bo, Wo, emb]))
        u = dy.tanh(dy.affine_transform([bu, Wu, emb]))
        c = dy.cmult(i, u)
        h = dy.cmult(o, dy.tanh(c))
        if decorate:
            tree._e = h
        return h, c
    if len(tree.children) != 2:
        raise RuntimeError('Tree structure error: only binary trees are supported.')
    e1, c1 = self.expr_for_tree(tree.children[0], decorate)
    e2, c2 = self.expr_for_tree(tree.children[1], decorate)
    Ui, Uo, Uu = [dy.parameter(u) for u in self.US]
    Uf1, Uf2 = [dy.parameter(u) for u in self.UFS]
    bi, bo, bu, bf = [dy.parameter(b) for b in self.BS]
    e = dy.concatenate([e1, e2])
    i = dy.logistic(dy.affine_transform([bi, Ui, e]))
    o = dy.logistic(dy.affine_transform([bo, Uo, e]))
    f1 = dy.logistic(dy.affine_transform([bf, Uf1, e]))
    f2 = dy.logistic(dy.affine_transform([bf, Uf2, e]))
    u = dy.tanh(dy.affine_transform([bu, Uu, e]))
    c = dy.cmult(i, u) + dy.cmult(f1, c1) + dy.cmult(f2, c2)
    h = dy.cmult(o, dy.tanh(c))
    if decorate:
        tree._e = h
    return h, c
def calc_lm_loss(sents):
    dy.renew_cg()

    # initialize the RNN
    f_init = RNN.initial_state()

    # get the wids and masks for each step
    tot_words = 0
    wids = []
    masks = []
    for i in range(len(sents[0])):
        wids.append([(sent[i] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_words += sum(mask)

    # start the rnn by inputting "<s>"
    init_ids = [S] * len(sents)
    s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP, init_ids))

    # feed word vectors into the RNN and predict the next word
    losses = []
    for wid, mask in zip(wids, masks):
        # calculate the softmax and loss
        score = dy.affine_transform([b_exp, W_exp, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, wid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        wemb = dy.lookup_batch(WORDS_LOOKUP, wid)
        s = s.add_input(wemb)

    return dy.sum_batches(dy.esum(losses)), tot_words
def calc_scores(words):
    """
    Calculate scores using BiLSTM.
    :param words:
    :return:
    """
    dy.renew_cg()
    word_embs = [LOOKUP[x] for x in words]
    # Transduce all batch elements with an LSTM
    fwd_init = fwdLSTM.initial_state()
    fwd_word_reps = fwd_init.transduce(word_embs)
    bwd_init = bwdLSTM.initial_state()
    bwd_word_reps = bwd_init.transduce(reversed(word_embs))
    combined_word_reps = [dy.concatenate([f, b]) for f, b in zip(fwd_word_reps, reversed(bwd_word_reps))]
    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)
    scores = [dy.affine_transform([b, W, x]) for x in combined_word_reps]
    return scores
def highway(input_, train):
    for func, weight, bias in zip(funcs, weights, biases):
        proj = dy.rectify(func(input_, train))
        transform = dy.logistic(dy.affine_transform([bias, weight, input_]))
        input_ = dy.cmult(transform, proj) + dy.cmult(input_, 1 - transform)
    return input_
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    # get the outputs of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()])
                   for x, y in LSTM_SRC.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])]
    src_output = src_outputs[-1]

    # gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    # now decode
    all_losses = []

    # Decoder
    # need to mask padding at the end of each sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the previous words into the decoder LSTM and score the next words
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        att_output, _ = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = dy.pickneglogsoftmax_batch(s, next_words)
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words

    return dy.sum_batches(dy.esum(all_losses)), num_words