def test_update(self):
    ones = np.ones((10, 10))
    updated = np.ones((10, 10)) * 0.99
    gradient = np.ones((10, 10)) * 0.01
    dy.renew_cg()
    pp1 = dy.parameter(self.p1)
    pp2 = dy.parameter(self.p2)
    a = pp1 * self.lp1[1]
    b = pp2 * self.lp2[1]
    l = dy.dot_product(a, b) / 100
    self.assertEqual(l.scalar_value(), 10, msg=str(l.scalar_value()))
    l.backward()
    self.assertTrue(np.allclose(self.p1.grad_as_array(), 0.1 * ones),
                    msg=np.array_str(self.p1.grad_as_array()))
    self.assertTrue(np.allclose(self.p2.grad_as_array(), 0.1 * ones),
                    msg=np.array_str(self.p2.grad_as_array()))
    self.assertTrue(np.allclose(self.lp1.grad_as_array()[1], ones[0]),
                    msg=np.array_str(self.lp1.grad_as_array()))
    self.assertTrue(np.allclose(self.lp2.grad_as_array()[1], ones[0]),
                    msg=np.array_str(self.lp2.grad_as_array()))
    self.trainer.update()
    self.assertTrue(np.allclose(self.p1.as_array(), ones * 0.99),
                    msg=np.array_str(self.p1.as_array()))
    self.assertTrue(np.allclose(self.p2.as_array(), ones * 0.99),
                    msg=np.array_str(self.p2.as_array()))
    self.assertTrue(np.allclose(self.lp1.as_array()[1], ones[0] * 0.9),
                    msg=np.array_str(self.lp1.as_array()[1]))
    self.assertTrue(np.allclose(self.lp2.as_array()[1], ones[0] * 0.9),
                    msg=np.array_str(self.lp2.as_array()[1]))
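# A minimal sketch of the fixture the test above assumes, inferred from the
# assertions: all parameters start at 1 and the trainer is plain SGD with
# learning rate 0.1. None of these names or initializers are confirmed by the
# original; they are assumptions that make the expected values work out
# (e.g. p1: 1 - 0.1 * 0.1 = 0.99, lp1: 1 - 0.1 * 1 = 0.9).
import unittest
import numpy as np
import dynet as dy

class TestTrainer(unittest.TestCase):
    def setUp(self):
        self.m = dy.ParameterCollection()
        self.p1 = self.m.add_parameters((10, 10), init=dy.ConstInitializer(1))
        self.p2 = self.m.add_parameters((10, 10), init=dy.ConstInitializer(1))
        self.lp1 = self.m.add_lookup_parameters((10, 10), init=dy.ConstInitializer(1))
        self.lp2 = self.m.add_lookup_parameters((10, 10), init=dy.ConstInitializer(1))
        self.trainer = dy.SimpleSGDTrainer(self.m, learning_rate=0.1)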
def generate(input, enc_fwd_lstm, enc_bwd_lstm, dec_lstm):
    def sample(probs):
        rnd = random.random()
        for i, p in enumerate(probs):
            rnd -= p
            if rnd <= 0:
                break
        return i

    embedded = embed_sentence(input)
    encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded)

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(
        dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    out = ''
    count_EOS = 0
    for i in range(len(input) * 2):
        if count_EOS == 2:
            break
        vector = dy.concatenate([attend(encoded, s), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        probs = probs.vec_value()
        next_char = sample(probs)
        last_output_embeddings = output_lookup[next_char]
        if int2char[next_char] == EOS:
            count_EOS += 1
            continue
        out += int2char[next_char]
    return out
def expr_for_tree(self, tree):
    if tree.isleaf():
        return self.E[self.w2i.get(tree.label, 0)]
    if len(tree.children) == 1:
        assert(tree.children[0].isleaf())
        emb = self.expr_for_tree(tree.children[0])
        Wi, Wo, Wu = [dy.parameter(w) for w in self.WS]
        bi, bo, bu, _ = [dy.parameter(b) for b in self.BS]
        i = dy.logistic(Wi * emb + bi)
        o = dy.logistic(Wo * emb + bo)
        u = dy.tanh(Wu * emb + bu)
        c = dy.cmult(i, u)
        expr = dy.cmult(o, dy.tanh(c))
        return expr
    assert(len(tree.children) == 2), tree.children[0]
    e1 = self.expr_for_tree(tree.children[0])
    e2 = self.expr_for_tree(tree.children[1])
    Ui, Uo, Uu = [dy.parameter(u) for u in self.US]
    Uf1, Uf2 = [dy.parameter(u) for u in self.UFS]
    bi, bo, bu, bf = [dy.parameter(b) for b in self.BS]
    e = dy.concatenate([e1, e2])
    i = dy.logistic(Ui * e + bi)
    o = dy.logistic(Uo * e + bo)
    f1 = dy.logistic(Uf1 * e1 + bf)
    f2 = dy.logistic(Uf2 * e2 + bf)
    u = dy.tanh(Uu * e + bu)
    c = dy.cmult(i, u) + dy.cmult(f1, e1) + dy.cmult(f2, e2)
    h = dy.cmult(o, dy.tanh(c))
    expr = h
    return expr
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(vectors)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(
        dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    loss = []

    for char in output:
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
def generate(in_seq, enc_fwd_lstm, enc_bwd_lstm, dec_lstm):
    embedded = embed_sentence(in_seq)
    encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded)

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(encoded)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(
        dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))

    out = ''
    count_EOS = 0
    for i in range(len(in_seq) * 2):
        if count_EOS == 2:
            break
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector).vec_value()
        next_char = probs.index(max(probs))
        last_output_embeddings = output_lookup[next_char]
        if int2char[next_char] == EOS:
            count_EOS += 1
            continue
        out += int2char[next_char]
    return out
def generate(sent):
    dy.renew_cg()

    src = sent

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()

    # generate until an eos tag or the maximum length is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])

    prev_word = sos_trg
    trg_sent = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for i in range(MAX_SENT_SIZE):
        # feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        # probs holds negative log probabilities, so the most likely word is the argmin
        probs = (-dy.log_softmax(s)).value()
        next_word = np.argmin(probs)

        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()

    # now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        # feed the previous word into the LSTM and predict the next one
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))
        prev_word = next_word
    return dy.esum(all_losses)
def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    w1_att_src = dy.parameter(w1_att_src_p)
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    a_t = dy.transpose(dy.tanh(dy.colwise_add(fixed_attentional_component,
                                              w1_att_tgt * tgt_output_embedding))) * w2_att
    alignment = dy.softmax(a_t)
    att_output = src_output_matrix * alignment
    return att_output, alignment
def __call__(self, obs, batched=False):
    out = self.network(obs, batched)
    W, b = dy.parameter(self.W), dy.parameter(self.b)
    As = dy.affine_transform([b, W, out])
    if self.dueling:
        W_extra, b_extra = dy.parameter(self.W_extra), dy.parameter(self.b_extra)
        V = dy.affine_transform([b_extra, W_extra, out])
        return As, V
    return As
def __call__(self, x):
    assert isinstance(x, dy.Expression)
    # add parameters to graph as expressions
    # m2.add_parameters((8, len(inputs)))
    self.W = dy.parameter(self.pW)
    self.b = dy.parameter(self.pb)
    self.x = x
    return self.W * self.x + self.b
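# A minimal, self-contained sketch of a class this __call__ could belong to and
# how it would be used. The constructor and the dimensions are assumptions;
# only the __call__ body comes from the original snippet.
import dynet as dy

class Linear(object):
    def __init__(self, model, out_dim, in_dim):
        self.pW = model.add_parameters((out_dim, in_dim))
        self.pb = model.add_parameters(out_dim)

    def __call__(self, x):
        assert isinstance(x, dy.Expression)
        self.W = dy.parameter(self.pW)  # add parameters to graph as expressions
        self.b = dy.parameter(self.pb)
        return self.W * x + self.b

m = dy.ParameterCollection()
layer = Linear(m, 8, 4)
dy.renew_cg()
x = dy.vecInput(4)
x.set([1.0, 2.0, 3.0, 4.0])
print(layer(x).npvalue().shape)  # -> (8,)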
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    # get the output of the first LSTM
    src_output = init_state_src.add_inputs(
        [dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()
    # now decode
    all_losses = []

    # Decoder
    # need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the previous words into the LSTM and predict the next ones
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = dy.pickneglogsoftmax_batch(s, next_words)
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
def renew_cg(self):
    # renew the compute graph for every single instance
    dy.renew_cg()
    param_exprs = dict()
    param_exprs['U'] = dy.parameter(self.params['word_score_U'])
    param_exprs['pW'] = dy.parameter(self.params['predict_W'])
    param_exprs['pb'] = dy.parameter(self.params['predict_b'])
    param_exprs['<bos>'] = dy.parameter(self.params['<BoS>'])
    self.param_exprs = param_exprs
def calc_scores(words):
    dy.renew_cg()
    # Transduce all batch elements with an LSTM
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])
    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]
    return scores
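# A minimal sketch of how the per-word scores above are typically turned into a
# tagging loss; the tags argument and this helper are assumptions, not part of
# the original snippet.
def sent_loss(words, tags):
    scores = calc_scores(words)
    return dy.esum([dy.pickneglogsoftmax(score, tag)
                    for score, tag in zip(scores, tags)])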
def __call__(self, obs, batched=False):
    out = obs if isinstance(obs, dy.Expression) else dy.inputTensor(obs, batched=batched)
    for i in range(self.n_layers):
        b, W = dy.parameter(self.bs[i]), dy.parameter(self.Ws[i])
        out = dy.affine_transform([b, W, out])
        if self.layer_norm and i != self.n_layers - 1:
            out = dy.layer_norm(out, self.ln_gs[i], self.ln_bs[i])
        if self.specified_activation:
            if self.activation[i] is not None:
                out = self.activation[i](out)
        else:
            out = self.activation(out)
    return out
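# A plausible constructor for the MLP __call__ above. The field names are
# inferred from the method; the sizes/activation handling is an assumption,
# not the original class.
import dynet as dy

class MLP(object):
    def __init__(self, model, sizes, activation=dy.rectify, layer_norm=False):
        # sizes = [in_dim, hidden_1, ..., out_dim]
        self.n_layers = len(sizes) - 1
        self.Ws = [model.add_parameters((sizes[i + 1], sizes[i]))
                   for i in range(self.n_layers)]
        self.bs = [model.add_parameters(sizes[i + 1])
                   for i in range(self.n_layers)]
        self.layer_norm = layer_norm
        if layer_norm:
            self.ln_gs = [model.add_parameters(sizes[i + 1]) for i in range(self.n_layers)]
            self.ln_bs = [model.add_parameters(sizes[i + 1]) for i in range(self.n_layers)]
        # either one activation for all layers, or one per layer (possibly None)
        self.specified_activation = isinstance(activation, (list, tuple))
        self.activation = activation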
def calc_scores_with_previous_tag(words, referent_tags=None):
    """
    Calculate scores using the previous tag as input. If referent tags are
    provided, sample between the previous referent tag and the previous
    system prediction.
    :param words:
    :param referent_tags:
    :return:
    """
    dy.renew_cg()
    word_embs = [LOOKUP[x] for x in words]
    # Transduce all batch elements for the backward LSTM, using the original word embeddings.
    bwd_init = bwdLSTM.initial_state()
    bwd_word_reps = bwd_init.transduce(reversed(word_embs))
    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)
    scores = []
    # Transduce one by one for the forward LSTM
    fwd_init = fwdLSTM.initial_state()
    s_fwd = fwd_init
    prev_tag = start_tag
    index = 0
    for word, bwd_word_rep in zip(word_embs, reversed(bwd_word_reps)):
        # Concatenate word and tag representations, just as in training.
        fwd_input = dy.concatenate([word, TAG_LOOKUP[prev_tag]])
        s_fwd = s_fwd.add_input(fwd_input)
        combined_rep = dy.concatenate([s_fwd.output(), bwd_word_rep])
        score = dy.affine_transform([b, W, combined_rep])
        prediction = np.argmax(score.npvalue())
        if referent_tags:
            if sampler.sample_true():
                prev_tag = referent_tags[index]
            else:
                prev_tag = prediction
            index += 1
        else:
            prev_tag = prediction
        scores.append(score)
    return scores
def generate(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent

    # get the output of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()])
                   for x, y in LSTM_SRC.add_inputs([LOOKUP_SRC[word] for word in src])]
    src_output = src_outputs[-1]

    # gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    # generate until an eos tag or the maximum length is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])

    prev_word = sos_trg
    trg_sent = []
    attention_matrix = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for i in range(MAX_SENT_SIZE):
        # feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        att_output, alignment = calc_attention(src_output_matrix, output_embedding,
                                               fixed_attentional_component)
        attention_matrix.append(alignment)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m,
                                                   dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        # probs holds negative log probabilities, so the most likely word is the argmin
        probs = (-dy.log_softmax(s)).value()
        next_word = np.argmin(probs)

        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent, dy.concatenate_cols(attention_matrix).value()
def word_assoc_score(self, source_idx, target_idx, relation):
    """
    NOTE THAT DROPOUT IS BEING APPLIED HERE
    :param source_idx: embedding index of source atom
    :param target_idx: embedding index of target atom
    :param relation: relation type
    :return: score
    """
    # prepare
    s = self.embeddings[source_idx]
    if self.no_assoc:
        A = dy.const_parameter(self.word_assoc_weights[relation])
    else:
        A = dy.parameter(self.word_assoc_weights[relation])
    # dy.dropout returns a new expression; keep the result, or no dropout is applied
    A = dy.dropout(A, self.dropout)
    t = self.embeddings[target_idx]

    # compute
    if self.mode == BILINEAR_MODE:
        return dy.transpose(s) * A * t
    elif self.mode == DIAG_RANK1_MODE:
        diag_A = dyagonalize(A[0])
        rank1_BC = A[1] * dy.transpose(A[2])
        ABC = diag_A + rank1_BC
        return dy.transpose(s) * ABC * t
    elif self.mode == TRANSLATIONAL_EMBED_MODE:
        return -dy.l2_norm(s - t + A)
    elif self.mode == DISTMULT:
        return dy.sum_elems(dy.cmult(dy.cmult(s, A), t))
def attend(input_mat, state, w1dt):
    global attention_w2
    global attention_v
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)

    # input_mat: (encoder_state x seqlen) => input vecs concatenated as cols
    # w1dt: (attdim x seqlen)
    # w2dt: (attdim,) => the decoder state projected into the attention space
    w2dt = w2 * dy.concatenate(list(state.s()))
    # att_weights: (seqlen,) row vector
    unnormalized = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    att_weights = dy.softmax(unnormalized)
    # context: (encoder_state)
    context = input_mat * att_weights
    return context
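# Parameter shapes consistent with the shape comments in attend() above; the
# dimension names and the enclosing `model` are illustrative assumptions rather
# than part of the original snippet.
attention_w1 = model.add_parameters((ATTENTION_SIZE, STATE_SIZE * 2))
attention_w2 = model.add_parameters((ATTENTION_SIZE, STATE_SIZE * LSTM_NUM_OF_LAYERS * 2))
attention_v = model.add_parameters((1, ATTENTION_SIZE))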
def attend(input_vectors, state):
    global attention_w1
    global attention_w2
    global attention_v
    w1 = dy.parameter(attention_w1)
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    attention_weights = []

    w2dt = w2 * dy.concatenate(list(state.s()))
    for input_vector in input_vectors:
        attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
        attention_weights.append(attention_weight)
    attention_weights = dy.softmax(dy.concatenate(attention_weights))

    output_vectors = dy.esum([vector * attention_weight
                              for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
def ergm_score(self):
    """
    :return: ERGM score (dynet Expression) computed based on ERGM weights and features only
    Does not populate any field
    """
    W = dy.parameter(self.ergm_weights)
    f = dy.transpose(dy.inputVector([self.feature_vals[k] for k in self.feature_set]))
    return f * W
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()

    # Now compute mean and standard deviation of the source hidden state.
    W_mean = dy.parameter(W_mean_p)
    V_mean = dy.parameter(V_mean_p)
    b_mean = dy.parameter(b_mean_p)
    W_var = dy.parameter(W_var_p)
    V_var = dy.parameter(V_var_p)
    b_var = dy.parameter(b_var_p)

    # The mean vector from the encoder.
    mu = mlp(src_output, W_mean, V_mean, b_mean)
    # This is the diagonal vector of the log co-variance matrix from the encoder
    # (regarding this as the log variance simplifies the implementation)
    log_var = mlp(src_output, W_var, V_var, b_var)

    # Compute KL[N(mu(x), sigma(x)) || N(0, I)]
    # = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        # feed the previous word into the LSTM and predict the next one
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))
        prev_word = next_word
    softmax_loss = dy.esum(all_losses)

    return kl_loss, softmax_loss
def calc_reinforce_loss(words, tags, delta):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])

    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)

    # calculate the probability distribution over tags for each word
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]
    losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)]
    probs = [dy.softmax(score).npvalue() for score in scores]

    # then take samples from the probability distribution
    samples = [np.random.choice(range(len(x)), p=x) for x in probs]

    # calculate accuracy = reward
    correct = [sample == tag for sample, tag in zip(samples, tags)]
    r_i = float(sum(correct)) / len(correct)
    r = dy.constant(1, r_i)

    # Reward baseline for each word
    W_bl = dy.parameter(W_bl_p)
    b_bl = dy.parameter(b_bl_p)
    r_b = [dy.affine_transform([b_bl, W_bl, dy.nobackprop(x)]) for x in word_reps]

    # we need to stop the gradient in order to break the computation graph,
    # as the reward portion is trained separately and not backpropagated through
    # during the overall score
    rewards_over_baseline = [(r - dy.nobackprop(x)) for x in r_b]
    # the scores for training the baseline
    baseline_scores = [dy.square(r - x) for x in r_b]

    # then calculate the scores using REINFORCE
    reinforce_scores = [r_s * score for r_s, score in zip(rewards_over_baseline, scores)]

    # we want the first len(sent)-delta scores from xent, then delta scores from
    # REINFORCE (for MIXER)
    if len(scores) > delta:
        mixer_scores = scores[:len(scores) - delta] + reinforce_scores[delta - 1:]
    else:
        mixer_scores = reinforce_scores
    return dy.esum(mixer_scores), dy.esum(baseline_scores)
def BuildLMGraph(self, sents):
    dy.renew_cg()
    # initialize the RNN
    init_state = self.builder.initial_state()
    # parameters -> expressions
    R = dy.parameter(self.R)
    bias = dy.parameter(self.bias)

    S = vocab.w2i["<s>"]
    # get the cids and masks for each step
    tot_chars = 0
    cids = []
    masks = []

    for i in range(len(sents[0])):
        cids.append([(vocab.w2i[sent[i]] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_chars += sum(mask)

    # start the rnn with "<s>"
    init_ids = cids[0]
    s = init_state.add_input(dy.lookup_batch(self.lookup, init_ids))

    losses = []

    # feed char vectors into the RNN and predict the next char
    for cid, mask in zip(cids[1:], masks[1:]):
        score = dy.affine_transform([bias, R, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, cid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        cemb = dy.lookup_batch(self.lookup, cid)
        s = s.add_input(cemb)

    return dy.sum_batches(dy.esum(losses)), tot_chars
def build_tagging_graph(words):
    dy.renew_cg()
    # parameters -> expressions
    H = dy.parameter(pH)
    O = dy.parameter(pO)

    # initialize the RNNs
    f_init = fwdRNN.initial_state()
    b_init = bwdRNN.initial_state()

    cf_init = cFwdRNN.initial_state()
    cb_init = cBwdRNN.initial_state()

    # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word.
    wembs = [word_rep(w, cf_init, cb_init) for w in words]
    wembs = [dy.noise(we, 0.2) for we in wembs]  # optional

    # feed word vectors into biLSTM
    fw_exps = f_init.transduce(wembs)
    bw_exps = b_init.transduce(reversed(wembs))

    # OR
    # fw_exps = []
    # s = f_init
    # for we in wembs:
    #     s = s.add_input(we)
    #     fw_exps.append(s.output())
    # bw_exps = []
    # s = b_init
    # for we in reversed(wembs):
    #     s = s.add_input(we)
    #     bw_exps.append(s.output())

    # biLSTM states
    bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]

    # feed each biLSTM state to an MLP
    exps = []
    for x in bi_exps:
        r_t = O * (dy.tanh(H * x))
        exps.append(r_t)

    return exps
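# A minimal sketch of turning the per-word outputs of build_tagging_graph into
# a loss; tags are assumed to be integer ids, and this helper is not part of
# the original snippet.
def sent_loss(words, tags):
    vecs = build_tagging_graph(words)
    errs = [dy.pickneglogsoftmax(v, t) for v, t in zip(vecs, tags)]
    return dy.esum(errs)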
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(
        dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    loss = []
    for char in output:
        vector = dy.concatenate([attend(vectors, s), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
def create_network_return_best(inputs):
    '''
    inputs is a list of numbers
    '''
    dy.renew_cg()
    W = dy.parameter(pW)
    b = dy.parameter(pB)

    if len(inputs) > documentLength:
        inputs = inputs[0:documentLength]

    emb_vectors = [lookup[i] for i in inputs]
    while len(emb_vectors) < documentLength:
        pad = dy.vecInput(embDimension)
        pad.set(np.zeros(embDimension))
        emb_vectors.append(pad)

    net_input = dy.concatenate(emb_vectors)
    net_output = dy.softmax((W * net_input) + b)
    return np.argmax(net_output.npvalue())
def word_repr(self, char_seq):
    # obtain the word representation when given its character sequence
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        self.param_exprs['rgW%d' % wlen] = dy.parameter(self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(self.params['com_b'][wlen - 1])
        self.param_exprs['ugW%d' % wlen] = dy.parameter(self.params['update_gate_W'][wlen - 1])
        self.param_exprs['ugb%d' % wlen] = dy.parameter(self.params['update_gate_b'][wlen - 1])

    chars = dy.concatenate(char_seq)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars +
                             self.param_exprs['rgb%d' % wlen])
    comb = dy.concatenate([dy.tanh(self.param_exprs['cW%d' % wlen] * dy.cmult(reset_gate, chars) +
                                   self.param_exprs['cb%d' % wlen]), chars])
    update_logits = self.param_exprs['ugW%d' % wlen] * comb + self.param_exprs['ugb%d' % wlen]

    update_gate = dy.transpose(dy.concatenate_cols(
        [dy.softmax(dy.pickrange(update_logits, i * (wlen + 1), (i + 1) * (wlen + 1)))
         for i in range(self.options['ndims'])]))

    # The following implementation of the softmax function is not safe, but faster:
    # exp_update_logits = dy.exp(dy.reshape(update_logits, (self.options['ndims'], wlen + 1)))
    # update_gate = dy.cdiv(exp_update_logits,
    #                       dy.concatenate_cols([dy.sum_cols(exp_update_logits)] * (wlen + 1)))
    # assert (not np.isnan(update_gate.npvalue()).any())

    word = dy.sum_cols(dy.cmult(update_gate, dy.reshape(comb, (self.options['ndims'], wlen + 1))))
    return word
def expr_for_tree(self, tree):
    if tree.isleaf():
        return self.E[self.w2i.get(tree.label, 0)]
    if len(tree.children) == 1:
        assert(tree.children[0].isleaf())
        expr = self.expr_for_tree(tree.children[0])
        return expr
    assert(len(tree.children) == 2), tree.children[0]
    e1 = self.expr_for_tree(tree.children[0])
    e2 = self.expr_for_tree(tree.children[1])
    W = dy.parameter(self.W)
    expr = dy.tanh(W * dy.concatenate([e1, e2]))
    return expr
def create_network_return_loss(inputs, expected_output):
    '''
    inputs is a list of numbers
    '''
    dy.renew_cg()
    W = dy.parameter(pW)  # from parameters to expressions
    b = dy.parameter(pB)

    if len(inputs) > documentLength:
        inputs = inputs[0:documentLength]

    emb_vectors = [lookup[i] for i in inputs]
    while len(emb_vectors) < documentLength:
        pad = dy.vecInput(embDimension)
        pad.set(np.zeros(embDimension))
        emb_vectors.append(pad)

    net_input = dy.concatenate(emb_vectors)
    net_output = dy.softmax((W * net_input) + b)
    loss = -dy.log(dy.pick(net_output, expected_output))
    return loss
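# A minimal training-loop sketch around the loss function above. The trainer,
# the parameter collection m (holding pW, pB, and lookup), and the train_data
# pairs are assumptions, not part of the original snippet.
trainer = dy.SimpleSGDTrainer(m)
for inputs, expected_output in train_data:
    loss = create_network_return_loss(inputs, expected_output)
    loss_value = loss.value()  # run the forward pass
    loss.backward()            # accumulate gradients
    trainer.update()           # apply the SGD update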
def sample(self, first=1, nchars=0, stop=-1):
    res = [first]
    dy.renew_cg()
    state = self.builder.initial_state()

    R = dy.parameter(self.R)
    bias = dy.parameter(self.bias)
    cw = first
    while True:
        x_t = dy.lookup(self.lookup, cw)
        state = state.add_input(x_t)
        y_t = state.output()
        r_t = bias + (R * y_t)
        ydist = dy.softmax(r_t)
        dist = ydist.vec_value()
        rnd = random.random()
        for i, p in enumerate(dist):
            rnd -= p
            if rnd <= 0:
                break
        res.append(i)
        cw = i
        if cw == stop:
            break
        if nchars and len(res) > nchars:
            break
    return res
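# A usage sketch for sample() above: draw character ids until the stop symbol,
# then decode them back to text. The lm object and the vocab mapping are
# assumptions about the surrounding code.
ids = lm.sample(first=vocab.w2i["<s>"], nchars=200, stop=vocab.w2i["</s>"])
print("".join(vocab.i2w[i] for i in ids))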
WEMB_DIM = 128
RNN_HIDDEN_DIM = 64
HIDDEN_DIM = 32

pWembs = model.add_lookup_parameters((num_words, WEMB_DIM))
pH = model.add_parameters((HIDDEN_DIM, RNN_HIDDEN_DIM))
pHb = model.add_parameters(HIDDEN_DIM)
pO = model.add_parameters((num_tags, HIDDEN_DIM))
pOb = model.add_parameters(num_tags)
rnn_builder = dy.BiRNNBuilder(1, WEMB_DIM, RNN_HIDDEN_DIM, model, dy.LSTMBuilder)

dy.renew_cg()
H = dy.parameter(pH)
Hb = dy.parameter(pHb)
O = dy.parameter(pO)
Ob = dy.parameter(pOb)

indexed_words, indexed_gold_tags = zip(*[(w2i[w], t2i[t]) for w, t in train_sentence])
wembs = [pWembs[wi] for wi in indexed_words]
noised_wembs = [dy.noise(we, 0.1) for we in wembs]
rnn_outputs = rnn_builder.transduce(noised_wembs)

errs = []
for rnn_output, gold_tag in zip(rnn_outputs, indexed_gold_tags):
    hidden = dy.tanh(dy.affine_transform([Hb, H, rnn_output]))
    output = dy.affine_transform([Ob, O, hidden])
    errs.append(dy.pickneglogsoftmax(output, gold_tag))
def transduce(self, input, _true_output=None, feats=None):
    # convert the _true_output string to a list of vocabulary indices
    if _true_output:
        try:
            true_output = [self.char_vocab.w2i[a] for a in _true_output]
        except KeyError as e:
            print(e, _true_output)
        true_output += [self.STOP]
        true_output = list(reversed(true_output))

    R = dy.parameter(self.R)  # hidden to vocabulary
    bias = dy.parameter(self.bias)
    W_c = dy.parameter(self.W_c)
    W__a = dy.parameter(self.W__a)
    U__a = dy.parameter(self.U__a)
    v__a = dy.parameter(self.v__a)

    # biLSTM encoder of the input string
    input = [BEGIN_CHAR] + [c for c in input] + [STOP_CHAR]

    input_emb = []
    for char_ in reversed(input):
        char_id = self.char_vocab.w2i.get(char_, self.UNK)
        char_embedding = self.VOCAB_LOOKUP[char_id]
        input_emb.append(char_embedding)
    biencoder = self.bilstm_transduce(self.fbuffRNN, self.bbuffRNN, input_emb)

    losses = []
    output = []
    pred_history = [self.BEGIN]  # begin symbol
    s = self.decoder.initial_state()

    while not len(pred_history) == MAX_PRED_SEQ_LEN:
        # compute probability over the vocabulary and choose a prediction,
        # either from the true output at train time or from the model at test time

        # decoder next state
        prev_pred_id = pred_history[-1]
        s = s.add_input(self.VOCAB_LOOKUP[prev_pred_id])

        # soft attention vector
        scores = [v__a * dy.tanh(W__a * s.output() + U__a * h_input)
                  for h_input in biencoder]
        alphas = dy.softmax(dy.concatenate(scores))
        c = dy.esum([h_input * dy.pick(alphas, j)
                     for j, h_input in enumerate(biencoder)])

        # softmax over vocabulary
        h_output = dy.tanh(W_c * dy.concatenate([s.output(), c]))
        probs = dy.softmax(R * h_output + bias)

        if _true_output is None:
            pred_id = np.argmax(probs.npvalue())
        else:
            pred_id = true_output.pop()

        losses.append(-dy.log(dy.pick(probs, pred_id)))
        pred_history.append(pred_id)

        if pred_id == self.STOP:
            break
        else:
            pred_char = self.char_vocab.i2w.get(pred_id, UNK_CHAR)
            output.append(pred_char)

    output = u''.join(output)
    return ((dy.average(losses) if losses else None), output)
def run_test(train_path, test_path, test_output_path, hidden_layer, embedding_size,
             context_size, saved_model_path, activate_sub_word=False):
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    TRAIN, VOCAB, LABELS = t1.read_data(train_path)  # get training set
    TEST, VOCAB_TEST = read_test_data(test_path)  # get dev set

    VOCAB["<BOS>"] = 2  # a word for representing the beginning of a sentence
    VOCAB["<EOS>"] = 2  # a word for representing the end of a sentence
    VOCAB["<UNK>"] = 1

    # change the TEST dataset by replacing words that did not appear in the vocab with <UNK>
    TEST_ORIG = list(TEST)
    TEST = []
    for sentence in TEST_ORIG:
        new_sentence = []
        for word in sentence:
            word = word.lower()
            if word in VOCAB:
                new_sentence.append(word)
            else:
                new_sentence.append("<UNK>")
        TEST.append(new_sentence)

    # if the feature for taking sub-words into account is activated
    if activate_sub_word:
        for word in list(VOCAB.keys()):
            if word not in ["<BOS>", "<EOS>", "<UNK>"]:
                words_from_word = get_sub_words(word)
                for part_word in words_from_word:
                    if part_word not in VOCAB:
                        VOCAB[part_word] = 1

    m = dy.ParameterCollection()  # create parameter collection

    # define parameters
    pW1 = m.add_parameters((hidden_layer, embedding_size * context_size))
    pb1 = m.add_parameters(hidden_layer)
    pW2 = m.add_parameters((len(LABELS), hidden_layer))
    pb2 = m.add_parameters(len(LABELS))
    params = [pW1, pb1, pW2, pb2]
    e = m.add_lookup_parameters((len(VOCAB), embedding_size))

    # load the parameters
    print(dy.parameter(pW1).value())
    m.populate(saved_model_path)
    print(dy.parameter(pW1).value())

    L2I = {l: i for i, l in enumerate(list(sorted(LABELS.keys())))}  # enumerate the labels as 0,1,2,...
    F2I = {f: i for i, f in enumerate(list(sorted(VOCAB.keys())))}  # enumerate the vocabulary as 0,1,2,...

    predictions = prediction(TEST, L2I, F2I, params, e, activate_sub_word)

    # write predictions to file
    output_file = open(test_output_path, "w")
    for idx, sentence in enumerate(predictions):
        word_idx = 0
        for word, label in sentence:
            orig_word = TEST_ORIG[idx][word_idx]
            output_file.write(orig_word + " " + label + "\n")
            word_idx += 1
        output_file.write("\n")
    output_file.close()
def transduce(self, lemma, feats, oracle_actions=None, external_cg=True, sampling=False,
              unk_avg=True, verbose=False):
    """
    Transduce an encoded lemma and features.

    Args:
        lemma: The input lemma, a list of integer character codes.
        feats: The features determining the morphological transformation. The most
            common format is a list of integer codes, one code per feature-value pair.
        oracle_actions: `None` means prediction. A list of action codes is a static
            oracle. A dictionary of keys (explained below) is the config for a
            dynamic oracle.
                * "target_word": List of action codes for the target word form.
                * "loss": Which loss function to use (softmax-margin, NLL, MSE).
                * "rollout_mixin_beta": How to mix reference and learned roll-outs
                  (1 is only reference, 0 is only model).
                * "global_rollout": Whether to use one type of roll-out (expert or
                  model) at the sequence level.
                * "optimal": Whether to use an optimal or noisy (=buggy) expert.
                * "bias_inserts": Whether to use a buggy roll-out for inserts
                  (which makes them as cheap as copies).
        external_cg: Whether or not an external computation graph is defined.
        sampling: Whether or not sampling should be used for decoding (e.g. for MRT)
            or training (e.g. dynamic oracles with exploration / learned roll-ins).
        dynamic: Whether or not `oracle_actions` is a static oracle (list of actions)
            or a configuration for a dynamic oracle.
        unk_avg: Whether or not to average all char embeddings to produce the UNK
            embedding (see `self._build_lemma`).
        verbose: Whether or not to report on processing steps.
    """
    # Returns an expression of the loss for the sequence of actions
    # (that is, the oracle_actions if present or the predicted sequence otherwise)

    def _valid_actions(encoder):
        valid_actions = []
        if len(encoder) > 1:
            valid_actions += [COPY, DELETE]
        else:
            valid_actions += [END_WORD]
        valid_actions += self.INSERTS
        return valid_actions

    if not external_cg:
        dy.renew_cg()

    dynamic = None  # indicates prediction or static
    if oracle_actions:  # if not, then prediction
        if isinstance(oracle_actions, dict):
            # dynamic oracle:
            # @TODO NB target word is not wrapped in boundary tags
            target_word = oracle_actions['target_word']
            generation_errors = set()
            dynamic = oracle_actions
        else:
            # static oracle:
            # reverse to enable simple popping
            oracle_actions = oracle_actions[::-1]
            oracle_actions.pop()  # COPY of BEGIN_WORD_CHAR

    # vectorize lemma
    lemma_enc = self._build_lemma(lemma, unk_avg, is_training=bool(oracle_actions))

    # vectorize features
    features = self._build_features(*feats)

    # add encoder and decoder to computation graph
    encoder = Encoder(self.fbuffRNN, self.bbuffRNN)
    decoder = self.wordRNN.initial_state()

    # add classifier to computation graph
    if self.MLP_DIM:
        # decoder output to hidden
        W_s2h = dy.parameter(self.pW_s2h)
        b_s2h = dy.parameter(self.pb_s2h)
    # hidden to action
    W_act = dy.parameter(self.pW_act)
    b_act = dy.parameter(self.pb_act)

    # encoder is a stack which pops lemma characters and their
    # representations from the top. Thus, to get lemma characters
    # in the right order, the lemma has to be reversed.
    encoder.transduce(lemma_enc, lemma)

    encoder.pop()  # BEGIN_WORD_CHAR
    action_history = [COPY]
    word = []
    losses = []

    if verbose and not dynamic:
        count = 0
        print()
        print(action2string(oracle_actions, self.vocab))
        print(lemma2string(lemma, self.vocab))

    if dynamic:
        # use model rollout for the whole of this sequence
        rollout_on = dynamic['global_rollout'] and np.random.rand() > dynamic['rollout_mixin_beta']

    while len(action_history) <= MAX_ACTION_SEQ_LEN:

        if verbose and not dynamic:
            print('Action: ', count, self.vocab.act.i2w[action_history[-1]])
            print('Encoder length, char: ', lemma, len(encoder),
                  self.vocab.char.i2w[encoder.s[-1][-1]])
            print('word: ', ''.join(word))
            print(('Remaining actions: ', oracle_actions,
                   action2string(oracle_actions, self.vocab)))
            count += 1

        # compute probability of each of the actions and choose an action,
        # either from the oracle or, if there is no oracle, based on the model
        valid_actions = _valid_actions(encoder)
        encoder_embedding = encoder.embedding()

        # decoder
        decoder_input = dy.concatenate([encoder_embedding,
                                        features,
                                        self.ACT_LOOKUP[action_history[-1]]])
        decoder = decoder.add_input(decoder_input)

        # classifier
        if self.double_feats:
            classifier_input = dy.concatenate([decoder.output(), features])
        else:
            classifier_input = decoder.output()
        if self.MLP_DIM:
            h = self.NONLIN(W_s2h * classifier_input + b_s2h)
        else:
            h = classifier_input
        logits = W_act * h + b_act

        # get action (argmax, sampling, or use oracle actions)
        if oracle_actions is None:
            # predicting by argmax or sampling
            logits_cpu = logits  # dy.to_device(logits, 'CPU')
            log_probs = dy.log_softmax(logits, valid_actions)
            log_probs_np = log_probs.npvalue()
            if sampling:
                action = sample(log_probs_np)
            else:
                action = np.argmax(log_probs_np)
            losses.append(dy.pick(log_probs, action))
        elif dynamic:
            # training with dynamic oracle
            if rollout_on or (not dynamic['global_rollout'] and
                              np.random.rand() > dynamic['rollout_mixin_beta']):
                # the second disjunct allows for model roll-out applied locally
                logits_cpu = logits  # dy.to_device(logits, 'CPU')
                rollout = lambda action: self.rollout(
                    action, dy.log_softmax(logits, valid_actions), action_history,
                    features, decoder, encoder, word, W_act, b_act)  # @TODO W_s2h ...
            else:
                rollout = None

            optim_actions, costs = oracle_with_rollout(
                word, target_word, encoder.get_extra(), valid_actions, rollout,
                self.vocab, optimal=dynamic['optimal'],
                bias_inserts=dynamic['bias_inserts'], errors=generation_errors,
                verbose=verbose)

            logits_cpu = logits  # dy.to_device(logits, 'CPU')
            log_probs = dy.log_softmax(logits, valid_actions)
            log_probs_np = log_probs.npvalue()
            if sampling == 1. or np.random.rand() <= sampling:
                # action is picked by sampling
                action = sample(log_probs_np)
                # @TODO IL learned roll-ins are done with policy i.e. greedy / beam search decoding
                if verbose:
                    print('Rolling in with model: ', action, self.vocab.act.i2w[action])
            else:
                # action is picked from optim_actions
                action = optim_actions[np.argmax([log_probs_np[a] for a in optim_actions])]
                # print([log_probs_np[a] for a in optim_actions])

            # loss is over all optimal actions
            if dynamic['loss'] == 'softmax-margin':
                loss = log_sum_softmax_margin_loss(optim_actions, logits, self.NUM_ACTS,
                                                   costs=costs, valid_actions=None,
                                                   verbose=verbose)
            elif dynamic['loss'] == 'nll':
                loss = log_sum_softmax_loss(optim_actions, logits, self.NUM_ACTS,
                                            valid_actions=valid_actions, verbose=verbose)
            elif dynamic['loss'] == 'mse':
                # NB expects both costs and valid actions!
                loss = cost_sensitive_reg_loss(optim_actions, logits, self.NUM_ACTS,
                                               costs=costs, valid_actions=valid_actions,
                                               verbose=verbose)
            else:
                raise NotImplementedError
            losses.append(loss)
            # print('Action', action, self.vocab.act.i2w[action])
        else:
            # training with static oracle
            action = oracle_actions.pop()
            log_probs = dy.log_softmax(logits, valid_actions)
            losses.append(dy.pick(log_probs, action))

        action_history.append(action)
        # print('action, log_probs: ', action, self.vocab.act.i2w[action],
        #       losses[-1].scalar_value(), log_probs.npvalue())

        # execute the action to update the transducer state
        if action == COPY:
            # 1. Increment attention index
            try:
                char_ = encoder.pop()
            except IndexError as e:
                print(np.exp(log_probs.npvalue()))
                print('COPY: ', action)
            # 2. Append copied character to the output word
            word.append(self.vocab.char.i2w[char_])
        elif action == DELETE:
            # 1. Increment attention index
            try:
                encoder.pop()
            except IndexError as e:
                print(np.exp(log_probs.npvalue()))
                print('DELETE: ', action)
        elif action == END_WORD:
            # 1. Finish transduction
            break
        else:
            # one of the INSERT actions
            assert action in self.INSERTS
            # 1. Append inserted character to the output word
            char_ = self.vocab.act.i2w[action]
            word.append(char_)

    word = ''.join(word)

    return losses, word, action_history
regression_hidden_size = 300

# pretrained embeddings
embedding_dim = emb_matrix_pretrained.shape[1]
embedding_parameters = RNN_model.lookup_parameters_from_numpy(emb_matrix_pretrained)

# add RNN units
fw_RNN_unit = dy.LSTMBuilder(num_layers, embedding_dim, hidden_size, RNN_model)
bw_RNN_unit = dy.LSTMBuilder(num_layers, embedding_dim, hidden_size, RNN_model)
second_fw_RNN_unit = dy.LSTMBuilder(num_layers, 2 * hidden_size, hidden_size, RNN_model)
second_bw_RNN_unit = dy.LSTMBuilder(num_layers, 2 * hidden_size, hidden_size, RNN_model)

pv1 = RNN_model.add_parameters((regression_hidden_size, 2 * hidden_size))
dy.parameter(pv1).npvalue().shape
pb1 = RNN_model.add_parameters((regression_hidden_size))
dy.parameter(pb1).npvalue().shape
pv2 = RNN_model.add_parameters((regression_hidden_size))
dy.parameter(pv2).npvalue().shape
pb2 = RNN_model.add_parameters(
def transduce(self, lemma, feats, oracle_actions=None, external_cg=True, sampling=False,
              unk_avg=True, debug_mode=False):

    def _valid_actions(encoder):
        valid_actions = list(self.INSERTS)
        if len(encoder) > 1:
            valid_actions += [STEP]
        else:
            valid_actions += [END_WORD]
        return valid_actions

    if not external_cg:
        dy.renew_cg()

    if oracle_actions:
        # reverse to enable simple popping
        oracle_actions = oracle_actions[::-1]
        oracle_actions.pop()  # Deterministic insertion of BEGIN_WORD

    # vectorize lemma
    lemma_enc = self._build_lemma(lemma, unk_avg, is_training=bool(oracle_actions))

    # vectorize features
    features = self._build_features(*feats)

    # add encoder and decoder to computation graph
    encoder = Encoder(self.fbuffRNN, self.bbuffRNN)
    decoder = self.wordRNN.initial_state()

    # add classifier to computation graph
    if self.MLP_DIM:
        # decoder output to hidden
        W_s2h = dy.parameter(self.pW_s2h)
        b_s2h = dy.parameter(self.pb_s2h)
    # hidden to action
    W_act = dy.parameter(self.pW_act)
    b_act = dy.parameter(self.pb_act)

    # encoder is a stack which pops lemma characters
    # and their representations from the top
    encoder.transduce(lemma_enc, lemma)

    action_history = [BEGIN_WORD]
    word = []
    losses = []
    count = 0

    if debug_mode:
        print()
        if oracle_actions:
            print(action2string(oracle_actions, self.vocab))
        print(lemma2string(lemma, self.vocab))

    while len(action_history) <= MAX_ACTION_SEQ_LEN:

        # what is at the top of the encoder?
        encoder_embedding, char_enc = encoder.embedding(extra=True)

        if debug_mode:
            print('Action history: ', action_history,
                  action2string(action_history, self.vocab))
            print('Encoder length: ', len(encoder))
            print('Current char: ', char_enc, lemma2string([char_enc], self.vocab))
            print('Word so far: ', u''.join(word))

        # decoder
        decoder_input = dy.concatenate([encoder_embedding,
                                        features,
                                        self.ACT_LOOKUP[action_history[-1]]])
        decoder = decoder.add_input(decoder_input)
        decoder_output = decoder.output()

        # classifier
        if self.MLP_DIM:
            h = self.NONLIN(W_s2h * decoder_output + b_s2h)
        else:
            h = decoder_output

        valid_actions = _valid_actions(encoder)
        log_probs = dy.log_softmax(W_act * h + b_act, valid_actions)

        if oracle_actions is None:
            if sampling:
                dist = np.exp(log_probs.npvalue())
                # sample according to softmax
                rand = np.random.rand()
                for action, p in enumerate(dist):
                    rand -= p
                    if rand <= 0:
                        break
            else:
                action = np.argmax(log_probs.npvalue())
        else:
            action = oracle_actions.pop()

        losses.append(dy.pick(log_probs, action))
        action_history.append(action)

        if action == STEP:
            # Delete action
            encoder.pop()
        elif action == END_WORD:
            # Finish transduction
            break
        else:
            # Insert action
            assert action in self.INSERTS, (action,
                                            action2string([action], self.vocab),
                                            self.INSERTS)
            char_ = self.vocab.act.i2w[action]
            word.append(char_)

    word = u''.join(word)

    return losses, word, action_history
def transform(self, input_expr):
    W1 = dy.parameter(self.embeddings)
    b1 = dy.parameter(self.bias)
    return dy.affine_transform([b1, W1, input_expr])
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    """
    transduce the sequence

    Args:
        expr_seq: expression sequence or list of expression sequences
            (where each inner list will be concatenated)
    Returns:
        expression sequence
    """
    Wq, Wk, Wv, Wo = [dy.parameter(x) for x in (self.pWq, self.pWk, self.pWv, self.pWo)]
    bq, bk, bv, bo = [dy.parameter(x) for x in (self.pbq, self.pbk, self.pbv, self.pbo)]

    # Start with a [(length, model_size) x batch] tensor
    x = expr_seq.as_transposed_tensor()
    x_len = x.dim()[0][0]
    x_batch = x.dim()[1]

    # Get the query, key, and value vectors
    # TODO: do we need bias broadcasting in DyNet?
    # q = dy.affine_transform([bq, x, Wq])
    # k = dy.affine_transform([bk, x, Wk])
    # v = dy.affine_transform([bv, x, Wv])
    q = bq + x * Wq
    k = bk + x * Wk
    v = bv + x * Wv

    # Split to batches [(length, head_dim) x batch * num_heads] tensor
    q, k, v = [dy.reshape(y, (x_len, self.head_dim), batch_size=x_batch * self.num_heads)
               for y in (q, k, v)]

    # Do scaled dot product [(length, length) x batch * num_heads]; rows are queries, columns are keys
    attn_score = q * dy.transpose(k) / sqrt(self.head_dim)
    if expr_seq.mask is not None:
        mask = dy.inputTensor(np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0).transpose(),
                              batched=True) * -1e10
        attn_score = attn_score + mask
    attn_prob = dy.softmax(attn_score, d=1)
    if self.train and self.dropout > 0.0:
        attn_prob = dy.dropout(attn_prob, self.dropout)

    # Reduce using attention and resize to match [(length, model_size) x batch]
    o = dy.reshape(attn_prob * v, (x_len, self.input_dim), batch_size=x_batch)

    # Final transformation
    # o = dy.affine_transform([bo, attn_prob * v, Wo])
    o = bo + o * Wo

    expr_seq = expression_seqs.ExpressionSequence(expr_transposed_tensor=o, mask=expr_seq.mask)

    self._final_states = [transducers.FinalTransducerState(expr_seq[-1], None)]

    return expr_seq
def __step_batch(self, batch):
    dy.renew_cg()

    W_s = dy.parameter(self.W_s)
    b_s = dy.parameter(self.b_s)
    W_y = dy.parameter(self.W_y)
    b_y = dy.parameter(self.b_y)
    W_m = dy.parameter(self.W_m)
    b_m = dy.parameter(self.b_m)
    W1_att_f = dy.parameter(self.W1_att_f)
    w2_att = dy.parameter(self.w2_att)

    src_batch = [x[0] for x in batch]
    tgt_batch = [x[1] for x in batch]
    batch_size = len(src_batch)

    attended_batch = []
    for src_sent in src_batch:
        attended = []
        c_t_sense = dy.vecInput(self.embed_size)
        sense_start = dy.concatenate([
            self.lookup_frozen(self.src_lookup, self.src_token_to_id['<S>'][0]),
            dy.tanh(c_t_sense)
        ])
        sense_state = self.sense_builder.initial_state().add_input(sense_start)
        for cw in src_sent:
            cw_sense_ids = self.src_token_to_id[cw]
            cw_senses = [self.lookup_frozen(self.src_lookup, sense_id)
                         for sense_id in cw_sense_ids]
            h_senses = dy.concatenate_cols(cw_senses)
            h_m = sense_state.output()
            c_t_sense = self.__sense_attention_mlp(h_senses, h_m)
            sense_state = sense_state.add_input(
                dy.concatenate([c_t_sense, dy.tanh(c_t_sense)]))
            attended.append(c_t_sense)
        attended_batch.append(attended)
    attended_batch_rev = [list(reversed(sent)) for sent in attended_batch]

    # Encoder
    src_cws_l2r = []
    src_cws_r2l = []
    src_len = [len(sent) for sent in attended_batch]
    max_src_len = np.max(src_len)

    for i in range(max_src_len):
        src_cws_l2r.append([sent[i] for sent in attended_batch])
        src_cws_r2l.append([sent[i] for sent in attended_batch_rev])

    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for i, (cws_l2r, cws_r2l) in enumerate(zip(src_cws_l2r, src_cws_r2l)):
        l2r_batch = dy.reshape(dy.concatenate_cols(cws_l2r), (self.embed_size,),
                               batch_size=batch_size)
        l2r_state = l2r_state.add_input(l2r_batch)
        r2l_batch = dy.reshape(dy.concatenate_cols(cws_r2l), (self.embed_size,),
                               batch_size=batch_size)
        r2l_state = r2l_state.add_input(r2l_batch)
        l2r_contexts.append(l2r_state.output())
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    h_fs = []
    for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
        h_fs.append(dy.concatenate([l2r_i, r2l_i]))
    h_fs_matrix = dy.concatenate_cols(h_fs)
    fixed_attentional_component = W1_att_f * h_fs_matrix

    losses = []
    num_words = 0

    # Decoder
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_batch]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([self.tgt_token_to_id[sent[i]] if len(sent) > i
                        else self.tgt_token_to_id['</S>'] for sent in tgt_batch])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_batch]
        masks.append(mask)
        num_words += sum(mask)

    c_t = dy.vecInput(self.hidden_size * 2)
    start_state = dy.affine_transform([b_s, W_s, h_fs[-1]])
    dec_state = self.word_dec_builder.initial_state().set_s([start_state, dy.tanh(start_state)])

    for i, (cws, nws, mask) in enumerate(zip(tgt_cws, tgt_cws[1:], masks)):
        embed_t = dy.lookup_batch(self.tgt_lookup, cws)
        x_t = dy.concatenate([embed_t, c_t])
        dec_state = dec_state.add_input(x_t)
        h_e = dec_state.output()
        c_t = self.__word_attention_mlp(h_fs_matrix, h_e, fixed_attentional_component)
        m_t = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([h_e, c_t])]))
        y_star = dy.affine_transform([b_y, W_y, m_t])
        loss = dy.pickneglogsoftmax_batch(y_star, nws)
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(batch))
        mask_loss = loss * mask_expr
        losses.append(mask_loss)

    return dy.sum_batches(dy.esum(losses)), num_words
def translate_sentence(self, sent):
    dy.renew_cg()

    W_s = dy.parameter(self.W_s)
    b_s = dy.parameter(self.b_s)
    W_y = dy.parameter(self.W_y)
    b_y = dy.parameter(self.b_y)
    W_m = dy.parameter(self.W_m)
    b_m = dy.parameter(self.b_m)
    W1_att_f = dy.parameter(self.W1_att_f)
    w2_att = dy.parameter(self.w2_att)

    # Sense-level attention
    attended = []
    c_t_sense = dy.vecInput(self.embed_size)
    sense_start = dy.concatenate([
        self.lookup_frozen(self.src_lookup, self.src_token_to_id['<S>'][0]),
        c_t_sense
    ])
    sense_state = self.sense_builder.initial_state().add_input(sense_start)
    for cw in sent:
        cw_sense_ids = self.src_token_to_id[cw]
        cw_senses = [self.lookup_frozen(self.src_lookup, sense_id)
                     for sense_id in cw_sense_ids]
        h_senses = dy.concatenate_cols(cw_senses)
        h_m = sense_state.output()
        c_t_sense = self.__sense_attention_mlp(h_senses, h_m)
        sense_state = sense_state.add_input(
            dy.concatenate([c_t_sense, dy.tanh(c_t_sense)]))
        attended.append(c_t_sense)
    attended_rev = list(reversed(attended))

    # Bidirectional representations
    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for (cw_l2r, cw_r2l) in zip(attended, attended_rev):
        l2r_state = l2r_state.add_input(cw_l2r)
        r2l_state = r2l_state.add_input(cw_r2l)
        l2r_contexts.append(l2r_state.output())
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    h_fs = []
    for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
        h_fs.append(dy.concatenate([l2r_i, r2l_i]))
    h_fs_matrix = dy.concatenate_cols(h_fs)
    fixed_attentional_component = W1_att_f * h_fs_matrix

    # Decoder
    trans_sentence = ['<S>']
    cw = trans_sentence[-1]
    c_t = dy.vecInput(self.hidden_size * 2)
    start_state = dy.affine_transform([b_s, W_s, h_fs[-1]])
    dec_state = self.word_dec_builder.initial_state().set_s([start_state, dy.tanh(start_state)])

    while len(trans_sentence) < self.max_len:
        embed_t = dy.lookup(self.tgt_lookup, self.tgt_token_to_id[cw])
        x_t = dy.concatenate([embed_t, c_t])
        dec_state = dec_state.add_input(x_t)
        h_e = dec_state.output()
        c_t = self.__word_attention_mlp(h_fs_matrix, h_e, fixed_attentional_component)
        m_t = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([h_e, c_t])]))
        y_star = dy.affine_transform([b_y, W_y, m_t])
        p = dy.softmax(y_star)
        cw = self.tgt_id_to_token[np.argmax(p.vec_value())]
        if cw == '</S>':
            break
        trans_sentence.append(cw)

    return ' '.join(trans_sentence[1:])
def __step(self, instance):
    dy.renew_cg()

    W_s = dy.parameter(self.W_s)
    b_s = dy.parameter(self.b_s)
    W_y = dy.parameter(self.W_y)
    b_y = dy.parameter(self.b_y)
    W_m = dy.parameter(self.W_m)
    b_m = dy.parameter(self.b_m)
    W1_att_f = dy.parameter(self.W1_att_f)
    W1_att_e = dy.parameter(self.W1_att_e)
    w2_att = dy.parameter(self.w2_att)

    src_sent, tgt_sent = instance

    # Sense-level attention
    attended = []
    c_t_sense = dy.vecInput(self.embed_size)
    sense_start = dy.concatenate([
        self.lookup_frozen(self.src_lookup, self.src_token_to_id['<S>'][0]),
        dy.tanh(c_t_sense)
    ])
    sense_state = self.sense_builder.initial_state().add_input(sense_start)
    for cw in src_sent:
        cw_sense_ids = self.src_token_to_id[cw]
        cw_senses = [self.lookup_frozen(self.src_lookup, sense_id)
                     for sense_id in cw_sense_ids]
        h_senses = dy.concatenate_cols(cw_senses)
        h_m = sense_state.output()
        c_t_sense = self.__sense_attention_mlp(h_senses, h_m)
        sense_state = sense_state.add_input(
            dy.concatenate([c_t_sense, dy.tanh(c_t_sense)]))
        attended.append(c_t_sense)
    attended_rev = list(reversed(attended))

    # Bidirectional representations
    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for (cw_l2r, cw_r2l) in zip(attended, attended_rev):
        l2r_state = l2r_state.add_input(cw_l2r)
        r2l_state = r2l_state.add_input(cw_r2l)
        l2r_contexts.append(l2r_state.output())
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    h_fs = []
    for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
        h_fs.append(dy.concatenate([l2r_i, r2l_i]))
    h_fs_matrix = dy.concatenate_cols(h_fs)
    fixed_attentional_component = W1_att_f * h_fs_matrix

    losses = []
    num_words = 0

    # Decoder
    c_t = dy.vecInput(self.hidden_size * 2)
    start_state = dy.affine_transform([b_s, W_s, h_fs[-1]])
    dec_state = self.word_dec_builder.initial_state().set_s([start_state, dy.tanh(start_state)])
    for (cw, nw) in zip(tgt_sent, tgt_sent[1:]):
        embed_t = dy.lookup(self.tgt_lookup, self.tgt_token_to_id[cw])
        x_t = dy.concatenate([embed_t, c_t])
        dec_state = dec_state.add_input(x_t)
        h_e = dec_state.output()
        c_t = self.__word_attention_mlp(h_fs_matrix, h_e, fixed_attentional_component)
        m_t = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([h_e, c_t])]))
        y_star = dy.affine_transform([b_y, W_y, m_t])
        loss = dy.pickneglogsoftmax(y_star, self.tgt_token_to_id[nw])
        losses.append(loss)
        num_words += 1

    return dy.esum(losses), num_words
def generate(self, s_sentence, max_len=150):
    dy.renew_cg()

    global beam_size

    W_y = dy.parameter(self.params["W_y"])
    b_y = dy.parameter(self.params["b_y"])
    s_lookup = self.params["s_lookup"]
    t_lookup = self.params["t_lookup"]

    s_sentence = [self.s_vocab[EOS]] + s_sentence + [self.s_vocab[EOS]]
    s_sentence_rev = list(reversed(s_sentence))

    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for cw_l2r in s_sentence:
        l2r_state = l2r_state.add_input(s_lookup[cw_l2r])
        l2r_contexts.append(l2r_state.output())
    for cw_r2l in s_sentence_rev:
        r2l_state = r2l_state.add_input(s_lookup[cw_r2l])
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    H_f = [dy.concatenate(list(p)) for p in zip(l2r_contexts, r2l_contexts)]
    H_f_mat = dy.concatenate_cols(H_f)
    W1_att = dy.parameter(self.params["W1_att"])
    w1dt = W1_att * H_f_mat

    c_t_init = dy.vecInput(2 * self.HIDDEN_DIM)
    # c_t = dy.concatenate([l2r_contexts[-1], r2l_contexts[-1]])
    dec_state_init = self.dec_builder.initial_state()

    possible_list = {("<EOS>", dec_state_init, c_t_init): 0.0}

    for i in range(len(s_sentence) * 2):
        t_list = {}
        count_eos = 0
        for (poss, dec_state, c_t), prob in possible_list.items():
            spl_poss = poss.split(' ')
            if i > 1 and spl_poss[-1] == "<EOS>":
                count_eos += 1
                t_list[(poss, dec_state, c_t)] = prob
                continue
            embedding = t_lookup[self.t_vocab[spl_poss[-1]]]
            x_t = dy.concatenate([c_t, embedding])
            dec_state = dec_state.add_input(x_t)
            c_t = self.attend(H_f_mat, dec_state, w1dt, len(s_sentence), 1)
            probs = dy.softmax(W_y * dy.concatenate([c_t, dec_state.output()]) + b_y).vec_value()
            inds = self.list_nlargest(probs, beam_size)
            for ind in inds:
                sent = poss + " " + self.t_id_lookup[ind]
                sent_prob = prob + math.log(probs[ind])
                # lp = (5 + len(sent.split())) / (5 + 1)
                # sent_prob = sent_prob / pow(lp, alpha)
                t_list[(sent, dec_state, c_t)] = sent_prob

        if count_eos == beam_size:
            break

        possible_list = {}
        for tup in self.dict_nlargest(t_list, beam_size):
            possible_list[tup] = t_list[tup]

    final_sent = self.dict_nlargest(possible_list, 1)[0][0]
    return " ".join(final_sent.replace("<EOS>", "").strip().split())
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    if expr_seq.dim()[1] > 1:
        raise ValueError(f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.dim()[1]}")
    lattice = self.cur_src[0]
    Wx_iog = dy.parameter(self.p_Wx_iog)
    Wh_iog = dy.parameter(self.p_Wh_iog)
    b_iog = dy.parameter(self.p_b_iog)
    Wx_f = dy.parameter(self.p_Wx_f)
    Wh_f = dy.parameter(self.p_Wh_f)
    b_f = dy.parameter(self.p_b_f)
    h = {}
    c = {}
    h_list = []

    batch_size = expr_seq.dim()[1]
    if self.dropout_rate > 0.0 and self.train:
        self.set_dropout_masks(batch_size=batch_size)

    for i, cur_node_id in enumerate(lattice.graph.topo_sort()):
        prev_node = lattice.graph.predecessors(cur_node_id)
        val = expr_seq[i]
        if self.dropout_rate > 0.0 and self.train:
            val = dy.cmult(val, self.dropout_mask_x)
        i_ft_list = []
        if len(prev_node) == 0:
            tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
        else:
            h_tilde = sum(h[pred] for pred in prev_node)
            tmp_iog = dy.affine_transform([b_iog, Wx_iog, val, Wh_iog, h_tilde])
            for pred in prev_node:
                i_ft_list.append(dy.logistic(
                    dy.affine_transform([b_f, Wx_f, val, Wh_f, h[pred]])))
        i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
        i_aot = dy.pick_range(tmp_iog, self.hidden_dim, self.hidden_dim * 2)
        i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2, self.hidden_dim * 3)

        i_it = dy.logistic(i_ait)
        i_ot = dy.logistic(i_aot)
        i_gt = dy.tanh(i_agt)
        if len(prev_node) == 0:
            c[cur_node_id] = dy.cmult(i_it, i_gt)
        else:
            fc = dy.cmult(i_ft_list[0], c[prev_node[0]])
            for j in range(1, len(prev_node)):
                fc += dy.cmult(i_ft_list[j], c[prev_node[j]])
            c[cur_node_id] = fc + dy.cmult(i_it, i_gt)
        h_t = dy.cmult(i_ot, dy.tanh(c[cur_node_id]))
        if self.dropout_rate > 0.0 and self.train:
            h_t = dy.cmult(h_t, self.dropout_mask_h)
        h[cur_node_id] = h_t
        h_list.append(h_t)

    self._final_states = [transducers.FinalTransducerState(h_list[-1], h_list[-1])]
    return expression_seqs.ExpressionSequence(expr_list=h_list)
def _decoder_input_embedding(self, rnn_state, previous_triple, encoded_string, enc_state,
                             encoded_history, training=False, initial_state=None):
    attention_vecs = {}

    # Compute attention over the encoded string.
    utterance_attn, utterance_dist = attend(encoded_string, rnn_state.h()[-1],
                                            dy.parameter(self._utterance_attention_w),
                                            self._dropout if training else 0.)
    attention_vecs['utterance'] = utterance_dist

    # Key for state and history attention.
    attn_key = dy.concatenate([utterance_attn, rnn_state.h()[-1]])
    if training:
        attn_key = dy.dropout(attn_key, self._dropout)

    # Attend on history using current state and utterance attention.
    history_attn, history_dist = attend(encoded_history, attn_key,
                                        dy.parameter(self._history_attention_w),
                                        self._dropout if training else 0.)
    attention_vecs['history'] = history_dist

    # Attend on state.
    state_attn, state_dist = attend(enc_state, attn_key,
                                    dy.parameter(self._state_attention_w),
                                    self._dropout if training else 0.)
    state_attn2, state_dist2 = attend(enc_state, attn_key,
                                      dy.parameter(self._state_attention_w2),
                                      self._dropout if training else 0.)
    attention_vecs['state_1'] = state_dist
    attention_vecs['state_2'] = state_dist2

    # Compute the embedding of the previous triple.
    prev_emb = self._embed_predicted_triple(previous_triple)

    # Concatenate with history and state, and mix with a feed-forward layer.
    situated_embedding = dy.concatenate([utterance_attn, history_attn, state_attn,
                                         state_attn2, prev_emb])

    # Attend on the initial state (if provided).
    if self.args.feed_updated_state and self.args.always_initial_state:
        if not initial_state:
            raise ValueError("Encoding the initial state but it was not provided.")
        initial_attn, initial_dist = attend(initial_state, attn_key,
                                            dy.parameter(self._state_attention_winitial),
                                            self._dropout if training else 0.)
        initial_attn2, initial_dist2 = attend(initial_state, attn_key,
                                              dy.parameter(self._state_attention_winitial2),
                                              self._dropout if training else 0.)
        attention_vecs['initial_1'] = initial_dist
        attention_vecs['initial_2'] = initial_dist2
        situated_embedding = dy.concatenate([situated_embedding, initial_attn, initial_attn2])

    # Situated embedding mixing parameters.
    weights = dy.parameter(self._situated_w)
    biases = dy.parameter(self._situated_b)
    situated_embedding = dy.tanh(weights * situated_embedding + biases)

    return situated_embedding, attention_vecs
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []
    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0
    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]  # lengths of the target side, not of the (src, tgt) pairs
    max_tgt_len = np.max(tgt_len)
    masks = []
    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    # get the outputs of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()])
                   for x, y in LSTM_SRC.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])]
    src_output = src_outputs[-1]

    # get the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    # now decode; padding at the end of each sentence must be masked
    all_losses = []
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the previous words into the decoder
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        att_output, _ = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = dy.pickneglogsoftmax_batch(s, next_words)
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
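# calc_attention(...) is referenced above but defined elsewhere. A sketch of
# MLP attention matching the call signature and the cached
# fixed_attentional_component; the parameters w1_att_tgt_p and w2_att_p are
# assumptions.
def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    # combine the cached source projection with the projected decoder state
    a_t = dy.transpose(dy.tanh(dy.colwise_add(fixed_attentional_component,
                                              w1_att_tgt * tgt_output_embedding))) * w2_att
    alignment = dy.softmax(a_t)
    att_output = src_output_matrix * alignment
    return att_output, alignment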
def embed_sent(self, sent_len):
    # Select the first sent_len columns of the embedding parameter.
    embeddings = dy.strided_select(dy.parameter(self.embeddings), [1, 1],
                                   [0, 0], [self.emb_dim, sent_len])
    return expression_seqs.ExpressionSequence(expr_tensor=embeddings, mask=None)
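# The strided_select call above just slices the first sent_len columns out of
# the embedding matrix. A tiny self-contained demo (names are illustrative):
import dynet as dy

demo_pc = dy.ParameterCollection()
E = demo_pc.add_parameters((4, 5))   # a 4x5 "embedding" matrix
dy.renew_cg()
# strides [1, 1], start [0, 0], end [4, 3]: rows 0..3, columns 0..2
sub = dy.strided_select(dy.parameter(E), [1, 1], [0, 0], [4, 3])
print(sub.dim())                     # ((4, 3), 1)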
def one_word_loss(model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn,
                  lemma, feats, word, alphabet_index, aligned_pair, feat_index, feature_types):
    pc.renew_cg()

    # read the parameters
    R = pc.parameter(R)
    bias = pc.parameter(bias)

    padded_lemma = BEGIN_WORD + lemma + END_WORD

    # convert characters to matching embeddings
    lemma_char_vecs = encode_lemma(alphabet_index, char_lookup, padded_lemma)

    # convert features to matching embeddings, if UNK handle properly
    feat_vecs = encode_feats(feat_index, feat_lookup, feats, feature_types)
    feats_input = pc.concatenate(feat_vecs)

    blstm_outputs = bilstm_transduce(encoder_frnn, encoder_rrnn, lemma_char_vecs)

    # initialize the decoder rnn
    s = decoder_rnn.initial_state()

    # set prev_output_vec for the first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    loss = []

    # i is the input (lemma) index, j is the output (word) index
    i = 0
    j = 0

    # go through the alignments: advance j when a new output symbol is produced,
    # advance i when a new lemma character is consumed (i.e. not an ALIGN_SYMBOL)
    aligned_lemma, aligned_word = aligned_pair
    aligned_lemma += END_WORD
    aligned_word += END_WORD

    # run through the alignments
    for align_index, (input_char, output_char) in enumerate(zip(aligned_lemma, aligned_word)):
        # feedback, blstm[i], feats
        decoder_input = pc.concatenate([prev_output_vec, blstm_outputs[i], feats_input])

        # if we reached the end-of-word symbol
        if output_char == END_WORD:
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)
            # compute local loss
            loss.append(-pc.log(pc.pick(probs, alphabet_index[END_WORD])))
            continue

        # initially, if there is no prefix in the output (shouldn't delay on the current input), step forward
        # TODO: check if this condition can be removed entirely by adding '<' to both the aligned lemma/word
        if padded_lemma[i] == BEGIN_WORD and aligned_lemma[align_index] != ALIGN_SYMBOL:
            # perform rnn step
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)
            # compute local loss
            loss.append(-pc.log(pc.pick(probs, alphabet_index[STEP])))
            # prepare for the next iteration - "feedback"
            prev_output_vec = char_lookup[alphabet_index[STEP]]
            i += 1

        # if 0-to-1 or 1-to-1 alignment, compute loss for predicting the output symbol
        if aligned_word[align_index] != ALIGN_SYMBOL:
            decoder_input = pc.concatenate([prev_output_vec, blstm_outputs[i], feats_input])
            # feed the new input to the decoder
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)
            if aligned_word[align_index] in alphabet_index:
                current_loss = -pc.log(pc.pick(probs, alphabet_index[aligned_word[align_index]]))
                # prepare for the next iteration - "feedback"
                prev_output_vec = char_lookup[alphabet_index[aligned_word[align_index]]]
            else:
                current_loss = -pc.log(pc.pick(probs, alphabet_index[UNK]))
                # prepare for the next iteration - "feedback"
                prev_output_vec = char_lookup[alphabet_index[UNK]]
            loss.append(current_loss)
            j += 1

        # if the input is not exhausted and the next alignment is not 0-to-1, perform a step
        if i < len(padded_lemma) - 1 and aligned_lemma[align_index + 1] != ALIGN_SYMBOL:
            # perform rnn step
            # feedback, blstm[i], feats
            decoder_input = pc.concatenate([prev_output_vec, blstm_outputs[i], feats_input])
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)
            # compute local loss for the step action
            loss.append(-pc.log(pc.pick(probs, alphabet_index[STEP])))
            # prepare for the next iteration - "feedback"
            prev_output_vec = char_lookup[alphabet_index[STEP]]
            i += 1

    loss = pc.average(loss)
    return loss
def calc_scores(words):
    dy.renew_cg()
    b_sm_exp = dy.parameter(b_sm)
    # continuous bag of words: sum the per-word score vectors, then add the bias
    score = dy.esum([dy.lookup(W_sm, x) for x in words])
    return score + b_sm_exp
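# A hypothetical training step around calc_scores, assuming a `trainer` and a
# `train_data` list of (words, label) pairs; shows how the returned score
# vector is typically consumed.
for words, label in train_data:
    loss = dy.pickneglogsoftmax(calc_scores(words), label)
    loss.backward()
    trainer.update()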
def get_output(self, input, h_start=None, c_start=None):
    '''
    Apply the LSTM to a matrix or list of vectors.

    input   - a list of vectors of dimension d
    h_start - optional start state for continuation (default is to start at the beginning with h_0)
    c_start - optional start cell for continuation (default is to start at the beginning with c_0)
    '''
    W_f = parameter(self.W_f)
    U_f = parameter(self.U_f)
    b_f = parameter(self.b_f)
    W_i = parameter(self.W_i)
    U_i = parameter(self.U_i)
    b_i = parameter(self.b_i)
    W_o = parameter(self.W_o)
    U_o = parameter(self.U_o)
    b_o = parameter(self.b_o)
    W_c = parameter(self.W_c)
    U_c = parameter(self.U_c)
    b_c = parameter(self.b_c)

    h_0 = parameter(self.h_0) if h_start is None else h_start
    c_0 = parameter(self.c_0) if c_start is None else c_start

    hidden = []
    cell = []
    h_t = h_0
    c_t = c_0
    # walk the inputs left-to-right, or right-to-left for a reversed LSTM
    indices = range(len(input)) if not self.reverse else range(len(input) - 1, -1, -1)
    for i in indices:
        f_t = logistic(W_f * input[i] + U_f * h_t + b_f)
        i_t = logistic(W_i * input[i] + U_i * h_t + b_i)
        o_t = logistic(W_o * input[i] + U_o * h_t + b_o)
        c_t = cmult(f_t, c_t) + cmult(i_t, tanh(W_c * input[i] + U_c * h_t + b_c))
        h_t = cmult(o_t, tanh(c_t))
        hidden.append(h_t)
        cell.append(c_t)
    return [hidden, cell]
def cal_scores(self, src_encodings, predict=False):
    src_len = len(src_encodings)
    src_encodings = dy.concatenate_cols(src_encodings)  # src_ctx_dim, src_len, batch_size
    batch_size = src_encodings.dim()[1]

    W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head)
    b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head)
    W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep)
    b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep)

    W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head)
    b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head)
    W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep)
    b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep)

    U_arc_1 = dy.parameter(self.U_arc_1)
    u_arc_2 = dy.parameter(self.u_arc_2)
    U_label_1 = [dy.parameter(x) for x in self.U_label_1]
    u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1]
    u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2]
    b_label = [dy.parameter(x) for x in self.b_label]

    if predict:
        h_arc_head = self.leaky_ReLu(dy.affine_transform(
            [b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings]))  # n_arc_ml_units, src_len, bs
        h_arc_dep = self.leaky_ReLu(dy.affine_transform(
            [b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings]))
        h_label_head = self.leaky_ReLu(dy.affine_transform(
            [b_label_hidden_to_head, W_label_hidden_to_head, src_encodings]))
        h_label_dep = self.leaky_ReLu(dy.affine_transform(
            [b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings]))
    else:
        src_encodings = dy.dropout_dim(src_encodings, 1, self.arc_mlp_dropout)
        h_arc_head = dy.dropout_dim(self.leaky_ReLu(dy.affine_transform(
            [b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings])),
            1, self.arc_mlp_dropout)  # n_arc_ml_units, src_len, bs
        h_arc_dep = dy.dropout_dim(self.leaky_ReLu(dy.affine_transform(
            [b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings])),
            1, self.arc_mlp_dropout)
        h_label_head = dy.dropout_dim(self.leaky_ReLu(dy.affine_transform(
            [b_label_hidden_to_head, W_label_hidden_to_head, src_encodings])),
            1, self.label_mlp_dropout)
        h_label_dep = dy.dropout_dim(self.leaky_ReLu(dy.affine_transform(
            [b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings])),
            1, self.label_mlp_dropout)

    h_arc_head_transpose = dy.transpose(h_arc_head)
    h_label_head_transpose = dy.transpose(h_label_head)

    # biaffine arc scores: s_arc[head, dep]
    s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep, u_arc_2)

    s_label = []
    for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2, b_label):
        e1 = h_label_head_transpose * U_1 * h_label_dep
        e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len))
        e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
        s_label.append(e1 + e2 + e3 + b)
    return s_arc, s_label
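# At predict time s_arc is a (src_len x src_len) matrix with heads on the rows
# and dependents on the columns, so unlabeled heads can be read off with a
# column-wise argmax. A sketch (the `parser` handle and greedy decoding are
# assumptions; a real parser would usually run a tree-constrained decoder):
s_arc, s_label = parser.cal_scores(src_encodings, predict=True)
arc_scores = s_arc.npvalue()            # rows = candidate heads, cols = dependents
pred_heads = arc_scores.argmax(axis=0)  # greedy head choice per dependent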
def build_tagging_graph(self, words, ltags):
    # parameters -> expressions
    self.w1 = dy.parameter(self.W1)
    self.b1 = dy.parameter(self.B1)
    self.w2 = dy.parameter(self.W2)
    self.b2 = dy.parameter(self.B2)

    self.xw1 = dy.parameter(self.xW1)
    self.xb1 = dy.parameter(self.xB1)
    self.xw2 = dy.parameter(self.xW2)
    self.xb2 = dy.parameter(self.xB2)

    # apply dropout
    if self.eval:
        self.disable_dropout()
    else:
        self.enable_dropout()

    # initialize the RNNs
    f_init = self.fwdRNN.initial_state()
    b_init = self.bwdRNN.initial_state()
    f2_init = self.fwdRNN2.initial_state()
    b2_init = self.bwdRNN2.initial_state()

    self.hcf_init = self.hcfwdRNN.initial_state()
    self.hcb_init = self.hcbwdRNN.initial_state()
    self.ecf_init = self.ecfwdRNN.initial_state()
    self.ecb_init = self.ecbwdRNN.initial_state()

    xf_init = self.xfwdRNN.initial_state()
    xb_init = self.xbwdRNN.initial_state()
    xf2_init = self.xfwdRNN2.initial_state()
    xb2_init = self.xbwdRNN2.initial_state()

    self.xhcf_init = self.xhcfwdRNN.initial_state()
    self.xhcb_init = self.xhcbwdRNN.initial_state()
    self.xecf_init = self.xecfwdRNN.initial_state()
    self.xecb_init = self.xecbwdRNN.initial_state()

    # get the word vectors; word_rep(...) returns a 128-dim vector expression for each word
    wembs = [self.word_rep(w, l) for w, l in zip(words, ltags)]
    cembs = [self.char_rep(w, self.hcf_init, self.hcb_init, self.ecf_init, self.ecb_init, l)
             for w, l in zip(words, ltags)]
    xembs = [dy.concatenate([w, c]) for w, c in zip(wembs, cembs)]

    # feed word vectors into the first biLSTM
    fw_exps = f_init.transduce(xembs)
    bw_exps = b_init.transduce(reversed(xembs))

    # biLSTM states
    bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]

    # second biLSTM layer
    fw_exps = f2_init.transduce(bi_exps)
    bw_exps = b2_init.transduce(reversed(bi_exps))

    # biLSTM states
    bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]

    # feed each biLSTM state to an MLP to get POS hidden representations
    pos_hidden = []
    for xi in bi_exps:
        xh = self.w1 * xi
        #xh = self.meta.activation(xh) + self.b1
        pos_hidden.append(xh)

    cembs = [self.char_rep(w, self.xhcf_init, self.xhcb_init, self.xecf_init, self.xecb_init, l)
             for w, l in zip(words, ltags)]
    xembs = [dy.concatenate([w, c, p]) for w, c, p in zip(wembs, cembs, pos_hidden)]

    xfw_exps = xf_init.transduce(xembs)
    xbw_exps = xb_init.transduce(reversed(xembs))

    # biLSTM states
    bi_exps = [dy.concatenate([f, b]) for f, b in zip(xfw_exps, reversed(xbw_exps))]

    # second biLSTM layer of the second stack
    fw_exps = xf2_init.transduce(bi_exps)
    bw_exps = xb2_init.transduce(reversed(bi_exps))

    # biLSTM states
    bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]

    exps = []
    for xi in bi_exps:
        xh = self.xw1 * xi
        xh = self.meta.activation(xh) + self.xb1
        xo = self.xw2 * xh + self.xb2
        exps.append(xo)
    return exps
def predict_output_sequence(model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn,
                            decoder_rnn, lemma, feats, alphabet_index, inverse_alphabet_index,
                            feat_index, feature_types):
    pc.renew_cg()

    # read the parameters
    R = pc.parameter(R)
    bias = pc.parameter(bias)

    # convert characters to matching embeddings, if UNK handle properly
    padded_lemma = BEGIN_WORD + lemma + END_WORD
    lemma_char_vecs = encode_lemma(alphabet_index, char_lookup, padded_lemma)

    # convert features to matching embeddings, if UNK handle properly
    feat_vecs = encode_feats(feat_index, feat_lookup, feats, feature_types)
    feats_input = pc.concatenate(feat_vecs)

    blstm_outputs = bilstm_transduce(encoder_frnn, encoder_rrnn, lemma_char_vecs)

    # initialize the decoder rnn
    s = decoder_rnn.initial_state()

    # set prev_output_vec for the first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]

    # i is the input index
    i = 0
    num_outputs = 0
    predicted_output_sequence = []

    # run the decoder through the sequence and predict characters; allow up to
    # three times the max prediction length, since STEP outputs are interleaved
    while num_outputs < MAX_PREDICTION_LEN * 3:
        # prepare the input vector and perform an LSTM step
        decoder_input = pc.concatenate([prev_output_vec, blstm_outputs[i], feats_input])
        s = s.add_input(decoder_input)

        # compute the softmax probability vector and predict with argmax
        decoder_rnn_output = s.output()
        probs = pc.softmax(R * decoder_rnn_output + bias)
        probs = probs.vec_value()
        predicted_output_index = common.argmax(probs)
        predicted_output = inverse_alphabet_index[predicted_output_index]
        predicted_output_sequence.append(predicted_output)

        # on a STEP output, advance the input index
        if predicted_output == STEP:
            if i < len(padded_lemma) - 1:
                i += 1
        num_outputs += 1

        # check if we reached the end of the word
        if predicted_output_sequence[-1] == END_WORD:
            break

        # prepare for the next iteration - "feedback"
        prev_output_vec = char_lookup[predicted_output_index]

    # strip the end-of-word symbol
    return u''.join(predicted_output_sequence[0:-1])
def initialize_graph_nodes(self):
    # convert parameters to expressions
    self.pad = dy.parameter(self.PAD)

    self.ps_W1 = dy.parameter(self.ps_pW1)
    self.ps_b1 = dy.parameter(self.ps_pb1)
    self.ps_W2 = dy.parameter(self.ps_pW2)
    self.ps_b2 = dy.parameter(self.ps_pb2)

    self.pr_W1 = dy.parameter(self.pr_pW1)
    self.pr_b1 = dy.parameter(self.pr_pb1)
    self.pr_W2 = dy.parameter(self.pr_pW2)
    self.pr_b2 = dy.parameter(self.pr_pb2)

    #######################################
    self.xpad = dy.parameter(self.XPAD)

    self.xps_W1 = dy.parameter(self.xps_pW1)
    self.xps_b1 = dy.parameter(self.xps_pb1)
    self.xps_W2 = dy.parameter(self.xps_pW2)
    self.xps_b2 = dy.parameter(self.xps_pb2)

    self.xpr_W1 = dy.parameter(self.xpr_pW1)
    self.xpr_b1 = dy.parameter(self.xpr_pb1)
    self.xpr_W2 = dy.parameter(self.xpr_pW2)
    self.xpr_b2 = dy.parameter(self.xpr_pb2)

    # apply dropout
    if self.eval:
        self.disable_dropout()
    else:
        self.enable_dropout()

    # initialize the RNNs
    self.f_init = self.fwdRNN.initial_state()
    self.b_init = self.bwdRNN.initial_state()

    self.cf_init_eng = self.ecfwdRNN.initial_state()
    self.cb_init_eng = self.ecbwdRNN.initial_state()
    self.cf_init_bh = self.bhcfwdRNN.initial_state()
    self.cb_init_bh = self.bhcbwdRNN.initial_state()

    ################################################
    self.xcf_init_eng = self.xecfwdRNN.initial_state()
    self.xcb_init_eng = self.xecbwdRNN.initial_state()
    self.xcf_init_bh = self.xbhcfwdRNN.initial_state()
    self.xcb_init_bh = self.xbhcbwdRNN.initial_state()

    self.ps_f_init = self.ps_fwdRNN.initial_state()
    self.ps_b_init = self.ps_bwdRNN.initial_state()
    ###############################################
    self.xps_f_init = self.xps_fwdRNN.initial_state()
    self.xps_b_init = self.xps_bwdRNN.initial_state()

    self.pr_f_init = self.pr_fwdRNN.initial_state()
    self.pr_b_init = self.pr_bwdRNN.initial_state()
    ###############################################
    self.xpr_f_init = self.xpr_fwdRNN.initial_state()
    self.xpr_b_init = self.xpr_bwdRNN.initial_state()
def translate_sentence_beam(self, sent):
    dy.renew_cg()

    W_y = dy.parameter(self.W_y)
    b_y = dy.parameter(self.b_y)

    sent_rev = list(reversed(sent))

    # Bidirectional representations
    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for (cw_l2r, cw_r2l) in zip(sent, sent_rev):
        l2r_state = l2r_state.add_input(dy.lookup(self.src_lookup, self.src_token_to_id[cw_l2r]))
        r2l_state = r2l_state.add_input(dy.lookup(self.src_lookup, self.src_token_to_id[cw_r2l]))
        l2r_contexts.append(l2r_state.output())
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    h_fs = []
    for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
        h_fs.append(dy.concatenate([l2r_i, r2l_i]))
    h_fs_matrix = dy.concatenate_cols(h_fs)

    # Decoder
    c_t = dy.vecInput(self.hidden_size * 2)
    start = dy.concatenate([dy.lookup(self.tgt_lookup, self.tgt_token_to_id['<S>']), c_t])
    dec_state = self.dec_builder.initial_state().add_input(start)
    sentence_dict = defaultdict(lambda: defaultdict(list))
    end_sign = self.tgt_token_to_id['</S>']

    # first iteration: expand <S> into the initial beam
    h_e = dec_state.output()
    c_t = self.__attention_mlp(h_fs_matrix, h_e)
    embed_t = dy.lookup(self.tgt_lookup, self.tgt_token_to_id['<S>'])
    x_t = dy.concatenate([embed_t, c_t])
    dec_state = dec_state.add_input(x_t)
    y_star = b_y + W_y * dec_state.output()
    p = dy.log(dy.softmax(y_star)).npvalue()
    cws = np.argpartition(-p, self.beam_size)[:self.beam_size]
    history_path = p[cws]

    trans_iter = 0
    for i in range(self.beam_size):
        sentence_dict[trans_iter][i] = [self.tgt_id_to_token[cws[i]]]
    try:
        # if </S> is already in the initial beam, the translation is empty
        index = np.where(cws == end_sign)[0][0]
        return ''
    except IndexError:
        pass

    while trans_iter < self.max_len:
        h_e = dec_state.output()
        c_t = self.__attention_mlp(h_fs_matrix, h_e)
        embed_t = dy.lookup_batch(self.tgt_lookup, cws)
        x_t = dy.concatenate([embed_t, c_t])
        dec_state = dec_state.add_input(x_t)

        # log probabilities for every beam entry, plus the path scores so far
        y_stars = dy.log(dy.softmax(b_y + W_y * dec_state.output())).npvalue()
        y_stars += history_path
        y_star_all = np.reshape(y_stars.T, (self.beam_size * self.tgt_vocab_size,))

        beam_indexes = np.argpartition(-y_star_all, self.beam_size)[:self.beam_size]
        # integer division: which beam entry each flat index came from
        beam_sentence_index = [index // self.tgt_vocab_size for index in beam_indexes]
        cws = [index % self.tgt_vocab_size for index in beam_indexes]
        history_path = y_star_all[beam_indexes]
        trans_iter += 1

        # if any hypothesis ends with </S>, return the best-scoring one
        max_score = -np.inf
        find_end_sign = False
        for i, cw in enumerate(cws):
            if cw == end_sign:
                find_end_sign = True
                if max_score < history_path[i]:
                    max_score = history_path[i]
                    max_index = i
        if find_end_sign:
            return ' '.join(sentence_dict[trans_iter - 1][beam_sentence_index[max_index]])

        for i in range(self.beam_size):
            sentence_dict[trans_iter][i] = (sentence_dict[trans_iter - 1][beam_sentence_index[i]]
                                            + [self.tgt_id_to_token[cws[i]]])

    # no hypothesis ended with </S> within max_len; return the best path so far
    return ' '.join(sentence_dict[trans_iter][np.argmax(history_path)])
def _get_probs(self, rnn_output):
    output_w = dy.parameter(self.output_w)
    output_b = dy.parameter(self.output_b)
    probs = dy.softmax(output_w * rnn_output + output_b)
    return probs
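# Hypothetical call site for _get_probs (rnn_state and the surrounding model
# are assumed): greedily pick the next symbol id.
probs = self._get_probs(rnn_state.output())
next_id = int(np.argmax(probs.npvalue()))  # greedy; sampling from the vector also works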
# regular lookup
a = lp[1].npvalue()
b = lp[2].npvalue()
c = lp[3].npvalue()

# batch lookup instead of single elements.
# two ways of doing this.
abc1 = dy.lookup_batch(lp, [1, 2, 3])
print(abc1.npvalue())

abc2 = lp.batch([1, 2, 3])
print(abc2.npvalue())

print(np.hstack([a, b, c]))

# use pick and pickneglogsoftmax in batch mode
# (must be used in conjunction with lookup_batch):
print("\nPick")
W = dy.parameter(m.add_parameters((5, 10)))
h = W * lp.batch([1, 2, 3])
print(h.npvalue())
print(dy.pick_batch(h, [1, 2, 3]).npvalue())
print(dy.pick(W * lp[1], 1).value(),
      dy.pick(W * lp[2], 2).value(),
      dy.pick(W * lp[3], 3).value())

# using pickneglogsoftmax_batch
print("\nPick neg log softmax")
print((-dy.log(dy.softmax(h))).npvalue())
print(dy.pickneglogsoftmax_batch(h, [1, 2, 3]).npvalue())
# number of layers in `RNN`
num_layers = 1

# pretrained embeddings
embedding_dim = emb_matrix_pretrained.shape[1]
embedding_parameters = RNN_model.lookup_parameters_from_numpy(emb_matrix_pretrained)

# add RNN unit
RNN_unit = dy.VanillaLSTMBuilder(num_layers, embedding_dim, hidden_size, RNN_model)

# add projection layer
# W (hidden x num_labels)
pW = RNN_model.add_parameters((hidden_size, len(list(l2i.keys()))))
dy.parameter(pW).npvalue().shape  # inspect the shape (a no-op outside a notebook)

# b (1 x num_labels)
pb = RNN_model.add_parameters((len(list(l2i.keys()))))
# note: we give just one dimension (ignoring the "1" dimension);
# this makes manipulating the shapes in forward_pass() easier
dy.parameter(pb).npvalue().shape  # inspect the shape (a no-op outside a notebook)

RNN_model.populate("trained.model")

batch_size = 256
num_batches_testing = int(np.ceil(len(test_tokens) / batch_size))

predictions = test()
overall_accuracy, unknown_accuracy = evaluate(predictions, test_labels, unknown_index)
endtime = datetime.datetime.now()
trainer = dy.AdamTrainer(model)
trainer.set_clip_threshold(-1.0)
trainer.set_sparse_updates(True if args.SPARSE == 1 else False)

print("startup time: %r" % (time.time() - start))
sents = 0
all_time = 0
for ITER in range(100):
    random.shuffle(train)
    closs = 0.0
    cwords = 0
    start = time.time()
    batch = []
    for i, tree in enumerate(train, 1):
        sents += 1
        W = dy.parameter(W_)
        h, c = builder.expr_for_tree(tree, True)
        nodes = tree.nonterms()
        losses = [dy.pickneglogsoftmax(W * nt._e, l2i[nt.label]) for nt in nodes]
        loss = dy.esum(losses)
        batch.append(loss)
        if len(batch) == 50:
            loss = dy.esum(batch)
            closs += loss.value()
            cwords += len(nodes)
            loss.backward()
            trainer.update()
            batch = []
            dy.renew_cg()
        if sents % 1000 == 0:
            trainer.status()
def beam_search_decode(self, lemma, feats, external_cg=True, unk_avg=True, beam_width=4):
    # Returns the k best complete hypotheses: each is a tuple of
    # (log prob, log prob as a dynet expression, generated word, action sequence).

    def _valid_actions(encoder):
        valid_actions = []
        if len(encoder) > 1:
            valid_actions += [COPY, DELETE]
        else:
            valid_actions += [END_WORD]
        valid_actions += self.INSERTS
        return valid_actions

    if not external_cg:
        dy.renew_cg()

    # vectorize lemma
    lemma_enc = self._build_lemma(lemma, unk_avg, is_training=False)

    # vectorize features
    features = self._build_features(*feats)

    # add encoder and decoder to computation graph
    encoder = Encoder(self.fbuffRNN, self.bbuffRNN)
    decoder = self.wordRNN.initial_state()

    # encoder is a stack that pops lemma characters and their
    # representations from the top
    encoder.transduce(lemma_enc, lemma)

    # add classifier to computation graph
    if self.MLP_DIM:
        # decoder output to hidden
        W_s2h = dy.parameter(self.pW_s2h)
        b_s2h = dy.parameter(self.pb_s2h)
    # hidden to action
    W_act = dy.parameter(self.pW_act)
    b_act = dy.parameter(self.pb_act)

    encoder.pop()  # BEGIN_WORD_CHAR

    # a list of tuples:
    # (decoder state, encoder state, list of previous actions,
    #  log prob of previous actions, log prob of previous actions as dynet object,
    #  word generated so far)
    beam = [(decoder, encoder, [COPY], 0., 0., [])]

    beam_length = 0
    complete_hypotheses = []

    while beam_length <= MAX_ACTION_SEQ_LEN:
        if not beam or beam_width == 0:
            break

        # compute the probability of each valid action for every hypothesis on
        # the beam and collect the candidate expansions
        expansion = []
        for decoder, encoder, prev_actions, log_p, log_p_expr, word in beam:
            valid_actions = _valid_actions(encoder)

            # decoder
            decoder_input = dy.concatenate([encoder.embedding(),
                                            features,
                                            self.ACT_LOOKUP[prev_actions[-1]]])
            decoder = decoder.add_input(decoder_input)

            # classifier
            if self.double_feats:
                classifier_input = dy.concatenate([decoder.output(), features])
            else:
                classifier_input = decoder.output()
            if self.MLP_DIM:
                h = self.NONLIN(W_s2h * classifier_input + b_s2h)
            else:
                h = classifier_input
            logits = W_act * h + b_act
            log_probs_expr = dy.log_softmax(logits, valid_actions)
            log_probs = log_probs_expr.npvalue()
            top_actions = np.argsort(log_probs)[-beam_width:]

            prev_actions_int = list(int(x) for x in prev_actions)
            top_actions_int = list(int(x) for x in top_actions)
            expansion.extend(((decoder, encoder.copy(), list(prev_actions_int), a,
                               log_p + log_probs[a], log_p_expr + log_probs_expr[a],
                               list(word))
                              for a in list(top_actions_int)))

        # keep the beam_width best expansions
        beam = []
        expansion.sort(key=lambda e: e[4])
        for e in expansion[-beam_width:]:
            decoder, encoder, prev_actions, action, log_p, log_p_expr, word = e
            prev_actions.append(action)

            # execute the action to update the transducer state
            if action == END_WORD:
                # 1. Finish transduction:
                #    * beam width should be decremented
                #    * the expansion is taken off the beam and
                #      stored in the final hypotheses set
                beam_width -= 1
                complete_hypotheses.append((log_p, log_p_expr, ''.join(word), prev_actions))
            else:
                if action == COPY:
                    # 1. Increment attention index
                    char_ = encoder.pop()
                    # 2. Append copied character to the output word
                    word.append(self.vocab.char.i2w[char_])
                elif action == DELETE:
                    # 1. Increment attention index
                    encoder.pop()
                else:
                    # one of the INSERT actions
                    assert action in self.INSERTS
                    # 1. Append inserted character to the output word
                    char_ = self.vocab.act.i2w[action]
                    word.append(char_)
                beam.append((decoder, encoder, prev_actions, log_p, log_p_expr, word))

        beam_length += 1

    if not complete_hypotheses:
        # no hypothesis finished with END_WORD; fall back to whatever is on the beam
        complete_hypotheses = [(log_p, log_p_expr, ''.join(word), prev_actions)
                               for _, _, prev_actions, log_p, log_p_expr, word in beam]

    complete_hypotheses.sort(key=lambda h: h[0], reverse=True)
    return complete_hypotheses
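# Hypothetical usage of beam_search_decode (the transducer instance and its
# inputs are assumed here): hypotheses come back sorted best-first.
hyps = transducer.beam_search_decode(lemma, feats, external_cg=False, beam_width=4)
best_log_p, _, best_word, best_actions = hyps[0]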
    network_nodes[n] = s
    return network_nodes, nodes[-1]

m = dy.ParameterCollection()
initial_values = [0.2, 0.8]
p = {}
for idx, val in enumerate(initial_values):
    p[idx] = m.add_parameters((1), init=dy.ConstInitializer(val))
trainer = dy.AdamTrainer(m, alpha=0.01)

for i in range(1, 4):
    print("\nSTART of Epoch", i, "\n")
    counts = {}
    for idx in p:
        print("rule", idx, "prob:", dy.parameter(p[idx]).value())
        counts[idx] = 0.0
    print()
    for elem in corpus:
        hypergraph = build_hypergraph_rec(elem, {})
        print("hypergraph for", elem, ":", hypergraph)
        network, output = build_network(p, hypergraph)
        loss = 1 * network[output]
        loss.backward()
        for n in range(1, output + 1):
            print("node", n, "value", network[n].value(), "gradient", network[n].gradient())
        for r in p:
            rv = dy.parameter(p[r])
            count = (rv.gradient() * rv.value() / loss.value())[0]