def attend(blstm_outputs, h_t, W_c, v_a, W__a, U__a):
    # iterate through the input states to compute unnormalized attention scores
    # (an alternative concat-based scorer would be:
    #  scores = [W_a * pc.concatenate([h_t, h_input]) for h_input in blstm_outputs])
    scores = [v_a * pc.tanh(W__a * h_t + U__a * h_input) for h_input in blstm_outputs]

    # normalize the scores to alphas using softmax
    alphas = pc.softmax(pc.concatenate(scores))

    # compute the context vector c as the alpha-weighted sum of the encoder states
    # (an equivalent column-stacked implementation:
    #  stacked_alphas = pc.concatenate_cols([alphas for j in xrange(dim)])
    #  stacked_vecs = pc.concatenate_cols([h_input for h_input in blstm_outputs])
    #  c = pc.esum(pc.cwise_multiply(stacked_vecs, stacked_alphas)))
    c = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])

    # compute the output state h~ from c and the decoder's h_t
    # (global attention variant of Luong et al., 2015)
    h_output = pc.tanh(W_c * pc.concatenate([h_t, c]))
    return h_output, alphas, W__a.value()
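The scorer above is additive (MLP) attention, score_j = v_a * tanh(W_a h_t + U_a h_j) with a softmax over j, followed by a Luong-style output layer h~ = tanh(W_c [h_t; c]). A minimal self-contained sketch of calling it, assuming `pc` aliases the DyNet module as the snippet suggests; all sizes and input values below are illustrative, not from the original:

import dynet as pc  # the snippet above appears to alias the DyNet module as `pc`

HID = 8  # illustrative hidden size
m = pc.ParameterCollection()
p_W_c = m.add_parameters((HID, 2 * HID))
p_v_a = m.add_parameters((1, HID))
p_W_a = m.add_parameters((HID, HID))
p_U_a = m.add_parameters((HID, HID))

pc.renew_cg()
W_c, v_a, W_a, U_a = (pc.parameter(p) for p in (p_W_c, p_v_a, p_W_a, p_U_a))
blstm_outputs = [pc.inputVector([0.1 * j] * HID) for j in range(1, 5)]  # stand-in encoder states
h_t = pc.inputVector([0.2] * HID)                                       # stand-in decoder state
h_out, alphas, _ = attend(blstm_outputs, h_t, W_c, v_a, W_a, U_a)
print(alphas.npvalue())  # four attention weights that sum to 1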
def generate(in_seq, enc_fwd_lstm, enc_bwd_lstm, dec_lstm):
    embedded = embed_sentence(in_seq)
    encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded)

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(encoded)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(
        dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))

    out = ''
    count_EOS = 0
    for i in range(len(in_seq) * 2):
        if count_EOS == 2:
            break
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector).vec_value()
        next_char = probs.index(max(probs))
        last_output_embeddings = output_lookup[next_char]
        if int2char[next_char] == EOS:
            count_EOS += 1
            continue
        out += int2char[next_char]
    return out
def word_repr(self, char_seq):
    # obtain the word representation when given its character sequence
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        self.param_exprs['rgW%d' % wlen] = dy.parameter(self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(self.params['com_b'][wlen - 1])
        self.param_exprs['ugW%d' % wlen] = dy.parameter(self.params['update_gate_W'][wlen - 1])
        self.param_exprs['ugb%d' % wlen] = dy.parameter(self.params['update_gate_b'][wlen - 1])

    chars = dy.concatenate(char_seq)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars + self.param_exprs['rgb%d' % wlen])
    comb = dy.concatenate([dy.tanh(self.param_exprs['cW%d' % wlen] * dy.cmult(reset_gate, chars) +
                                   self.param_exprs['cb%d' % wlen]), chars])
    update_logits = self.param_exprs['ugW%d' % wlen] * comb + self.param_exprs['ugb%d' % wlen]
    update_gate = dy.transpose(dy.concatenate_cols(
        [dy.softmax(dy.pickrange(update_logits, i * (wlen + 1), (i + 1) * (wlen + 1)))
         for i in xrange(self.options['ndims'])]))
    # The following implementation of the softmax function is not safe, but faster:
    # exp_update_logits = dy.exp(dy.reshape(update_logits, (self.options['ndims'], wlen + 1)))
    # update_gate = dy.cdiv(exp_update_logits, dy.concatenate_cols([dy.sum_cols(exp_update_logits)] * (wlen + 1)))
    # assert (not np.isnan(update_gate.npvalue()).any())
    word = dy.sum_cols(dy.cmult(update_gate, dy.reshape(comb, (self.options['ndims'], wlen + 1))))
    return word
def generate(input, enc_fwd_lstm, enc_bwd_lstm, dec_lstm):
    def sample(probs):
        rnd = random.random()
        for i, p in enumerate(probs):
            rnd -= p
            if rnd <= 0:
                break
        return i

    embedded = embed_sentence(input)
    encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded)

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(
        dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    out = ''
    count_EOS = 0
    for i in range(len(input) * 2):
        if count_EOS == 2:
            break
        vector = dy.concatenate([attend(encoded, s), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        probs = probs.vec_value()
        next_char = sample(probs)
        last_output_embeddings = output_lookup[next_char]
        if int2char[next_char] == EOS:
            count_EOS += 1
            continue
        out += int2char[next_char]
    return out
def learn(self, batch_size):
    exps = self.memory.sample(batch_size)
    obss, actions, rewards, obs_nexts, dones = self._process(exps)

    # Update critic
    dy.renew_cg()
    target_actions = self.actor_target(obs_nexts, batched=True)
    target_values = self.critic_target(
        dy.concatenate([dy.inputTensor(obs_nexts, batched=True), target_actions]), batched=True)
    target_values = rewards + 0.99 * target_values.npvalue() * (1 - dones)

    dy.renew_cg()
    values = self.critic(np.concatenate([obss, actions]), batched=True)
    loss = dy.mean_batches((values - dy.inputTensor(target_values, batched=True)) ** 2)
    loss_value_critic = loss.npvalue()
    loss.backward()
    self.trainer_critic.update()

    # Update actor
    dy.renew_cg()
    actions = self.actor(obss, batched=True)
    obs_and_actions = dy.concatenate([dy.inputTensor(obss, batched=True), actions])
    loss = -dy.mean_batches(self.critic(obs_and_actions, batched=True))
    loss_value_actor = loss.npvalue()
    loss.backward()
    self.trainer_actor.update()

    # Decay the exploration noise down to its lower bound
    self.noise_stddev = (self.noise_stddev - self.noise_stddev_decrease
                         if self.noise_stddev > self.noise_stddev_lower
                         else self.noise_stddev_lower)

    # Soft-update the target networks
    self.actor_target.update(self.actor, soft=True)
    self.critic_target.update(self.critic, soft=True)
    return loss_value_actor + loss_value_critic
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(vectors)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(
        dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    loss = []

    for char in output:
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
def generate(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent

    # get the output of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()])
                   for x, y in LSTM_SRC.add_inputs([LOOKUP_SRC[word] for word in src])]
    src_output = src_outputs[-1]

    # get the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    # generate until an EOS tag or the maximum length is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = sos_trg
    trg_sent = []
    attention_matrix = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)
    for i in range(MAX_SENT_SIZE):
        # feed the previous word into the LSTM, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        att_output, alignment = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        attention_matrix.append(alignment)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        probs = (-dy.log_softmax(s)).value()
        # probs holds *negative* log-probabilities, so the most likely word is the argmin
        next_word = np.argmin(probs)
        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent, dy.concatenate_cols(attention_matrix).value()
def evaluate(self, inputs, train=False):
    """
    Apply all MLP layers to concatenated input
    :param inputs: (key, vector) per feature type
    :param train: are we training now?
    :return: output vector of size self.output_dim
    """
    input_keys, inputs = list(map(list, zip(*list(inputs))))
    if self.input_keys:
        assert input_keys == self.input_keys, "Got: %s\nBut expected input keys: %s" % (
            self.input_keys_str(self.input_keys), self.input_keys_str(input_keys))
    else:
        self.input_keys = input_keys
    if self.gated:
        gates = self.params.get("gates")
        if gates is None:  # FIXME attention weights should not be just parameters, but based on biaffine product?
            gates = self.params["gates"] = self.model.add_parameters(
                (len(inputs), self.gated), init=dy.UniformInitializer(1))
        input_dims = [i.dim()[0][0] for i in inputs]
        max_dim = max(input_dims)
        x = dy.concatenate_cols([dy.concatenate([i, dy.zeroes(max_dim - d)])  # Pad with zeros to get uniform dim
                                 if d < max_dim else i
                                 for i, d in zip(inputs, input_dims)]) * gates
        # Possibly multiple "attention heads" -- concatenate outputs to one vector
        inputs = [dy.reshape(x, (x.dim()[0][0] * x.dim()[0][1],))]
    x = dy.concatenate(inputs)
    assert len(x.dim()[0]) == 1, "Input should be a vector, but has dimension " + str(x.dim()[0])
    dim = x.dim()[0][0]
    if self.input_dim:
        assert dim == self.input_dim, "Input dim mismatch: %d != %d" % (dim, self.input_dim)
    else:
        self.init_params(dim)
    self.config.print(self, level=4)
    if self.total_layers:
        if self.weights is None:
            self.weights = [[self.params[prefix + str(i)] for prefix in ("W", "b")]
                            for i in range(self.total_layers)]
            if self.weights[0][0].dim()[0][1] < dim:  # number of columns in W0
                self.weights[0][0] = dy.concatenate_cols([self.weights[0][0], self.params["W0+"]])
        for i, (W, b) in enumerate(self.weights):
            self.config.print(lambda: x.npvalue().tolist(), level=4)
            try:
                if train and self.dropout:
                    x = dy.dropout(x, self.dropout)
                x = self.activation()(W * x + b)
            except ValueError as e:
                raise ValueError("Error in evaluating layer %d of %d" % (i + 1, self.total_layers)) from e
    self.config.print(lambda: x.npvalue().tolist(), level=4)
    return x
def calc_scores_with_previous_tag(words, referent_tags=None):
    """
    Calculate scores using the previous tag as input. If referent tags are
    provided, the previous tag is sampled from either the previous referent
    tag or the previous system prediction.
    :param words: sequence of word ids
    :param referent_tags: optional gold tags to sample from
    :return: list of per-word score expressions
    """
    dy.renew_cg()
    word_embs = [LOOKUP[x] for x in words]

    # Transduce all batch elements for the backward LSTM, using the original word embeddings.
    bwd_init = bwdLSTM.initial_state()
    bwd_word_reps = bwd_init.transduce(reversed(word_embs))

    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)
    scores = []

    # Transduce one by one for the forward LSTM
    fwd_init = fwdLSTM.initial_state()
    s_fwd = fwd_init
    prev_tag = start_tag
    index = 0
    for word, bwd_word_rep in zip(word_embs, reversed(bwd_word_reps)):
        # Concatenate word and tag representations, just as in training.
        fwd_input = dy.concatenate([word, TAG_LOOKUP[prev_tag]])
        s_fwd = s_fwd.add_input(fwd_input)
        combined_rep = dy.concatenate([s_fwd.output(), bwd_word_rep])
        score = dy.affine_transform([b, W, combined_rep])
        prediction = np.argmax(score.npvalue())
        if referent_tags:
            if sampler.sample_true():
                prev_tag = referent_tags[index]
            else:
                prev_tag = prediction
            index += 1
        else:
            prev_tag = prediction
        scores.append(score)
    return scores
def calc_predict_and_activations(wids, tag, words):
    dy.renew_cg()
    if len(wids) < WIN_SIZE:
        wids += [0] * (WIN_SIZE - len(wids))

    cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1)
    cnn_out = dy.conv2d_bias(cnn_in, W_cnn, b_cnn, stride=(1, 1), is_valid=False)
    filters = (dy.reshape(cnn_out, (len(wids), FILTER_SIZE))).npvalue()
    activations = filters.argmax(axis=0)

    pool_out = dy.max_dim(cnn_out, d=1)
    pool_out = dy.reshape(pool_out, (FILTER_SIZE,))
    pool_out = dy.rectify(pool_out)

    scores = (W_sm * pool_out + b_sm).npvalue()
    print('%d ||| %s' % (tag, ' '.join(words)))
    predict = np.argmax(scores)
    print(display_activations(words, activations))
    print('scores=%s, predict: %d' % (scores, predict))

    features = pool_out.npvalue()
    W = W_sm.npvalue()
    bias = b_sm.npvalue()
    print('  bias=%s' % bias)
    contributions = W * features
    print(' very bad (%.4f): %s' % (scores[0], contributions[0]))
    print('      bad (%.4f): %s' % (scores[1], contributions[1]))
    print('  neutral (%.4f): %s' % (scores[2], contributions[2]))
    print('     good (%.4f): %s' % (scores[3], contributions[3]))
    print('very good (%.4f): %s' % (scores[4], contributions[4]))
def expr_for_tree(self, tree):
    if tree.isleaf():
        return self.E[self.w2i.get(tree.label, 0)]
    if len(tree.children) == 1:
        assert tree.children[0].isleaf()
        emb = self.expr_for_tree(tree.children[0])
        Wi, Wo, Wu = [dy.parameter(w) for w in self.WS]
        bi, bo, bu, _ = [dy.parameter(b) for b in self.BS]
        i = dy.logistic(Wi * emb + bi)
        o = dy.logistic(Wo * emb + bo)
        u = dy.tanh(Wu * emb + bu)
        c = dy.cmult(i, u)
        expr = dy.cmult(o, dy.tanh(c))
        return expr
    assert len(tree.children) == 2, tree.children[0]
    e1 = self.expr_for_tree(tree.children[0])
    e2 = self.expr_for_tree(tree.children[1])
    Ui, Uo, Uu = [dy.parameter(u) for u in self.US]
    Uf1, Uf2 = [dy.parameter(u) for u in self.UFS]
    bi, bo, bu, bf = [dy.parameter(b) for b in self.BS]
    e = dy.concatenate([e1, e2])
    i = dy.logistic(Ui * e + bi)
    o = dy.logistic(Uo * e + bo)
    f1 = dy.logistic(Uf1 * e1 + bf)
    f2 = dy.logistic(Uf2 * e2 + bf)
    u = dy.tanh(Uu * e + bu)
    c = dy.cmult(i, u) + dy.cmult(f1, e1) + dy.cmult(f2, e2)
    h = dy.cmult(o, dy.tanh(c))
    expr = h
    return expr
def calc_score_of_history(words):
    # Lookup the embeddings and concatenate them
    emb = dy.concatenate([W_emb[x] for x in words])
    # Create the hidden layer
    h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
    # Calculate the score and return
    return dy.affine_transform([b_sm, W_sm, h])
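A usage sketch for the feed-forward n-gram scorer above; the sizes, names, and the trainer are illustrative, and the bare use of W_h, b_h, W_sm, b_sm in the function suggests a DyNet version (>= 2.1) with implicit parameter-to-expression conversion:

import dynet as dy

N, EMB_SIZE, HID_SIZE, nwords = 2, 64, 128, 10000  # assumed sizes

model = dy.ParameterCollection()
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))
W_h = model.add_parameters((HID_SIZE, N * EMB_SIZE))
b_h = model.add_parameters((HID_SIZE,))
W_sm = model.add_parameters((nwords, HID_SIZE))
b_sm = model.add_parameters((nwords,))
trainer = dy.SimpleSGDTrainer(model)

dy.renew_cg()
score = calc_score_of_history([1, 2])   # the N previous word ids
loss = dy.pickneglogsoftmax(score, 3)   # NLL of the gold next word (id 3)
loss.value()
loss.backward()
trainer.update()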
def decode_loss(self, src_encodings, tgt_seqs):
    """
    :param tgt_seqs: (tgt_heads, tgt_labels): list (length=batch_size) of (src_len)
    """
    # todo(NOTE): Sentences should start with an empty token (as the root of the dependency tree)!
    tgt_heads, tgt_labels = tgt_seqs

    src_len = len(tgt_heads[0])
    batch_size = len(tgt_heads)
    np_tgt_heads = np.array(tgt_heads).flatten()  # (src_len * batch_size)
    np_tgt_labels = np.array(tgt_labels).flatten()
    s_arc, s_label = self.cal_scores(src_encodings)  # (src_len, src_len, bs), ([(src_len, src_len, bs)])

    s_arc_value = s_arc.npvalue()
    s_arc_choice = np.argmax(s_arc_value, axis=0).transpose().flatten()  # (src_len * batch_size)

    s_pick_labels = [dy.pick_batch(dy.reshape(score, (src_len,), batch_size=src_len * batch_size), s_arc_choice)
                     for score in s_label]
    s_argmax_labels = dy.concatenate(s_pick_labels, d=0)  # (n_labels, src_len * batch_size)

    reshape_s_arc = dy.reshape(s_arc, (src_len,), batch_size=src_len * batch_size)
    arc_loss = dy.pickneglogsoftmax_batch(reshape_s_arc, np_tgt_heads)
    label_loss = dy.pickneglogsoftmax_batch(s_argmax_labels, np_tgt_labels)

    loss = dy.sum_batches(arc_loss + label_loss) / batch_size
    return loss
def embed(self, batch_dict):
    all_embeddings_lists = []
    for k, embedding in self.embeddings.items():
        all_embeddings_lists.append(embedding.encode(batch_dict[k]))
    embedded = dy.concatenate(all_embeddings_lists, d=1)
    return embedded
def build_graph(self, features):
    # extract word and tag ids
    word_ids = [self.vocab.word2id(word_feat) for word_feat in features[0:20]]
    tag_ids = [self.vocab.tag2id(tag_feat) for tag_feat in features[20:40]]
    dep_ids = [self.vocab.dep2id(tag_feat) for tag_feat in features[40:]]

    # extract word embeddings and tag embeddings from features
    word_embeds = [self.word_embedding[wid] for wid in word_ids]
    tag_embeds = [self.tag_embedding[tid] for tid in tag_ids]
    dep_embeds = [self.dep_embedding[tid] for tid in dep_ids]

    # concatenate all features (recall that '+' for lists appends the two lists)
    embedding_layer = dynet.concatenate(word_embeds + tag_embeds + dep_embeds)

    # calculate the hidden layers
    # (.expr() converts a parameter to a matrix expression in dynet; it's dynet-specific syntax)
    hidden1 = self.transfer(self.hidden_layer1 * embedding_layer + self.hidden_layer_bias1)
    dropout1 = dynet.dropout(hidden1, 0.1)
    hidden2 = self.transfer(self.hidden_layer2 * dropout1 + self.hidden_layer_bias2)
    # To implement the network without dropout, remove the dropout1 line and change hidden2 to:
    # hidden2 = self.transfer(self.hidden_layer2 * hidden1 + self.hidden_layer_bias2)

    # calculate the output layer and return it as a dynet vector (expression)
    output = self.output_layer * hidden2 + self.output_bias
    return output
def attend(input_vectors, state):
    global attention_w1
    global attention_w2
    global attention_v
    w1 = dy.parameter(attention_w1)
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)

    attention_weights = []
    w2dt = w2 * dy.concatenate(list(state.s()))
    for input_vector in input_vectors:
        attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
        attention_weights.append(attention_weight)
    attention_weights = dy.softmax(dy.concatenate(attention_weights))

    output_vectors = dy.esum([vector * attention_weight
                              for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
def conv(input_, _=None):
    dims = tuple([1] + list(input_.dim()[0]))
    input_ = dy.reshape(input_, dims)
    mots = []
    for conv in convs:
        mots.append(mot_pool(conv(input_)))
    return dy.concatenate(mots)
def embed(self, batch_dict):
    all_embeddings_lists = []
    for k, embedding in self.embeddings.items():
        all_embeddings_lists.append(embedding.encode(batch_dict[k], self.train))
    embedded = dy.concatenate(all_embeddings_lists, d=1)
    embed_list = [self.dropout(e) for e in embedded]
    return embed_list
def word_rep(w, cf_init, cb_init):
    pad_char = vc.w2i['<*>']
    char_ids = [pad_char] + [vc.w2i[c] for c in w] + [pad_char]
    char_embs = [CHARS_LOOKUP[cid] for cid in char_ids]
    fw_exps = cf_init.transduce(char_embs)
    bw_exps = cb_init.transduce(reversed(char_embs))
    return dy.concatenate([WORDS_LOOKUP[vw.w2i[w]], fw_exps[-1], bw_exps[-1]])
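word_rep combines a word embedding with the final states of a character BiLSTM. The globals it reads (vw, vc, WORDS_LOOKUP, CHARS_LOOKUP) are not shown in this collection; a sketch of a setup consistent with its use, with all sizes and vocabularies illustrative:

import dynet as dy

class Vocab(object):
    # tiny stand-in for the vw/vc vocabulary objects assumed by word_rep
    def __init__(self, items):
        self.w2i = {x: i for i, x in enumerate(items)}

vw = Vocab(["the", "cat"])                   # word vocabulary (illustrative)
vc = Vocab(sorted(set("thecat")) + ["<*>"])  # char vocabulary plus the pad symbol

model = dy.ParameterCollection()
WORDS_LOOKUP = model.add_lookup_parameters((len(vw.w2i), 64))  # word embeddings
CHARS_LOOKUP = model.add_lookup_parameters((len(vc.w2i), 20))  # char embeddings
cFwdRNN = dy.LSTMBuilder(1, 20, 32, model)
cBwdRNN = dy.LSTMBuilder(1, 20, 32, model)

dy.renew_cg()
w = word_rep("cat", cFwdRNN.initial_state(), cBwdRNN.initial_state())
print(w.dim())  # ((128,), 1): 64 word dims + 32 forward + 32 backward char dims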
def transduce(self, inputs, train):
    xs = inputs[:self.max_length]
    if not xs:
        return []
    for i in range(self.lstm_layers):
        for n, d in ("f", 1), ("b", -1):
            Wr, br, Wh = [self.params["%s%d%s" % (p, i, n)] for p in ("Wr", "br", "Wh")]
            hs_ = self.params["rnn%d%s" % (i, n)].initial_state().transduce(xs[::d])
            hs = [hs_[0]]
            for t in range(1, len(hs_)):
                r = dy.logistic(Wr * dy.concatenate([hs[t - 1], xs[t]]) + br)
                hs.append(dy.cmult(r, hs_[t]) + dy.cmult(1 - r, Wh * xs[t]))
            xs = hs
    if train:
        x = dy.dropout_dim(dy.concatenate(xs, 1), 1, self.dropout)
        xs = [dy.pick(x, i, 1) for i in range(len(xs))]
    return xs
def calc_scores(words):
    dy.renew_cg()
    word_embs = [dy.lookup(W_emb, x) for x in words]
    fwd_init = fwdLSTM.initial_state()
    fwd_embs = fwd_init.transduce(word_embs)
    bwd_init = bwdLSTM.initial_state()
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    return W_sm * dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) + b_sm
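A usage sketch for the BiLSTM classifier above, pairing it with a loss for training and an argmax for prediction; all sizes and the trainer are illustrative, and the bare parameter use again assumes DyNet's implicit parameter-to-expression conversion:

import dynet as dy
import numpy as np

nwords, ntags, EMB_SIZE, HID_SIZE = 10000, 5, 64, 128  # assumed sizes
model = dy.ParameterCollection()
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))
fwdLSTM = dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model)
bwdLSTM = dy.LSTMBuilder(1, EMB_SIZE, HID_SIZE, model)
W_sm = model.add_parameters((ntags, 2 * HID_SIZE))
b_sm = model.add_parameters((ntags,))
trainer = dy.AdamTrainer(model)

words, tag = [4, 17, 42], 2                      # toy training example
loss = dy.pickneglogsoftmax(calc_scores(words), tag)
loss.value()
loss.backward()
trainer.update()
predicted = int(np.argmax(calc_scores(words).npvalue()))  # inference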
def __call__(self, seq):
    """
    seq is a list of vectors (either character embeddings or bilstm outputs)
    """
    fw = self.lstmF.initial_state()
    bw = self.lstmB.initial_state()
    outf = fw.transduce(seq)
    outb = list(reversed(bw.transduce(reversed(seq))))
    return [dy.concatenate([f, b]) for f, b in zip(outf, outb)]
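DyNet also ships a built-in equivalent of this wrapper; for comparison, a minimal sketch with dy.BiRNNBuilder (the dimensions are illustrative, and hidden_dim is the total output size, so it must be even):

import dynet as dy

model = dy.ParameterCollection()
INPUT_DIM, HIDDEN_DIM = 64, 128  # assumed sizes

bilstm = dy.BiRNNBuilder(1, INPUT_DIM, HIDDEN_DIM, model, dy.LSTMBuilder)

dy.renew_cg()
seq = [dy.inputVector([0.0] * INPUT_DIM) for _ in range(5)]
outputs = bilstm.transduce(seq)  # 5 vectors, each [forward; backward] of size HIDDEN_DIM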
def calc_loss(words, labels, heads):
    dy.renew_cg()
    word_embs = [dy.lookup(W_emb, x) for x in words]
    fwd_init = fwdLSTM.initial_state()
    fwd_embs = fwd_init.transduce(word_embs)
    bwd_init = bwdLSTM.initial_state()
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    src_encodings = [dy.reshape(dy.concatenate([f, b]), (HID_SIZE * 2, 1))
                     for f, b in zip(fwd_embs, reversed(bwd_embs))]
    return biaffineParser.decode_loss(src_encodings, ([heads], [labels]))
def encode_sentence(enc_fwd_lstm, enc_bwd_lstm, sentence):
    sentence_rev = list(reversed(sentence))
    fwd_vectors = run_lstm(enc_fwd_lstm.initial_state(), sentence)
    bwd_vectors = run_lstm(enc_bwd_lstm.initial_state(), sentence_rev)
    bwd_vectors = list(reversed(bwd_vectors))
    vectors = [dy.concatenate(list(p)) for p in zip(fwd_vectors, bwd_vectors)]
    return vectors
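encode_sentence depends on a run_lstm helper that is not shown in this collection; a minimal sketch consistent with how it is called (feed one vector at a time, collect every output):

def run_lstm(init_state, input_vecs):
    # feed each input vector into the RNN, collecting the output at every step
    s = init_state
    out_vectors = []
    for vec in input_vecs:
        s = s.add_input(vec)
        out_vectors.append(s.output())
    return out_vectors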
def __call__(self, embed_in, src_len, train=False, **kwargs):
    """Input Shape: ((T, H), B). Output Shape: [((H,), B)] * T"""
    embed_in = list(embed_in)
    self.dropout(train)
    forward, forward_state = rnn_forward_with_state(self.lstm_forward, embed_in, src_len)
    if self.lstm_backward is not None:
        backward, backward_state = rnn_forward_with_state(self.lstm_backward, embed_in)
        output = [dy.concatenate([f, b]) for f, b in zip(forward, backward)]
        hidden = [dy.concatenate([f, b]) for f, b in zip(forward_state, backward_state)]
    else:
        output = forward
        hidden = forward_state
    return RNNEncoderOutput(
        output=[o + e for o, e in zip(output, embed_in)] if self.residual else output,
        hidden=hidden,
        src_mask=self.src_mask_fn(src_len, len(output)),
    )
def calc_score_of_history(words, dropout=0.0):
    # Lookup the embeddings and concatenate them
    emb = dy.concatenate([W_emb[x] for x in words])
    # Create the hidden layer
    h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
    # CHANGE 2: perform dropout
    if dropout != 0.0:
        h = dy.dropout(h, dropout)
    # Calculate the score and return
    return dy.affine_transform([b_sm, W_sm, h])
def calc_acc(words, labels, heads):
    dy.renew_cg()
    word_embs = [dy.lookup(W_emb, x) for x in words]
    fwd_init = fwdLSTM.initial_state()
    fwd_embs = fwd_init.transduce(word_embs)
    bwd_init = bwdLSTM.initial_state()
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    src_encodings = [dy.reshape(dy.concatenate([f, b]), (HID_SIZE * 2, 1))
                     for f, b in zip(fwd_embs, reversed(bwd_embs))]
    pred_heads, pred_labels = biaffineParser.decoding(src_encodings)
    return biaffineParser.cal_accuracy(pred_heads, pred_labels, heads, labels)
def _get_expr(self, sentence, i, j):  # pylint: disable=missing-docstring
    if sentence[i].headfov is None:
        sentence[i].headfov = self.hid_layer_foh.expr() * concatenate(
            [sentence[i].lstms[0], sentence[i].lstms[1]])
    if sentence[j].modfov is None:
        sentence[j].modfov = self.hid_layer_fom.expr() * concatenate(
            [sentence[j].lstms[0], sentence[j].lstms[1]])

    if self.hidden2_units > 0:
        output = self.out_layer.expr() * self.activation(
            self.hid2_bias.expr() +
            self.hid2_layer.expr() * self.activation(
                sentence[i].headfov + sentence[j].modfov + self.hid_bias.expr()))  # + self.outBias
    else:
        output = self.out_layer.expr() * self.activation(
            sentence[i].headfov + sentence[j].modfov + self.hid_bias.expr())  # + self.outBias
    return output
def _evaluate_label(self, sentence, i, j):  # pylint: disable=missing-docstring
    if sentence[i].rheadfov is None:
        sentence[i].rheadfov = self.rhid_layer_foh.expr() * concatenate(
            [sentence[i].lstms[0], sentence[i].lstms[1]])
    if sentence[j].rmodfov is None:
        sentence[j].rmodfov = self.rhid_layer_fom.expr() * concatenate(
            [sentence[j].lstms[0], sentence[j].lstms[1]])

    if self.hidden2_units > 0:
        output = self.rout_layer.expr() * self.activation(
            self.rhid2_bias.expr() +
            self.rhid2_layer.expr() * self.activation(
                sentence[i].rheadfov + sentence[j].rmodfov + self.rhid_bias.expr())) + self.rout_bias.expr()
    else:
        output = self.rout_layer.expr() * self.activation(
            sentence[i].rheadfov + sentence[j].rmodfov + self.rhid_bias.expr()) + self.rout_bias.expr()
    return output.value(), output
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(
        dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    loss = []
    for char in output:
        vector = dy.concatenate([attend(vectors, s), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
def generate(self, s_sentence, max_len=150):
    dy.renew_cg()

    W_y = dy.parameter(self.params["W_y"])
    b_y = dy.parameter(self.params["b_y"])
    s_lookup = self.params["s_lookup"]
    t_lookup = self.params["t_lookup"]

    s_sentence = [self.s_vocab[EOS]] + s_sentence + [self.s_vocab[EOS]]
    s_sentence_rev = list(reversed(s_sentence))

    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for cw_l2r in s_sentence:
        l2r_state = l2r_state.add_input(s_lookup[cw_l2r])
        l2r_contexts.append(l2r_state.output())
    for cw_r2l in s_sentence_rev:
        r2l_state = r2l_state.add_input(s_lookup[cw_r2l])
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    H_f = [dy.concatenate(list(p)) for p in zip(l2r_contexts, r2l_contexts)]
    H_f_mat = dy.concatenate_cols(H_f)
    W1_att = dy.parameter(self.params["W1_att"])
    w1dt = W1_att * H_f_mat

    c_t = dy.vecInput(2 * self.HIDDEN_DIM)
    embedding = t_lookup[self.t_vocab["<EOS>"]]
    dec_state = self.dec_builder.initial_state()
    t_sentence = []
    count_eos = 0
    for i in range(len(s_sentence) * 2):
        if count_eos == 2:
            break
        x_t = dy.concatenate([c_t, embedding])
        dec_state = dec_state.add_input(x_t)
        c_t = self.attend(H_f_mat, dec_state, w1dt, len(s_sentence), 1)
        probs = dy.softmax(W_y * dy.concatenate([c_t, dec_state.output()]) + b_y).vec_value()
        word = probs.index(max(probs))
        embedding = t_lookup[word]
        if self.t_id_lookup[word] == "<EOS>":
            count_eos += 1
            continue
        t_sentence.append(self.t_id_lookup[word])
    return " ".join(t_sentence)
def _predict(self, src, dst=None, num_predictions=-1, runtime=True):
    # input
    x_list = self._make_input(src, True)

    # encoder
    for fw, bw, dropout in zip(self.encoder_fw, self.encoder_bw, self.config.encoder_layer_dropouts):
        if runtime:
            fw.set_dropouts(0, 0)
            bw.set_dropouts(0, 0)
        else:
            fw.set_dropouts(0, dropout)
            bw.set_dropouts(0, dropout)
        fw_list = fw.initial_state().transduce(x_list)
        bw_list = list(reversed(bw.initial_state().transduce(reversed(x_list))))
        x_list = [dy.concatenate([fw_value, bw_value]) for fw_value, bw_value in zip(fw_list, bw_list)]

    # decoder
    predictions_left = num_predictions
    decoder = self.decoder.initial_state().add_input(
        dy.inputVector([0] * (self.config.encoder_layers[-1] * 2 + self.config.input_size)))
    last_dst_we = self.special_we[0]
    softmax_output = []
    aux_output = []
    pred_index = 0
    while predictions_left != 0:
        predictions_left -= 1
        input = dy.concatenate([self._attend(x_list, decoder), last_dst_we])
        decoder = decoder.add_input(input)
        softmax = dy.softmax(self.output_softmax_w.expr() * decoder.output() + self.output_softmax_b.expr())
        softmax_output.append(softmax)
        proj = dy.tanh(self.aux_layer_w.expr() * decoder.output() + self.aux_layer_b.expr())
        aux = self.aux_layer_proj_w.expr() * proj + self.aux_layer_proj_b.expr()
        aux_output.append(aux)
        if runtime:
            out_we_index = np.argmax(softmax.npvalue())
            if out_we_index == self.EOS:
                break
            last_dst_we = self.hol_we_dst[out_we_index]
        else:
            if pred_index < len(dst):
                last_word = dst[pred_index].word.decode('utf-8').lower()
                last_word_index = self.output_encodings.word2int['<UNK>']
                if last_word in self.output_encodings.word2int:
                    last_word_index = self.output_encodings.word2int[last_word]
                last_dst_we = self.hol_we_dst[last_word_index]
                pred_index += 1
        # failsafe
        if len(softmax_output) >= 2 * len(src):
            break
    return softmax_output, aux_output
def __call__(self, words_sequence, word2int, vocab, dataset="train"):
    lookup = self.params["lookup"]
    char_lstm = self.char_builder.initial_state()
    W_con = dy.parameter(self.params["W_con"])
    b_con = dy.parameter(self.params["b_con"])
    sequence = []
    if dataset == "train":
        for word, label in words_sequence:
            char_embed = []
            word_chars = list(word)
            # get char embeddings of the word
            for ch in word_chars:
                char_embed.append(lookup[word2int.get(ch)])
            # get the char LSTM encoding
            char_encoder = char_lstm.transduce(char_embed)[-1]
            if word not in vocab:
                # curr_word_embed = dy.esum(char_embed)
                curr_word_embed = lookup[word2int.get("<UNK>")]
            else:
                curr_word_embed = lookup[word2int.get(word)]
            char_word_concat = dy.concatenate([curr_word_embed, char_encoder])
            sequence.append(W_con * char_word_concat + b_con)
    else:
        for word in words_sequence:
            char_embed = []
            word_chars = list(word)
            # get char embeddings of the word
            for ch in word_chars:
                char_embed.append(lookup[word2int.get(ch)])
            # get the char LSTM encoding
            char_encoder = char_lstm.transduce(char_embed)[-1]
            if word not in vocab:
                # curr_word_embed = dy.esum(char_embed)
                curr_word_embed = lookup[word2int.get("<UNK>")]
            else:
                curr_word_embed = lookup[word2int.get(word)]
            char_word_concat = dy.concatenate([curr_word_embed, char_encoder])
            sequence.append(W_con * char_word_concat + b_con)

    # convert the parameters into Expressions (add them to the graph)
    W = dy.parameter(self.params["W"])
    b = dy.parameter(self.params["b"])
    fw_lstm1 = self.fw_builder1.initial_state()
    bw_lstm1 = self.bw_builder1.initial_state()
    fw_lstm2 = self.fw_builder2.initial_state()
    bw_lstm2 = self.bw_builder2.initial_state()

    # get output vectors of all time steps for the first bi-LSTM
    fw_lstm1_output = fw_lstm1.transduce(sequence)
    bw_lstm1_output = bw_lstm1.transduce(reversed(sequence))
    # concatenate the backward vector to the forward vector for each word
    bi1_output = [dy.concatenate([fw1, bw1])
                  for fw1, bw1 in zip(fw_lstm1_output, reversed(bw_lstm1_output))]

    # get output vectors of all time steps for the second bi-LSTM
    fw_lstm2_output = fw_lstm2.transduce(bi1_output)
    bw_lstm2_output = bw_lstm2.transduce(reversed(bi1_output))
    # concatenate the backward vector to the forward vector for each first-bi-LSTM vector
    bi2_output = [dy.concatenate([fw2, bw2])
                  for fw2, bw2 in zip(fw_lstm2_output, reversed(bw_lstm2_output))]

    # compute the network output
    net_output = [dy.softmax(W * out + b) for out in bi2_output]
    return net_output
def post_order_parse(self, words, oracle_actions, oracle_tokens, buffer, stack_top, action_top):
    stack = []
    stack_symbol = []
    output_actions = []
    output_tokens = []
    reduced = 0
    nt_allowed = 1
    ter_allowed = 1
    act_allowed = 1

    # recursively generate the tree until training data is exhausted
    while not (len(stack_symbol) == 1 and reduced != 0):
        valid_actions = []
        if len(stack_symbol) == 0:
            valid_actions += [_ACT]
        if len(stack_symbol) >= 1:
            if act_allowed:
                valid_actions += [_ACT]
            if ter_allowed:
                valid_actions += [_TER]
            if nt_allowed:
                valid_actions += [_NT]

        word_weights = None
        action = valid_actions[0]
        # we make predictions when the stack is not empty and _ACT is not the only valid action
        if len(stack_symbol) > 0:
            stack_embedding = stack[-1][0].output() if stack else self.initial_embedding()
            action_summary = action_top.output()
            word_weights = self.attention(stack_embedding, buffer)
            buffer_embedding = dy.esum([vector * attention_weight
                                        for vector, attention_weight in zip(buffer, word_weights)])
            parser_state = dy.concatenate([buffer_embedding, stack_embedding, action_summary])
            h = self.mlp_layer(parser_state)
            if len(valid_actions) > 0:
                log_probs = dy.log_softmax(self.act_proj_layer(h), valid_actions)
                assert action in valid_actions, "action not in scope"
                action = max(enumerate(log_probs.vec_value()), key=itemgetter(1))[0]

        if action == _NT:
            # generate a non-terminal
            log_probs_nt = dy.log_softmax(self.nt_proj_layer(h))
            nt = max(enumerate(log_probs_nt.vec_value()), key=itemgetter(1))[0]
            stack_state, label, _ = stack[-1] if stack else (stack_top, 'ROOT', stack_top)
            parent_rep = self.nt_input_layer(self.nt_lookup[nt])
            found_start = 0
            path_input = []
            while found_start != 1:
                top_symbol = stack_symbol.pop()
                if top_symbol != '|':
                    top = stack.pop()
                    top_raw_rep, top_label, top_rep = top[2], top[1], top[0]
                    path_input.append(top_raw_rep)
                else:
                    found_start = 1
            composed_rep = self.subtree_input_layer(dy.concatenate([dy.average(path_input), parent_rep]))
            stack_state = stack_state.add_input(composed_rep)
            stack.append((stack_state, 'c', composed_rep))
            stack_symbol.append('c')
            reduced = 1
            output_actions.append(self.act_vocab.token(action))
            output_tokens.append(self.nt_vocab.token(nt))
        elif action == _TER:
            # generate a terminal
            log_probs_ter = dy.log_softmax(self.ter_proj_layer(h))
            ter = max(enumerate(log_probs_ter.vec_value()), key=itemgetter(1))[0]
            stack_state, label, _ = stack[-1] if stack else (stack_top, 'ROOT', stack_top)
            ter_embedding = self.ter_input_layer(self.ter_lookup[ter])
            stack_state = stack_state.add_input(ter_embedding)
            stack.append((stack_state, 'c', ter_embedding))
            stack_symbol.append('c')
            output_actions.append(self.act_vocab.token(action))
            output_tokens.append(self.ter_vocab.token(ter))
        else:
            # mark a handle
            stack_symbol.append('|')
            output_actions.append(self.act_vocab.token(action))

        action_embedding = self.act_input_layer(self.act_lookup[action])
        action_top = action_top.add_input(action_embedding)

        count_c = stack_symbol.count('c')
        count_h = stack_symbol.count('|')
        nt_allowed = 1
        if count_h == 0 or count_c == 0 or stack_symbol[-1] != 'c':
            nt_allowed = 0
        act_allowed = 1
        if count_c >= 10 or count_h > 10:
            act_allowed = 0
        ter_allowed = 1
        if count_c >= 10:
            ter_allowed = 0
    return output_actions, output_tokens
def transduce(
    self,
    input_: str,
    encoded_input: List[int],
    target: Optional[str] = None,
    rollin: Optional[float] = None,
    external_cg: bool = True,
):
    """Runs the transducer for dynamic-oracle training and greedy decoding.

    Args:
        input_: Input string.
        encoded_input: List of integer character codes.
        target: Target string during training, `None` during prediction.
        rollin: The probability with which an action sampled from the model
            is executed. Used during training.
        external_cg: Whether an external computation graph is defined."""
    if not external_cg:
        dy.renew_cg()

    is_training = bool(target)
    input_emb = self.input_embedding(encoded_input, is_training)
    bidirectional_emb = self.bidirectional_encoding(input_emb)[1:]  # drop BEGIN_WORD
    input_length = len(bidirectional_emb)
    decoder = self.dec.initial_state()

    alignment = 0
    action_history: List[int] = [BEGIN_WORD]
    output: List[str] = []
    losses: List[dy.Expression] = []
    log_p = 0.0

    while len(action_history) <= MAX_ACTION_SEQ_LEN:
        length_encoder_suffix = input_length - alignment
        valid_actions = self.compute_valid_actions(length_encoder_suffix)
        input_char_embedding = bidirectional_emb[alignment]
        previous_action_embedding = self.act_lookup[action_history[-1]]
        decoder_input = dy.concatenate([input_char_embedding, previous_action_embedding])
        decoder = decoder.add_input(decoder_input)
        decoder_output = decoder.output()
        logits = self.pW * decoder_output + self.pb
        log_probs = dy.log_softmax(logits, valid_actions)
        log_probs_np = log_probs.npvalue()
        if target is None:
            # argmax decoding
            action = np.argmax(log_probs_np)
        else:
            # training with a dynamic oracle
            # 1. ACTIONS TO MAXIMIZE
            optim_actions = self.expert_rollout(input_, target, alignment, output)
            loss = self.log_sum_softmax_loss(optim_actions, logits, valid_actions)
            # 2. ACTION SPACE EXPLORATION: NEXT ACTION
            if np.random.rand() <= rollin:
                # action is picked by sampling
                action = self.sample(log_probs_np)
            else:
                # action is picked from optim_actions; reinforce model beliefs by
                # picking the highest-probability action consistent with the oracle
                action = optim_actions[int(np.argmax([log_probs_np[a] for a in optim_actions]))]
            losses.append(loss)

        log_p += log_probs_np[action]
        action_history.append(action)

        # execute the action to update the transducer state
        action = self.vocab.decode_action(action)
        if isinstance(action, ConditionalCopy):
            char_ = input_[alignment]
            alignment += 1
            output.append(char_)
        elif isinstance(action, ConditionalDel):
            alignment += 1
        elif isinstance(action, ConditionalIns):
            output.append(action.new)
        elif isinstance(action, ConditionalSub):
            alignment += 1
            output.append(action.new)
        elif isinstance(action, EndOfSequence):
            break
        else:
            raise ValueError(f"Unknown action: {action}.")

    return Output(action_history, "".join(output), log_p, losses)
def build_tagging_graph1(words):
    # Create a new computation graph: clears the current one and starts a new one
    dy.renew_cg()
    # parameters -> expressions: parameters are the things that get trained, and
    # they are added to the computation graph when used

    # initialize the RNNs
    f_init = fwdRNN.initial_state()   # forward
    b_init = bwdRNN.initial_state()   # backward
    second_forward_initialize = secondfwdRNN.initial_state()
    second_backward_initialize = secondbwdRNN.initial_state()

    # get the word vectors; word_rep(...) returns a 128-dim vector expression for each word
    wembs = []
    # depending on the model variant, call the right function to get the word representation
    if option == 'a':
        for i, w in enumerate(words):
            # convert the word to an embedding vector
            wembs.append(word_rep_1(w))
    if option == 'c':
        for i, w in enumerate(words):
            word, pre, suff = word_rep_3(w)
            wembs.append(word + pre + suff)

    # feed the word vectors into the first biLSTM;
    # transduce takes a sequence of Expressions and returns a sequence of Expressions
    fw_exps = f_init.transduce(wembs)            # forward
    bw_exps = b_init.transduce(reversed(wembs))  # backward

    # first biLSTM states: the backward outputs must be reversed back into
    # sentence order before aligning them with the forward outputs
    bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]

    # second biLSTM layer, input: b1, b2, ..., bn; output: b'1, b'2, b'3, ...
    forward_y_tag = second_forward_initialize.transduce(bi_exps)
    backward_y_tag = second_backward_initialize.transduce(reversed(bi_exps))
    # concatenate the results, again reversing the backward outputs for alignment
    b_tag = [dy.concatenate([y1_tag, y2_tag])
             for y1_tag, y2_tag in zip(forward_y_tag, reversed(backward_y_tag))]

    # feed each biLSTM state to an MLP
    H = dy.parameter(pH)
    O = dy.parameter(pO)
    exps = []
    for x in b_tag:
        r_t = O * (dy.tanh(H * x))
        exps.append(r_t)
    return exps  # results of the model
def span_train(self, words, oracle_actions, oracle_tokens, options, buffer, stack_top, action_top):
    stack = []
    losses = []
    reduced = 0
    nt_allowed = 1
    found_root = 0
    _root = self.nt_vocab[oracle_tokens[-1]]

    # recursively generate the tree until training data is exhausted
    while not found_root:
        valid_actions = []
        if len(stack) == 0:
            valid_actions += [_TER]
        if len(stack) >= 1:
            valid_actions += [_TER]
        if len(stack) >= 2:
            valid_actions += [_ACT]
        if len(stack) >= 1:
            valid_actions += [_NT]

        action = self.act_vocab[oracle_actions.pop(0)]
        # we make predictions when the stack is not empty and _ACT is not the only valid action
        stack_embedding = stack[-1][0].output() if stack else self.initial_embedding()
        action_summary = action_top.output() if len(stack) > 0 else self.initial_embedding()
        word_weights = self.attention(stack_embedding, buffer)
        buffer_embedding = dy.esum([vector * attention_weight
                                    for vector, attention_weight in zip(buffer, word_weights)])

        parser_state = dy.concatenate([buffer_embedding, stack_embedding, action_summary])
        h = self.mlp_layer(parser_state)
        if options.dropout > 0:
            h = dy.dropout(h, options.dropout)
        if len(valid_actions) > 0:
            log_probs = dy.log_softmax(self.act_proj_layer(h), valid_actions)
            assert action in valid_actions, "action not in scope"
            losses.append(-dy.pick(log_probs, action))

        if action == _NT:
            # label a span
            nt = self.nt_vocab[oracle_tokens.pop(0)]
            log_probs_nt = dy.log_softmax(self.nt_proj_layer(h))
            losses.append(-dy.pick(log_probs_nt, nt))
            if nt == _root:
                found_root = 1
            stack_state, label, _ = stack[-1] if stack else (stack_top, 'ROOT', stack_top)
            parent_rep = self.nt_input_layer(self.nt_lookup[nt])
            top = stack.pop()
            top_raw_rep, top_label, top_rep = top[2], top[1], top[0]
            composed_rep = self.subtree_input_layer(dy.concatenate([top_raw_rep, parent_rep]))
            stack_state = stack_state.add_input(composed_rep)
            stack.append((stack_state, 'p', composed_rep))
            reduced = 1
        elif action == _TER:
            # generate a terminal
            ter = self.ter_vocab[oracle_tokens.pop(0)]
            log_probs_ter = dy.log_softmax(self.ter_proj_layer(h))
            losses.append(-dy.pick(log_probs_ter, ter))
            stack_state, label, _ = stack[-1] if stack else (stack_top, 'ROOT', stack_top)
            ter_embedding = self.ter_input_layer(self.ter_lookup[ter])
            stack_state = stack_state.add_input(ter_embedding)
            stack.append((stack_state, 'c', ter_embedding))
        else:
            # extend a span
            assert len(stack) >= 2
            top2 = stack.pop()
            top1 = stack.pop()
            top2_raw_rep = top2[2]
            top1_raw_rep = top1[2]
            span_rep = self.span_input_layer(dy.concatenate([top2_raw_rep, top1_raw_rep]))
            # continue the stack RNN from the remaining stack top (the original
            # reused a stale stack_state here, which looks like a bug)
            stack_state, _, _ = stack[-1] if stack else (stack_top, 'ROOT', stack_top)
            stack_state = stack_state.add_input(span_rep)
            stack.append((stack_state, 'c', span_rep))

        action_embedding = self.act_input_layer(self.act_lookup[action])
        action_top = action_top.add_input(action_embedding)
    return dy.esum(losses)
def _predict_arc(self, seq, runtime=True):
    x_list, encoder_states_list = self._make_input(seq, runtime)

    # BDLSTM
    rnn_outputs = [x_list]
    for fw, bw, dropout in zip(self.bdrnn_fw, self.bdrnn_bw, self.config.layer_dropouts):
        if runtime:
            fw.set_dropouts(0, 0)
            bw.set_dropouts(0, 0)
        else:
            fw.set_dropouts(dropout, dropout)
            bw.set_dropouts(dropout, dropout)
        fw_list = fw.initial_state().transduce(x_list)
        bw_list = list(reversed(bw.initial_state().transduce(reversed(x_list))))
        x_list = [dy.concatenate([x_fw, x_bw]) for x_fw, x_bw in zip(fw_list, bw_list)]
        rnn_outputs.append(x_list)

    # projections
    arc_projections = [
        [dy.tanh(self.proj_arc_w_dep.expr(update=True) * x + self.proj_arc_b_dep.expr(update=True)),
         dy.tanh(self.proj_arc_w_head.expr(update=True) * x + self.proj_arc_b_head.expr(update=True))]
        for x in rnn_outputs[-1]]
    label_projections = [
        [dy.tanh(self.proj_label_w_dep.expr(update=True) * x + self.proj_label_b_dep.expr(update=True)),
         dy.tanh(self.proj_label_w_head.expr(update=True) * x + self.proj_label_b_head.expr(update=True))]
        for x in rnn_outputs[-1]]
    if not runtime:
        arc_projections = [[dy.dropout(x1, self.config.presoftmax_mlp_dropout),
                            dy.dropout(x2, self.config.presoftmax_mlp_dropout)]
                           for x1, x2 in arc_projections]
        label_projections = [[dy.dropout(x1, self.config.presoftmax_mlp_dropout),
                              dy.dropout(x2, self.config.presoftmax_mlp_dropout)]
                             for x1, x2 in label_projections]

    if not self.config.predict_morphology:
        aux_arc_projections = [
            [dy.tanh(self.aux_proj_arc_w_dep.expr(update=True) * x + self.aux_proj_arc_b_dep.expr(update=True)),
             dy.tanh(self.aux_proj_arc_w_head.expr(update=True) * x + self.aux_proj_arc_b_head.expr(update=True))]
            for x in rnn_outputs[self.config.aux_softmax_layer]]
        if not runtime:
            aux_arc_projections = [[dy.dropout(x1, self.config.presoftmax_mlp_dropout),
                                    dy.dropout(x2, self.config.presoftmax_mlp_dropout)]
                                   for x1, x2 in aux_arc_projections]
    else:
        drp = self.config.presoftmax_mlp_dropout
        if runtime:
            drp = 0
        upos_softmax = [
            dy.softmax(self.upos_softmax_w.expr(update=True) * dy.dropout(
                dy.tanh(self.upos_proj_w.expr(update=True) * x + self.upos_proj_b.expr(update=True)), drp) +
                self.upos_softmax_b.expr(update=True))
            for x in rnn_outputs[self.config.aux_softmax_layer]]
        xpos_softmax = [
            dy.softmax(self.xpos_softmax_w.expr(update=True) * dy.dropout(
                dy.tanh(self.xpos_proj_w.expr(update=True) * x + self.xpos_proj_b.expr(update=True)), drp) +
                self.xpos_softmax_b.expr(update=True))
            for x in rnn_outputs[self.config.aux_softmax_layer]]
        attrs_softmax = [
            dy.softmax(self.attrs_softmax_w.expr(update=True) * dy.dropout(
                dy.tanh(self.attrs_proj_w.expr(update=True) * x + self.attrs_proj_b.expr(update=True)), drp) +
                self.attrs_softmax_b.expr(update=True))
            for x in rnn_outputs[self.config.aux_softmax_layer]]
        morphology_softmax = [[upos, xpos, attrs]
                              for upos, xpos, attrs in zip(upos_softmax, xpos_softmax, attrs_softmax)]

    n = len(seq) + 1
    arc_matrix = [[None] * n for _ in range(n)]
    if not self.config.predict_morphology:
        aux_arc_matrix = [[None] * n for _ in range(n)]
    for iDst in range(n):
        term_bias = self.link_b.expr(update=True) * arc_projections[iDst][1]
        term_weight = self.link_w.expr(update=True) * arc_projections[iDst][1]
        if not self.config.predict_morphology:
            aux_term_bias = self.aux_link_b.expr(update=True) * aux_arc_projections[iDst][1]
            aux_term_weight = self.aux_link_w.expr(update=True) * aux_arc_projections[iDst][1]
        for iSrc in range(n):
            if iSrc != iDst:
                attention = dy.reshape(term_weight, (1, self.config.arc_proj_size)) * \
                    arc_projections[iSrc][0] + term_bias
                arc_matrix[iSrc][iDst] = attention
                if not self.config.predict_morphology:
                    aux_attention = dy.reshape(aux_term_weight, (1, self.config.arc_proj_size)) * \
                        aux_arc_projections[iSrc][0] + aux_term_bias
                    aux_arc_matrix[iSrc][iDst] = aux_attention

    # compute softmax for arcs
    a_m = [[None] * n for _ in range(n)]
    if not self.config.predict_morphology:
        aux_a_m = [[None] * n for _ in range(n)]
    for iSrc in range(n):
        s_max = []
        if not self.config.predict_morphology:
            aux_s_max = []
        for iDst in range(n):
            if iSrc != iDst:
                s_max.append(arc_matrix[iSrc][iDst])
                if not self.config.predict_morphology:
                    aux_s_max.append(aux_arc_matrix[iSrc][iDst])
        s_max = dy.softmax(dy.concatenate(s_max))
        if not self.config.predict_morphology:
            aux_s_max = dy.softmax(dy.concatenate(aux_s_max))
        ofs = 0
        for iDst in range(n):
            if iSrc == iDst:
                ofs = -1
            else:
                a_m[iSrc][iDst] = s_max[iDst + ofs]
                if not self.config.predict_morphology:
                    aux_a_m[iSrc][iDst] = aux_s_max[iDst + ofs]

    if not self.config.predict_morphology:
        return a_m, aux_a_m, label_projections, None
    else:
        return a_m, None, label_projections, morphology_softmax[1:-1]
def calculate_batch_loss(self, batch):
    dy.renew_cg()

    W_y = dy.parameter(self.params["W_y"])
    b_y = dy.parameter(self.params["b_y"])
    s_lookup = self.params["s_lookup"]
    t_lookup = self.params["t_lookup"]

    s_batch = [x[0] for x in batch]
    t_batch = [x[1] for x in batch]
    wids = []
    for i in range(len(s_batch[0])):
        wids.append([sent[i] for sent in s_batch])
    wids_rev = list(reversed(wids))

    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for wid in wids:
        l2r_state = l2r_state.add_input(dy.lookup_batch(s_lookup, wid))
        l2r_contexts.append(l2r_state.output())
    for wid in wids_rev:
        r2l_state = r2l_state.add_input(dy.lookup_batch(s_lookup, wid))
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    losses = []
    H_f = [dy.concatenate(list(p)) for p in zip(l2r_contexts, r2l_contexts)]
    H_f_mat = dy.concatenate_cols(H_f)
    W1_att = dy.parameter(self.params["W1_att"])
    w1dt = W1_att * H_f_mat

    t_wids = []
    masks = []
    for i in range(len(t_batch[0])):
        t_wids.append([(sent[i] if len(sent) > i else self.t_vocab[EOS]) for sent in t_batch])
        mask = [(1 if len(sent) > i else 0) for sent in t_batch]
        masks.append(mask)

    c_t = dy.vecInput(2 * self.HIDDEN_DIM)
    words = [self.t_vocab[EOS]] * len(t_batch)
    embedding = dy.lookup_batch(t_lookup, words)
    dec_state = self.dec_builder.initial_state()
    for t_wid, mask in zip(t_wids, masks):
        x_t = dy.concatenate([c_t, embedding])
        dec_state = dec_state.add_input(x_t)
        c_t = self.attend(H_f_mat, dec_state, w1dt, len(s_batch[0]), len(wids[0]))
        probs = dy.affine_transform([b_y, W_y, dy.concatenate([c_t, dec_state.output()])])
        loss = dy.pickneglogsoftmax_batch(probs, t_wid)
        if mask[-1] != 1:
            # mask out the losses of already-finished sequences
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(t_batch))
            loss = loss * mask_expr
        losses.append(loss)
        embedding = dy.lookup_batch(t_lookup, t_wid)

    loss = dy.sum_batches(dy.esum(losses))  # / len(wids[0])
    return loss
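A small, self-contained illustration of the masking trick used above: the per-batch-element losses are multiplied by a 0/1 mask reshaped to one scalar per batch element, so finished sequences stop contributing. The numbers are made up:

import dynet as dy

dy.renew_cg()
batch_size = 3
# pretend per-element losses at one time step; the third sequence has already ended
loss = dy.inputTensor([0.7, 1.2, 2.5], batched=True)             # 3 scalar losses
mask_expr = dy.reshape(dy.inputVector([1, 1, 0]), (1,), batch_size)
masked = loss * mask_expr                                        # zero out finished sequences
print(dy.sum_batches(masked).value())                            # 0.7 + 1.2 = 1.9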
def _make_input(self, seq, runtime):
    x_list = []
    encoder_states_list = [None]

    # add the root
    if not self.config.use_morphology:
        x_list.append(self.unknown_word_embedding[1])
    elif not self.config.use_lexical:
        x_list.append(self.pad_tag_embedding[1])
    else:  # both lexical and morphology are used
        x_list.append(dy.concatenate([self.unknown_word_embedding[1], self.pad_tag_embedding[1]]))

    for entry in seq:
        word = entry.word
        if self.config.use_lexical:
            # prepare lexical embeddings
            char_emb, encoder_states = self.character_network.compute_embeddings(word, runtime=runtime)
            encoder_states_list.append(encoder_states)
            if sys.version_info[0] == 2:
                word_emb, found = self.embeddings.get_word_embeddings(word.decode('utf-8'))
            else:
                word_emb, found = self.embeddings.get_word_embeddings(word)
            if not found:
                word_emb = self.unknown_word_embedding[0]
            else:
                word_emb = dy.tanh(self.input_proj_w_word.expr(update=True) * dy.inputVector(word_emb) +
                                   self.input_proj_b_word.expr(update=True))
            if sys.version_info[0] == 2:
                word = word.decode('utf-8').lower()
            else:
                word = word.lower()
            if word in self.encodings.word2int:
                holistic_emb = self.holistic_embeddings[self.encodings.word2int[word]]
            else:
                holistic_emb = self.holistic_embeddings[self.encodings.word2int['<UNK>']]
            # dropout lexical embeddings
            if runtime:
                w_emb = word_emb + char_emb + holistic_emb
            else:
                p1 = random.random()
                p2 = random.random()
                p3 = random.random()
                m1 = 1
                m2 = 1
                m3 = 1
                if p1 < self.config.input_dropout_prob:
                    m1 = 0
                if p2 < self.config.input_dropout_prob:
                    m2 = 0
                if p3 < self.config.input_dropout_prob:
                    m3 = 0
                scale = 1.0
                if m1 + m2 + m3 > 0:
                    scale = float(3) / (m1 + m2 + m3)
                m1 = dy.scalarInput(m1)
                m2 = dy.scalarInput(m2)
                m3 = dy.scalarInput(m3)
                scale = dy.scalarInput(scale)
                w_emb = (word_emb * m1 + char_emb * m2 + holistic_emb * m3) * scale

        if self.config.use_morphology:
            if entry.upos in self.encodings.upos2int:
                upos_emb = self.upos_lookup[self.encodings.upos2int[entry.upos]]
            else:
                upos_emb = dy.inputVector([0] * self.config.input_embeddings_size)
            if entry.xpos in self.encodings.xpos2int:
                xpos_emb = self.xpos_lookup[self.encodings.xpos2int[entry.xpos]]
            else:
                xpos_emb = dy.inputVector([0] * self.config.input_embeddings_size)
            if entry.attrs in self.encodings.attrs2int:
                attrs_emb = self.attrs_lookup[self.encodings.attrs2int[entry.attrs]]
            else:
                attrs_emb = dy.inputVector([0] * self.config.input_embeddings_size)
            # overwrite all dropouts; it will later be handled by "same-mask"
            t_emb = upos_emb + xpos_emb + attrs_emb
            # w_emb = word_emb + char_emb + holistic_emb

        # compose embeddings, if necessary
        if self.config.use_lexical and self.config.use_morphology:
            if not runtime:
                p1 = random.random()
                p2 = random.random()
                m1 = 1
                m2 = 1
                if p1 < self.config.input_dropout_prob:
                    m1 = 0
                if p2 < self.config.input_dropout_prob:
                    m2 = 0
                if m1 + m2 > 0:
                    scale = float(2.0) / (m1 + m2)
                else:
                    scale = 1.0
                scale = dy.scalarInput(scale)
                m1 = dy.scalarInput(m1)
                m2 = dy.scalarInput(m2)
                x_list.append(dy.concatenate([w_emb * m1 * scale, t_emb * m2 * scale]))
            else:
                x_list.append(dy.concatenate([w_emb, t_emb]))
        elif self.config.use_lexical:  # just use_lexical == True
            x_list.append(w_emb)
        else:  # just use_morphology == True
            x_list.append(t_emb)

    # close the sequence
    if not self.config.use_morphology:
        x_list.append(self.unknown_word_embedding[2])
    elif not self.config.use_lexical:
        x_list.append(self.pad_tag_embedding[2])
    else:
        x_list.append(dy.concatenate([self.unknown_word_embedding[2], self.pad_tag_embedding[2]]))
    encoder_states_list.append(None)
    return x_list, encoder_states_list
def _predict(self, seq, runtime=True):
    softmax_list = []
    aux_softmax_list = []
    x_list = []
    for entry in seq:
        word = entry.word
        char_emb, _ = self.character_network.compute_embeddings(word, runtime=runtime)
        word_emb, found = self.embeddings.get_word_embeddings(word.decode('utf-8'))
        if not found:
            word_emb = self.unknown_word_embedding[0]
        else:
            word_emb = dy.inputVector(word_emb)
        holistic_word = word.decode('utf-8').lower()
        if holistic_word in self.encodings.word2int:
            hol_emb = self.holistic_word_embedding[self.encodings.word2int[holistic_word]]
        else:
            hol_emb = self.holistic_word_embedding[self.encodings.word2int['<UNK>']]
        proj_emb = self.emb_proj_w.expr() * word_emb
        proj_hol = self.hol_proj_w.expr() * hol_emb
        proj_char = self.char_proj_w.expr() * char_emb
        # x_list.append(dy.tanh(proj_char + proj_emb + proj_hol))
        if runtime:
            x_list.append(dy.tanh(proj_char + proj_emb + proj_hol))
        else:
            p1 = random.random()
            p2 = random.random()
            p3 = random.random()
            m1 = 1
            m2 = 1
            m3 = 1
            if p1 < self.config.input_dropout_prob:
                m1 = 0
            if p2 < self.config.input_dropout_prob:
                m2 = 0
            if p3 < self.config.input_dropout_prob:
                m3 = 0
            scale = 1.0
            if m1 + m2 + m3 > 0:
                scale = float(3) / (m1 + m2 + m3)
            m1 = dy.scalarInput(m1)
            m2 = dy.scalarInput(m2)
            m3 = dy.scalarInput(m3)
            scale = dy.scalarInput(scale)
            x_list.append(dy.tanh((proj_char * m1 + proj_emb * m2 + proj_hol * m3) * scale))

    # BDLSTM
    rnn_outputs = []
    for fw, bw, dropout in zip(self.bdrnn_fw, self.bdrnn_bw, self.config.layer_dropouts):
        if not runtime:
            fw.set_dropouts(0, dropout)
            bw.set_dropouts(0, dropout)
        else:
            fw.set_dropouts(0, 0)
            bw.set_dropouts(0, 0)
        fw_list = fw.initial_state().transduce(x_list)
        bw_list = list(reversed(bw.initial_state().transduce(reversed(x_list))))
        x_list = [dy.concatenate([x_fw, x_bw]) for x_fw, x_bw in zip(fw_list, bw_list)]
        # if runtime:
        #     x_out = x_list
        # else:
        #     x_out = [dy.dropout(x, dropout) for x in x_list]
        rnn_outputs.append(x_list)

    # SOFTMAX
    mlp_output = []
    for x in rnn_outputs[-1]:
        pre_softmax = []
        for iMLP in xrange(3):
            mlp_w = self.mlps[iMLP][0]
            mlp_b = self.mlps[iMLP][1]
            inp = x
            for w, b, drop in zip(mlp_w, mlp_b, self.config.presoftmax_mlp_dropouts):
                inp = dy.tanh(w.expr() * inp + b.expr())
                if not runtime:
                    inp = dy.dropout(inp, drop)
            pre_softmax.append(inp)
        mlp_output.append(pre_softmax)

    for softmax_inp, aux_softmax_inp in zip(mlp_output, rnn_outputs[self.config.aux_softmax_layer - 1]):
        softmax_list.append([
            dy.softmax(self.softmax_upos_w.expr() * softmax_inp[0] + self.softmax_upos_b.expr()),
            dy.softmax(self.softmax_xpos_w.expr() * softmax_inp[1] + self.softmax_xpos_b.expr()),
            dy.softmax(self.softmax_attrs_w.expr() * softmax_inp[2] + self.softmax_attrs_b.expr())])
        aux_softmax_list.append([
            dy.softmax(self.aux_softmax_upos_w.expr() * aux_softmax_inp + self.aux_softmax_upos_b.expr()),
            dy.softmax(self.aux_softmax_xpos_w.expr() * aux_softmax_inp + self.aux_softmax_xpos_b.expr()),
            dy.softmax(self.aux_softmax_attrs_w.expr() * aux_softmax_inp + self.aux_softmax_attrs_b.expr())])

    return softmax_list, aux_softmax_list
def _feed_input(dst_embed_i, attn_output_i):
    return dy.concatenate([dst_embed_i, attn_output_i])
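_feed_input is the input-feeding step of Luong-style decoders: the previous attention output is concatenated to the current target embedding before entering the decoder. A toy, self-contained check (all sizes and values are arbitrary):

import dynet as dy

dy.renew_cg()
EMB, ATTN = 4, 6                              # illustrative sizes
dst_embed_i = dy.inputVector([0.1] * EMB)     # stand-in target embedding
attn_output_i = dy.inputVector([0.2] * ATTN)  # stand-in attention output
x_t = _feed_input(dst_embed_i, attn_output_i)
assert x_t.dim()[0] == (EMB + ATTN,)          # the decoder input grows accordingly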
def pre_order_train(self, words, oracle_actions, oracle_tokens, options, buffer, stack_top, action_top):
    stack = []
    losses = []
    reducable = 0
    reduced = 0

    # recursively generate the tree until training data is exhausted
    while not (len(stack) == 1 and reduced != 0):
        valid_actions = []
        if len(stack) == 0:
            valid_actions += [_NT]
        if len(stack) >= 1:
            valid_actions += [_TER, _NT]
        if len(stack) >= 2 and reducable != 0:
            valid_actions += [_ACT]

        action = self.act_vocab[oracle_actions.pop(0)]
        word_weights = None
        # we make predictions when the stack is not empty and _ACT is not the only valid action
        if len(stack) > 0 and valid_actions[0] != _ACT:
            stack_embedding = stack[-1][0].output()
            action_summary = action_top.output()
            word_weights = self.attention(stack_embedding, buffer)
            buffer_embedding = dy.esum([vector * attention_weight
                                        for vector, attention_weight in zip(buffer, word_weights)])
            for i in range(len(stack)):
                if stack[len(stack) - 1 - i][1] == 'p':
                    parent_embedding = stack[len(stack) - 1 - i][2]
                    break
            parser_state = dy.concatenate([buffer_embedding, stack_embedding,
                                           parent_embedding, action_summary])
            h = self.mlp_layer(parser_state)
            if options.dropout > 0:
                h = dy.dropout(h, options.dropout)
            if len(valid_actions) > 0:
                log_probs = dy.log_softmax(self.act_proj_layer(h), valid_actions)
                assert action in valid_actions, "action not in scope"
                losses.append(-dy.pick(log_probs, action))

        if action == _NT:
            # generate a non-terminal
            nt = self.nt_vocab[oracle_tokens.pop(0)]
            # no need to predict the ROOT (the ROOT is assumed fixed)
            if word_weights is not None:
                log_probs_nt = dy.log_softmax(self.nt_proj_layer(h))
                losses.append(-dy.pick(log_probs_nt, nt))
            stack_state, label, _ = stack[-1] if stack else (stack_top, 'ROOT', stack_top)
            nt_embedding = self.nt_input_layer(self.nt_lookup[nt])
            stack_state = stack_state.add_input(nt_embedding)
            stack.append((stack_state, 'p', nt_embedding))
        elif action == _TER:
            # generate a terminal
            ter = self.ter_vocab[oracle_tokens.pop(0)]
            log_probs_ter = dy.log_softmax(self.ter_proj_layer(h))
            losses.append(-dy.pick(log_probs_ter, ter))
            stack_state, label, _ = stack[-1] if stack else (stack_top, 'ROOT', stack_top)
            ter_embedding = self.ter_input_layer(self.ter_lookup[ter])
            stack_state = stack_state.add_input(ter_embedding)
            stack.append((stack_state, 'c', ter_embedding))
        else:
            # subtree completion
            found_p = 0
            path_input = []
            # keep popping until the parent is found
            while found_p != 1:
                top = stack.pop()
                top_raw_rep, top_label, top_rep = top[2], top[1], top[0]
                path_input.append(top_raw_rep)
                if top_label == 'p':
                    found_p = 1
            parent_rep = path_input.pop()
            composed_rep = self.subtree_input_layer(dy.concatenate([dy.average(path_input), parent_rep]))
            stack_state, _, _ = stack[-1] if stack else (stack_top, 'ROOT', stack_top)
            stack_state = stack_state.add_input(composed_rep)
            stack.append((stack_state, 'c', composed_rep))
            reduced = 1

        action_embedding = self.act_input_layer(self.act_lookup[action])
        action_top = action_top.add_input(action_embedding)
        reducable = 1
        # cannot reduce right after an NT
        if stack[-1][1] == 'p':
            reducable = 0
    return dy.esum(losses)
def static_train(self,
                 train_treebank,
                 validation_treebank,
                 lr=0.001,
                 hidden_dropout=0.01,
                 batch_size=64,
                 max_epochs=200,
                 max_lexicon_size=9998,
                 glove_file=None):
    """
    Locally trains a model with a static oracle and a multi-task standard
    feedforward NN.

    @param train_treebank      : a list of dependency trees
    @param validation_treebank : a list of dependency trees
    @param lr                  : learning rate
    @param hidden_dropout      : dropout on the hidden layer
    @param batch_size          : size of the mini-batches
    @param max_epochs          : max number of epochs
    @param max_lexicon_size    : max number of entries in the lexicon
    @param glove_file          : file where to find pre-trained word embeddings
    """
    print("Encoding dataset from %d trees." % len(train_treebank))

    # (1) build the dictionaries
    self.code_symbols(train_treebank, lexicon_size=max_lexicon_size)

    # (2) encode the data sets
    lex_train_gen, struct_train_gen = self.make_data_generators(
        train_treebank, batch_size)
    lex_dev_gen, struct_dev_gen = self.make_data_generators(
        validation_treebank, batch_size)

    print(self, flush=True)
    print("epochs %d\nstructural training examples [N] = %d\nlexical training examples [N] = %d\nBatch size = %d\nDropout = %f\nlearning rate = %f"
          % (max_epochs, struct_train_gen.N, lex_train_gen.N, batch_size,
             hidden_dropout, lr), flush=True)

    # (3) make the network
    self.model = dy.ParameterCollection()
    self.hidden_weights = self.model.add_parameters(
        (self.hidden_size, self.embedding_size * self.input_length))
    self.action_weights = self.model.add_parameters(
        (self.actions_size, self.hidden_size))
    if glove_file is None:
        self.input_embeddings = self.model.add_parameters(
            (self.lexicon_size, self.embedding_size))
    else:
        self.input_embeddings = self.model.parameters_from_numpy(
            self.read_glove_embeddings(glove_file))
    if not self.tied:
        self.output_embeddings = self.model.add_parameters(
            (self.lexicon_size, self.hidden_size))

    # (4) fitting
    lex_gen = lex_train_gen.next_batch()
    struct_gen = struct_train_gen.next_batch()
    max_batches = max(lex_train_gen.get_num_batches(),
                      struct_train_gen.get_num_batches())
    print(lex_train_gen.get_num_batches(),
          struct_train_gen.get_num_batches(), flush=True)

    lex_valid_gen = lex_dev_gen.next_batch()
    struct_valid_gen = struct_dev_gen.next_batch()

    min_nll = float('inf')
    trainer = dy.AdamTrainer(self.model, alpha=lr)
    history_log = []
    for e in range(max_epochs):
        struct_loss, lex_loss = 0, 0
        struct_N, lex_N = 0, 0
        start_t = time.time()
        for b in range(max_batches):
            # struct
            # question of proportions: should struct and lex be evenly
            # sampled or not (??) -- here the parity oversamples the
            # lexical actions by roughly a factor of two
            X_struct, Y_struct = next(struct_gen)
            dy.renew_cg()
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.input_embeddings)
            A = dy.parameter(self.action_weights)
            batched_X = zip(*X_struct)  # transposes the X matrix
            lookups = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
            xdense = dy.concatenate(lookups)
            ybatch_preds = dy.pickneglogsoftmax_batch(
                A * dy.dropout(dy.tanh(W * xdense), hidden_dropout),
                Y_struct)
            loss = dy.sum_batches(ybatch_preds)
            struct_N += len(Y_struct)
            struct_loss += loss.value()
            loss.backward()
            trainer.update()

            # lex
            X_lex, Y_lex = next(lex_gen)
            if self.tied:
                dy.renew_cg()
                W = dy.parameter(self.hidden_weights)
                E = dy.parameter(self.input_embeddings)
                batched_X = zip(*X_lex)  # transposes the X matrix
                lookups = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
                xdense = dy.concatenate(lookups)
                ybatch_preds = dy.pickneglogsoftmax_batch(
                    E * dy.dropout(dy.tanh(W * xdense), hidden_dropout),
                    Y_lex)
                loss = dy.sum_batches(ybatch_preds)
            else:
                dy.renew_cg()
                W = dy.parameter(self.hidden_weights)
                E = dy.parameter(self.input_embeddings)
                O = dy.parameter(self.output_embeddings)
                batched_X = zip(*X_lex)  # transposes the X matrix
                lookups = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
                xdense = dy.concatenate(lookups)
                ybatch_preds = dy.pickneglogsoftmax_batch(
                    O * dy.dropout(dy.tanh(W * xdense), hidden_dropout),
                    Y_lex)
                loss = dy.sum_batches(ybatch_preds)
            lex_N += len(Y_lex)
            lex_loss += loss.value()
            loss.backward()
            trainer.update()
        end_t = time.time()

        # (5) validation
        X_lex_valid, Y_lex_valid = lex_dev_gen.batch_all()
        lex_valid_nll = -sum(
            self.predict_logprobs(X_lex_valid, Y_lex_valid,
                                  structural=False))

        X_struct_valid, Y_struct_valid = struct_dev_gen.batch_all()
        struct_valid_nll = -sum(
            self.predict_logprobs(X_struct_valid, Y_struct_valid,
                                  structural=True))

        history_log.append((e, end_t - start_t,
                            exp(lex_loss / lex_N),
                            exp(struct_loss / struct_N),
                            exp(lex_valid_nll / lex_dev_gen.N),
                            exp(struct_valid_nll / struct_dev_gen.N),
                            exp((lex_valid_nll + struct_valid_nll) /
                                (struct_dev_gen.N + lex_dev_gen.N))))
        print('Epoch %d (%.2f sec.) TRAIN:: PPL_lex = %f, PPL_struct = %f / VALID:: PPL_lex = %f, PPL_struct = %f, PPL_all = %f'
              % tuple(history_log[-1]), flush=True)
        if lex_valid_nll + struct_valid_nll < min_nll:
            # keep track of the best validation NLL seen so far; without
            # this update the model would be dumped at every epoch
            min_nll = lex_valid_nll + struct_valid_nll
            df = pd.DataFrame(history_log,
                              columns=['epoch', 'wall_time',
                                       'ppl_lex_train', 'ppl_struct_train',
                                       'ppl_lex_valid', 'ppl_struct_valid',
                                       'ppl_all_valid'])
            self.save_model('best_model_dump', epoch=e, learning_curve=df)
    return pd.DataFrame(history_log,
                        columns=['epoch', 'wall_time', 'ppl_lex_train',
                                 'ppl_struct_train', 'ppl_lex_valid',
                                 'ppl_struct_valid', 'ppl_all_valid'])
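# A condensed, self-contained sketch (hypothetical sizes) of the batched
# update used in static_train above: each column of the transposed X holds
# one feature position across the whole mini-batch, dy.pick_batch gathers
# one embedding row per batch element, and dy.pickneglogsoftmax_batch
# returns one loss per element, which dy.sum_batches reduces to a scalar.
import dynet as dy

m = dy.ParameterCollection()
E_p = m.add_parameters((100, 32))      # lexicon_size x embedding_size
W_p = m.add_parameters((64, 3 * 32))   # hidden_size x (input_length * embedding_size)
A_p = m.add_parameters((20, 64))       # actions_size x hidden_size
X = [[1, 5, 7], [2, 5, 9]]             # 2 examples, 3 feature positions each
Y = [3, 11]                            # one gold action per example

dy.renew_cg()
E, W, A = dy.parameter(E_p), dy.parameter(W_p), dy.parameter(A_p)
lookups = [dy.pick_batch(E, list(col)) for col in zip(*X)]
xdense = dy.concatenate(lookups)
loss = dy.sum_batches(dy.pickneglogsoftmax_batch(A * dy.tanh(W * xdense), Y))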
def post_order_train(self, words, oracle_actions, oracle_tokens, options,
                     buffer, stack_top, action_top):
    stack = []
    losses = []
    stack_symbol = []
    reduced = 0
    nt_allowed = 1
    # recursively generate the tree until training data is exhausted
    while not (len(stack_symbol) == 1 and reduced != 0):
        valid_actions = []
        if len(stack_symbol) == 0:
            valid_actions += [_ACT]
        if len(stack_symbol) >= 1:
            valid_actions += [_TER, _ACT]
        if len(stack) >= 1 and nt_allowed:
            valid_actions += [_NT]

        action = self.act_vocab[oracle_actions.pop(0)]
        word_weights = None
        # we make predictions when the stack is not empty and _ACT is not
        # the only valid action
        if len(stack_symbol) > 0:
            stack_embedding = stack[-1][0].output(
            ) if stack else self.initial_embedding()
            action_summary = action_top.output()
            word_weights = self.attention(stack_embedding, buffer)
            buffer_embedding = dy.esum([
                vector * attention_weight
                for vector, attention_weight in zip(buffer, word_weights)
            ])

            parser_state = dy.concatenate(
                [buffer_embedding, stack_embedding, action_summary])
            h = self.mlp_layer(parser_state)
            if options.dropout > 0:
                h = dy.dropout(h, options.dropout)
            if len(valid_actions) > 0:
                log_probs = dy.log_softmax(self.act_proj_layer(h),
                                           valid_actions)
                assert action in valid_actions, "action not in scope"
                losses.append(-dy.pick(log_probs, action))

        if action == _NT:
            # generate a non-terminal
            nt = self.nt_vocab[oracle_tokens.pop(0)]
            log_probs_nt = dy.log_softmax(self.nt_proj_layer(h))
            losses.append(-dy.pick(log_probs_nt, nt))
            stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                             'ROOT',
                                                             stack_top)
            parent_rep = self.nt_input_layer(self.nt_lookup[nt])

            found_start = 0
            path_input = []
            while found_start != 1:
                top_symbol = stack_symbol.pop()
                if top_symbol != '|':
                    top = stack.pop()
                    top_raw_rep, top_label, top_rep = top[2], top[1], top[0]
                    path_input.append(top_raw_rep)
                else:
                    found_start = 1
            composed_rep = self.subtree_input_layer(
                dy.concatenate([dy.average(path_input), parent_rep]))
            stack_state = stack_state.add_input(composed_rep)
            stack.append((stack_state, 'c', composed_rep))
            stack_symbol.append('c')
            reduced = 1
        elif action == _TER:
            # generate a terminal
            ter = self.ter_vocab[oracle_tokens.pop(0)]
            log_probs_ter = dy.log_softmax(self.ter_proj_layer(h))
            losses.append(-dy.pick(log_probs_ter, ter))
            stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                             'ROOT',
                                                             stack_top)
            ter_embedding = self.ter_input_layer(self.ter_lookup[ter])
            stack_state = stack_state.add_input(ter_embedding)
            stack.append((stack_state, 'c', ter_embedding))
            stack_symbol.append('c')
        else:
            # mark the handle
            stack_symbol.append('|')

        action_embedding = self.act_input_layer(self.act_lookup[action])
        action_top = action_top.add_input(action_embedding)
        nt_allowed = 1
        if stack_symbol.count('|') == 0:
            nt_allowed = 0
    return dy.esum(losses)
def generate(self, pre_context, pos_context, entity):
    embedded = self.embed_sentence(pre_context)
    pre_encoded = self.encode_sentence(self.encpre_fwd_lstm,
                                       self.encpre_bwd_lstm, embedded)

    embedded = self.embed_sentence(pos_context)
    pos_encoded = self.encode_sentence(self.encpos_fwd_lstm,
                                       self.encpos_bwd_lstm, embedded)

    w = dy.parameter(self.decoder_w)
    b = dy.parameter(self.decoder_b)

    w1_pre = dy.parameter(self.attention_w1_pre)
    h_pre = dy.concatenate_cols(pre_encoded)
    w1dt_pre = None

    w1_pos = dy.parameter(self.attention_w1_pos)
    h_pos = dy.concatenate_cols(pos_encoded)
    w1dt_pos = None

    last_output_embeddings = self.output_lookup[self.output2int[self.EOS]]
    try:
        entity_embedding = self.input_lookup[self.input2int[entity]]
    except KeyError:
        # fall back to the EOS entry for out-of-vocabulary entities
        entity_embedding = self.input_lookup[self.input2int[self.EOS]]
    s = self.dec_lstm.initial_state().add_input(
        dy.concatenate([
            dy.vecInput(self.STATE_SIZE * 2), last_output_embeddings,
            entity_embedding
        ]))

    out = []
    count_EOS = 0
    for i in range(self.config['GENERATION']):
        if count_EOS == 2:
            break
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt_pre = w1dt_pre or w1_pre * h_pre
        w1dt_pos = w1dt_pos or w1_pos * h_pos

        attention_pre = self.attend(h_pre, s, w1dt_pre,
                                    self.attention_w2_pre,
                                    self.attention_v_pre)
        attention_pos = self.attend(h_pos, s, w1dt_pos,
                                    self.attention_w2_pos,
                                    self.attention_v_pos)

        vector = dy.concatenate([
            self.hier_attend(attention_pre, attention_pos, s),
            last_output_embeddings, entity_embedding
        ])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector).vec_value()
        next_word = probs.index(max(probs))
        last_output_embeddings = self.output_lookup[next_word]
        if self.int2output[next_word] == self.EOS:
            count_EOS += 1
            continue
        out.append(self.int2output[next_word])
    return out
def pre_order_parse(self, words, oracle_actions, oracle_tokens, buffer,
                    stack_top, action_top):
    stack = []
    # check if a reduce is allowed
    reducable = 0
    # check if a reduce has ever been performed
    reduced = 0
    # check if nt/ter actions are allowed
    nt_allowed = 1
    ter_allowed = 1

    output_actions = []
    output_tokens = []

    # the first action is always NT and the first token ROOT
    action = self.act_vocab[oracle_actions.pop(0)]
    nt = self.nt_vocab[oracle_tokens.pop(0)]

    # recursively generate the tree until the constraints are met
    while not (len(stack) == 1 and reduced != 0):
        valid_actions = []
        if len(stack) == 0:
            valid_actions += [_NT]
        if len(stack) >= 1:
            if ter_allowed == 1:
                valid_actions += [_TER]
            if nt_allowed == 1:
                valid_actions += [_NT]
        if len(stack) >= 2 and reducable != 0:
            valid_actions += [_ACT]

        word_weights = None
        action = valid_actions[0]
        if len(valid_actions) > 1 or (len(stack) > 0
                                      and valid_actions[0] != _ACT):
            stack_embedding = stack[-1][0].output()
            action_summary = action_top.output()
            word_weights = self.attention(stack_embedding, buffer)
            buffer_embedding = dy.esum([
                vector * attention_weight
                for vector, attention_weight in zip(buffer, word_weights)
            ])

            for i in range(len(stack)):
                if stack[len(stack) - 1 - i][1] == 'p':
                    parent_embedding = stack[len(stack) - 1 - i][2]
                    break
            parser_state = dy.concatenate([
                buffer_embedding, stack_embedding, parent_embedding,
                action_summary
            ])
            h = self.mlp_layer(parser_state)
            log_probs = dy.log_softmax(self.act_proj_layer(h),
                                       valid_actions)
            action = max(enumerate(log_probs.vec_value()),
                         key=itemgetter(1))[0]

        if action == _NT:
            if word_weights is not None:
                # no prediction is made for ROOT
                log_probs_nt = dy.log_softmax(self.nt_proj_layer(h))
                nt = max(enumerate(log_probs_nt.vec_value()),
                         key=itemgetter(1))[0]
            nt_embedding = self.nt_input_layer(self.nt_lookup[nt])
            stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                             'ROOT',
                                                             stack_top)
            stack_state = stack_state.add_input(nt_embedding)
            stack.append((stack_state, 'p', nt_embedding))

            output_actions.append(self.act_vocab.token(action))
            output_tokens.append(self.nt_vocab.token(nt))
        elif action == _TER:
            log_probs_ter = dy.log_softmax(self.ter_proj_layer(h))
            ter = max(enumerate(log_probs_ter.vec_value()),
                      key=itemgetter(1))[0]
            ter_embedding = self.ter_input_layer(self.ter_lookup[ter])
            stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                             'ROOT',
                                                             stack_top)
            stack_state = stack_state.add_input(ter_embedding)
            stack.append((stack_state, 'c', ter_embedding))

            output_actions.append(self.act_vocab.token(action))
            output_tokens.append(self.ter_vocab.token(ter))
        else:
            found_p = 0
            path_input = []
            while found_p != 1:
                top = stack.pop()
                top_raw_rep, top_label, top_rep = top[2], top[1], top[0]
                path_input.append(top_raw_rep)
                if top_label == 'p' or top_label == 'ROOT':
                    found_p = 1
            parent_rep = path_input.pop()
            composed_rep = self.subtree_input_layer(
                dy.concatenate([dy.average(path_input), parent_rep]))
            stack_state, _, _ = stack[-1] if stack else (stack_top, 'ROOT',
                                                         stack_top)
            stack_state = stack_state.add_input(composed_rep)
            stack.append((stack_state, 'c', composed_rep))
            reduced = 1

            output_actions.append(self.act_vocab.token(action))

        action_embedding = self.act_input_layer(self.act_lookup[action])
        action_top = action_top.add_input(action_embedding)
        reducable = 1
        nt_allowed = 1
        ter_allowed = 1

        # a reduce cannot follow an NT
        if stack[-1][1] == 'p' or stack[-1][1] == 'ROOT':
            reducable = 0

        # NT is disabled once the maximum number of open non-terminals is reached
        count_p = 0
        for item in stack:
            if item[1] == 'p':
                count_p += 1
        if count_p >= 10:
            nt_allowed = 0

        # TER is disabled once the maximum number of children under the open NT is reached
        count_c = 0
        for item in stack[::-1]:
            if item[1] == 'c':
                count_c += 1
            else:
                break
        if count_c >= 10:
            ter_allowed = 0
    return output_actions, output_tokens
def beam_search(self, pre_context, pos_context, entity, beam):
    embedded = self.embed_sentence(pre_context)
    pre_encoded = self.encode_sentence(self.encpre_fwd_lstm,
                                       self.encpre_bwd_lstm, embedded)

    embedded = self.embed_sentence(pos_context)
    pos_encoded = self.encode_sentence(self.encpos_fwd_lstm,
                                       self.encpos_bwd_lstm, embedded)

    w = dy.parameter(self.decoder_w)
    b = dy.parameter(self.decoder_b)

    w1_pre = dy.parameter(self.attention_w1_pre)
    h_pre = dy.concatenate_cols(pre_encoded)
    w1dt_pre = None

    w1_pos = dy.parameter(self.attention_w1_pos)
    h_pos = dy.concatenate_cols(pos_encoded)
    w1dt_pos = None

    try:
        entity_embedding = self.input_lookup[self.input2int[entity]]
    except KeyError:
        # fall back to the EOS entry for out-of-vocabulary entities
        entity_embedding = self.input_lookup[self.input2int[self.EOS]]
    last_output_embeddings = self.output_lookup[self.output2int[self.EOS]]
    s = self.dec_lstm.initial_state().add_input(
        dy.concatenate([
            dy.vecInput(self.STATE_SIZE * 2), last_output_embeddings,
            entity_embedding
        ]))

    candidates = [{
        'sentence': [self.EOS],
        'prob': 0.0,
        'count_EOS': 0,
        's': s
    }]
    outputs = []
    i = 0
    while i < self.config['GENERATION'] and len(outputs) < beam:
        new_candidates = []
        for candidate in candidates:
            if candidate['count_EOS'] == 2:
                outputs.append(candidate)
                if len(outputs) == beam:
                    break
            else:
                # w1dt can be computed and cached once for the entire decoding phase
                w1dt_pre = w1dt_pre or w1_pre * h_pre
                w1dt_pos = w1dt_pos or w1_pos * h_pos

                attention_pre = self.attend(h_pre, candidate['s'],
                                            w1dt_pre,
                                            self.attention_w2_pre,
                                            self.attention_v_pre)
                attention_pos = self.attend(h_pos, candidate['s'],
                                            w1dt_pos,
                                            self.attention_w2_pos,
                                            self.attention_v_pos)

                last_output_embeddings = self.output_lookup[
                    self.output2int[candidate['sentence'][-1]]]
                vector = dy.concatenate([
                    self.hier_attend(attention_pre, attention_pos,
                                     candidate['s']),
                    last_output_embeddings, entity_embedding
                ])
                s = candidate['s'].add_input(vector)
                out_vector = w * s.output() + b
                probs = dy.softmax(out_vector).vec_value()

                next_words = [{
                    'prob': e,
                    'index': probs.index(e)
                } for e in sorted(probs, reverse=True)[:beam]]
                for next_word in next_words:
                    word = self.int2output[next_word['index']]
                    new_candidate = {
                        'sentence': candidate['sentence'] + [word],
                        'prob':
                        candidate['prob'] + np.log(next_word['prob']),
                        'count_EOS': candidate['count_EOS'],
                        's': s
                    }
                    if word == self.EOS:
                        new_candidate['count_EOS'] += 1
                    new_candidates.append(new_candidate)
        candidates = sorted(new_candidates,
                            key=lambda x: x['prob'],
                            reverse=True)[:beam]
        i += 1

    if len(outputs) == 0:
        outputs = candidates

    # Length normalization
    alpha = 0.6
    for output in outputs:
        length = len(output['sentence'])
        lp_y = ((5.0 + length) ** alpha) / ((5.0 + 1.0) ** alpha)
        output['prob'] = output['prob'] / lp_y

    outputs = sorted(outputs, key=lambda x: x['prob'], reverse=True)
    return list(map(lambda x: x['sentence'], outputs))
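# The length normalization above follows the GNMT-style penalty
# lp(Y) = (5 + |Y|)^alpha / (5 + 1)^alpha with alpha = 0.6 (Wu et al., 2016):
# dividing each hypothesis' accumulated log-probability by lp(Y) keeps long
# outputs from being unfairly penalized. A tiny worked example with made-up
# scores:
alpha = 0.6
for length, log_p in [(4, -3.2), (12, -6.1)]:
    lp_y = ((5.0 + length) ** alpha) / ((5.0 + 1.0) ** alpha)
    print(length, log_p / lp_y)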
def span_parse(self, words, oracle_actions, oracle_tokens, buffer,
               stack_top, action_top):
    stack = []
    losses = []
    output_actions = []
    output_tokens = []

    nt_allowed = 1
    found_root = 0
    consecutive_nt = 0
    consecutive_ter = 0
    total_ter = 0
    _max_ter = len(words)
    _root = self.nt_vocab[oracle_tokens[-1]]

    # recursively generate the tree until the root is found
    while not found_root:
        valid_actions = []
        if len(stack) == 0:
            valid_actions += [_TER]
        if len(stack) >= 1 and consecutive_ter <= 5 and total_ter <= _max_ter:
            valid_actions += [_TER]
        if len(stack) >= 2:
            valid_actions += [_ACT]
        if len(stack) >= 1 and consecutive_nt <= 10:
            valid_actions += [_NT]
        if len(valid_actions) == 0:
            break
        action = valid_actions[0]

        # we make predictions when the stack is not empty and _ACT is not
        # the only valid action
        stack_embedding = stack[-1][0].output(
        ) if stack else self.initial_embedding()
        action_summary = action_top.output(
        ) if len(stack) > 0 else self.initial_embedding()
        word_weights = self.attention(stack_embedding, buffer)
        buffer_embedding = dy.esum([
            vector * attention_weight
            for vector, attention_weight in zip(buffer, word_weights)
        ])

        parser_state = dy.concatenate(
            [buffer_embedding, stack_embedding, action_summary])
        h = self.mlp_layer(parser_state)

        if len(valid_actions) > 0:
            log_probs = dy.log_softmax(self.act_proj_layer(h),
                                       valid_actions)
            assert action in valid_actions, "action not in scope"
            action = max(enumerate(log_probs.vec_value()),
                         key=itemgetter(1))[0]

        if action == _NT:
            # label the span
            log_probs_nt = dy.log_softmax(self.nt_proj_layer(h))
            nt = max(enumerate(log_probs_nt.vec_value()),
                     key=itemgetter(1))[0]
            if nt == _root:
                found_root = 1
            stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                             'ROOT',
                                                             stack_top)
            parent_rep = self.nt_input_layer(self.nt_lookup[nt])
            top = stack.pop()
            top_raw_rep, top_label, top_rep = top[2], top[1], top[0]
            composed_rep = self.subtree_input_layer(
                dy.concatenate([top_raw_rep, parent_rep]))
            stack_state = stack_state.add_input(composed_rep)
            stack.append((stack_state, 'p', composed_rep))
            consecutive_nt += 1
            consecutive_ter = 0
            output_actions.append(self.act_vocab.token(action))
            output_tokens.append(self.nt_vocab.token(nt))
        elif action == _TER:
            # generate a terminal
            log_probs_ter = dy.log_softmax(self.ter_proj_layer(h))
            ter = max(enumerate(log_probs_ter.vec_value()),
                      key=itemgetter(1))[0]
            stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                             'ROOT',
                                                             stack_top)
            ter_embedding = self.ter_input_layer(self.ter_lookup[ter])
            stack_state = stack_state.add_input(ter_embedding)
            stack.append((stack_state, 'c', ter_embedding))
            consecutive_nt = 0
            consecutive_ter += 1
            total_ter += 1
            output_actions.append(self.act_vocab.token(action))
            output_tokens.append(self.ter_vocab.token(ter))
        else:
            # extend the span
            assert len(stack) >= 2
            top2 = stack.pop()
            top1 = stack.pop()
            top2_raw_rep = top2[2]
            top1_raw_rep = top1[2]
            # read the state below the two popped items before pushing the
            # merged span
            stack_state, _, _ = stack[-1] if stack else (stack_top, 'ROOT',
                                                         stack_top)
            span_rep = self.span_input_layer(
                dy.concatenate([top2_raw_rep, top1_raw_rep]))
            stack_state = stack_state.add_input(span_rep)
            stack.append((stack_state, 'c', span_rep))
            consecutive_nt = 0
            consecutive_ter = 0
            output_actions.append(self.act_vocab.token(action))

        action_embedding = self.act_input_layer(self.act_lookup[action])
        action_top = action_top.add_input(action_embedding)
    return output_actions, output_tokens
def run(self, word_inputs, tag_inputs, arc_targets=None, rel_targets=None,
        isTrain=True):
    # inputs, targets: seq_len x batch_size
    def dynet_flatten_numpy(ndarray):
        return np.reshape(ndarray, (-1, ), 'F')

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
    num_tokens = int(np.sum(mask))

    if isTrain or arc_targets is not None:
        mask_1D = dynet_flatten_numpy(mask)
        # "batched" means the last dim is treated as the batch dimension,
        # both in input and output
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

    # TODO: mind _words_in_train
    if self.pre_train_emb:
        # sum of the two embeddings:
        # [Expression of dim=((embedding_dim,), batch_size)] * seq_len
        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w,
                         self._vocab.UNK)) +
            dy.lookup_batch(self.pret_word_embs, w, update=False)
            for w in word_inputs
        ]
    else:
        # single embedding table: [Expression] * seq_len
        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w,
                         self._vocab.UNK)) for w in word_inputs
        ]
    tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

    if isTrain:
        emb_masks = self.generate_emb_mask(seq_len, batch_size)
        emb_inputs = [
            dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
            for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
        ]
    else:
        emb_inputs = [
            dy.concatenate([w, pos])
            for w, pos in zip(word_embs, tag_embs)
        ]

    top_recur = dy.concatenate_cols(
        biLSTM(self.LSTM_builders, emb_inputs, batch_size,
               self.dropout_lstm_input if isTrain else 0.,
               self.dropout_lstm_hidden if isTrain else 0.))
    if isTrain:
        top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

    W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
        self.mlp_dep_b)
    W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
        self.mlp_head_b)
    dep = leaky_relu(dy.affine_transform([b_dep, W_dep, top_recur]))
    head = leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
    if isTrain:
        # dim=0 drops columns and dim=1 drops rows (so with dim=1 a whole
        # row can be zeroed out); the third dimension is the batch
        dep = dy.dropout_dim(dep, 1, self.dropout_mlp)
        head = dy.dropout_dim(head, 1, self.dropout_mlp)

    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

    W_arc = dy.parameter(self.arc_W)
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size,
                          seq_len, batch_size, num_outputs=1, bias_x=True,
                          bias_y=False)
    # (#head x #dep) x batch_size
    flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                 seq_len * batch_size)
    # flattened this way to compute the loss
    # (#head) x (#dep x batch_size)
    arc_preds = arc_logits.npvalue().argmax(0)
    # seq_len x batch_size

    if isTrain or arc_targets is not None:
        # the loss is computed against the highest-scoring arc; that arc is
        # not necessarily kept as the decoding result, but it must be
        # pushed down
        arc_correct = np.equal(arc_preds, arc_targets).astype(
            np.float32) * mask  # the mask still applies here
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = dynet_flatten_numpy(arc_targets)
        losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
        arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        arc_probs = np.transpose(
            np.reshape(
                dy.softmax(flat_arc_logits).npvalue(),
                (seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head

    W_rel = dy.parameter(self.rel_W)
    # dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    # head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size,
                          seq_len, batch_size,
                          num_outputs=self._vocab.rel_size, bias_x=True,
                          bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = dy.reshape(rel_logits,
                                 (seq_len, self._vocab.rel_size),
                                 seq_len * batch_size)
    # (#head x rel_size) x (#dep x batch_size)
    partial_rel_logits = dy.pick_batch(
        flat_rel_logits,
        targets_1D if isTrain else dynet_flatten_numpy(arc_preds))
    # (rel_size) x (#dep x batch_size)

    if isTrain or arc_targets is not None:
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = dynet_flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds, targets_1D).astype(
            np.float32) * mask_1D  # the shapes here require the 1-D mask
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        rel_probs = np.transpose(
            np.reshape(
                dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

    if isTrain or arc_targets is not None:
        loss = arc_loss + rel_loss
        correct = rel_correct * dynet_flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens

    if isTrain:
        return arc_accuracy, rel_accuracy, overall_accuracy, loss

    outputs = []
    # parse sentences one by one; this part of decoding really cannot be
    # batched
    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                       rel_probs):
        msk[0] = 1.
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(rel_prob, sent_len)
        # index 0 is indeed ROOT
        outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

    if arc_targets is not None:
        return arc_accuracy, rel_accuracy, overall_accuracy, outputs
    return outputs
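# A toy check of the column-major flattening used above: dy.reshape with a
# batch dimension lays batch elements out in column-major order, so the
# numpy targets must be flattened with order 'F' to line up
# element-for-element with the batched expression.
import numpy as np

targets = np.array([[1, 2], [3, 4]])       # seq_len x batch_size
print(np.reshape(targets, (-1, ), 'F'))    # -> [1 3 2 4]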
def beam_search_decode(
    self,
    input_: str,
    encoded_input: List[int],
    beam_width: int,
    external_cg: bool = True,
):
    if not external_cg:
        dy.renew_cg()

    input_emb = self.input_embedding(encoded_input, is_training=False)
    # drop BEGIN_WORD
    bidirectional_emb = self.bidirectional_encoding(input_emb)[1:]
    input_length = len(bidirectional_emb)
    decoder = self.dec.initial_state()

    beam: List[Hypothesis] = [
        Hypothesis(
            action_history=[BEGIN_WORD],
            alignment=0,
            decoder=decoder,
            negative_log_p=0.0,
            output=[],
        )
    ]

    hypothesis_length = 0
    complete_hypotheses = []

    while (beam and beam_width > 0
           and hypothesis_length <= MAX_ACTION_SEQ_LEN):
        expansions: List[Expansion] = []

        for hypothesis in beam:
            length_encoder_suffix = input_length - hypothesis.alignment
            valid_actions = self.compute_valid_actions(
                length_encoder_suffix)
            # decoder
            decoder_input = dy.concatenate([
                bidirectional_emb[hypothesis.alignment],
                self.act_lookup[hypothesis.action_history[-1]],
            ])
            decoder = hypothesis.decoder.add_input(decoder_input)
            # classifier
            logits = self.pW * decoder.output() + self.pb
            log_probs_expr = dy.log_softmax(logits, valid_actions)
            log_probs = log_probs_expr.npvalue()

            for action in valid_actions:
                # min heap, so minus
                log_p = hypothesis.negative_log_p - log_probs[action]
                heapq.heappush(
                    expansions,
                    Expansion(action, decoder, hypothesis, log_p),
                )

        beam: List[Hypothesis] = []

        for _ in range(beam_width):
            expansion: Expansion = heapq.heappop(expansions)
            from_hypothesis = expansion.from_hypothesis
            action = expansion.action
            action_history = list(from_hypothesis.action_history)
            action_history.append(action)
            output = list(from_hypothesis.output)

            # execute the action to update the transducer state
            action = self.vocab.decode_action(action)

            if isinstance(action, EndOfSequence):
                # 1. COMPLETE HYPOTHESIS, REDUCE BEAM
                complete_hypothesis = Output(
                    action_history=action_history,
                    output="".join(output),
                    log_p=-expansion.negative_log_p,  # undo min heap minus
                )
                complete_hypotheses.append(complete_hypothesis)
                beam_width -= 1
            else:
                # 2. EXECUTE ACTION AND ADD FULL HYPOTHESIS TO NEW BEAM
                alignment = from_hypothesis.alignment

                if isinstance(action, ConditionalCopy):
                    char_ = input_[alignment]
                    alignment += 1
                    output.append(char_)
                elif isinstance(action, ConditionalDel):
                    alignment += 1
                elif isinstance(action, ConditionalIns):
                    output.append(action.new)
                elif isinstance(action, ConditionalSub):
                    alignment += 1
                    output.append(action.new)
                else:
                    raise ValueError(f"Unknown action: {action}.")

                hypothesis = Hypothesis(
                    action_history=action_history,
                    alignment=alignment,
                    decoder=expansion.decoder,
                    negative_log_p=expansion.negative_log_p,
                    output=output,
                )
                beam.append(hypothesis)

        hypothesis_length += 1

    if not complete_hypotheses:
        # nothing was found because the model is performing very badly
        for hypothesis in beam:
            complete_hypothesis = Output(
                action_history=hypothesis.action_history,
                output="".join(hypothesis.output),
                log_p=-hypothesis.negative_log_p,  # undo min heap minus
            )
            complete_hypotheses.append(complete_hypothesis)

    complete_hypotheses.sort(reverse=True)
    return complete_hypotheses
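# beam_search_decode assumes lightweight hypothesis containers along these
# lines; this is a hypothetical reconstruction, not the original
# definitions. Note that heapq pops the smallest element, so Expansion must
# compare only on negative_log_p for the best-scoring expansion to come out
# first, and Output must compare on log_p for sort(reverse=True) to rank
# complete hypotheses by score.
from dataclasses import dataclass, field
from typing import Any, List

@dataclass
class Hypothesis:
    action_history: List[int]
    alignment: int
    decoder: Any
    negative_log_p: float
    output: List[str]

@dataclass(order=True)
class Expansion:
    action: int = field(compare=False)
    decoder: Any = field(compare=False)
    from_hypothesis: Any = field(compare=False)
    negative_log_p: float = field(compare=True)

@dataclass(order=True)
class Output:
    action_history: List[int] = field(compare=False)
    output: str = field(compare=False)
    log_p: float = field(compare=True)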
def build_tagging_graph(self, words):
    # parameters -> expressions
    self.w1 = dy.parameter(self.W1)
    self.b1 = dy.parameter(self.B1)
    ###############################
    self.xw1 = dy.parameter(self.xW1)
    self.xb1 = dy.parameter(self.xB1)
    self.xw2 = dy.parameter(self.xW2)
    self.xb2 = dy.parameter(self.xB2)

    # apply dropout
    if self.eval:
        self.disable_dropout()
    else:
        self.enable_dropout()

    # initialize the RNNs
    f_init = self.fwdRNN.initial_state()
    b_init = self.bwdRNN.initial_state()
    f2_init = self.fwdRNN2.initial_state()
    b2_init = self.bwdRNN2.initial_state()

    self.cf_init = self.cfwdRNN.initial_state()
    self.cb_init = self.cbwdRNN.initial_state()

    xf_init = self.xfwdRNN.initial_state()
    xb_init = self.xbwdRNN.initial_state()
    xf2_init = self.xfwdRNN2.initial_state()
    xb2_init = self.xbwdRNN2.initial_state()

    self.xcf_init = self.xcfwdRNN.initial_state()
    self.xcb_init = self.xcbwdRNN.initial_state()

    # get the word vectors. word_rep(...) returns a 128-dim vector
    # expression for each word.
    wembs = [self.word_rep(w) for w in words]
    cembs = [self.char_rep(w, self.cf_init, self.cb_init) for w in words]
    xembs = [dy.concatenate([w, c]) for w, c in zip(wembs, cembs)]

    # feed the word vectors into the first biLSTM
    fw_exps = f_init.transduce(xembs)
    bw_exps = b_init.transduce(reversed(xembs))

    # biLSTM states
    bi_exps = [
        dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))
    ]

    # feed the first-layer states into the second biLSTM
    fw_exps = f2_init.transduce(bi_exps)
    bw_exps = b2_init.transduce(reversed(bi_exps))

    # biLSTM states
    bi_exps = [
        dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))
    ]

    # feed each biLSTM state to an MLP
    exps = []
    pos_hidden = []
    for xi in bi_exps:
        xh = self.w1 * xi
        # xh = dy.tanh(xh) + self.b1
        pos_hidden.append(xh)

    cembs = [
        self.char_rep(w, self.xcf_init, self.xcb_init) for w in words
    ]
    xembs = [
        dy.concatenate(list(wcp)) for wcp in zip(wembs, cembs, pos_hidden)
    ]
    xfw_exps = xf_init.transduce(xembs)
    xbw_exps = xb_init.transduce(reversed(xembs))

    # biLSTM states
    bi_exps = [
        dy.concatenate([f, b])
        for f, b in zip(xfw_exps, reversed(xbw_exps))
    ]

    # feed the states into the second biLSTM of this stack
    fw_exps = xf2_init.transduce(bi_exps)
    bw_exps = xb2_init.transduce(reversed(bi_exps))

    # biLSTM states
    bi_exps = [
        dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))
    ]

    exps = []
    for xi in bi_exps:
        xh = self.xw1 * xi
        xh = self.meta.activation(xh) + self.xb1
        xo = self.xw2 * xh + self.xb2
        exps.append(xo)
    return exps
def __call__(self, words_sequence, word2int, vocab, dataset="train"):
    # get the prefix and suffix embeddings and sum them up with the word
    def add_sub_words_embd(word, word_embed):
        if len(word) <= 3:
            return dy.esum([word_embed])
        pref = False
        suff = False
        # check if the prefix exists in F2I (relevant for test/dev sets)
        if word[:3] in word2int:
            prefix_embd = lookup[word2int.get(word[:3])]
            pref = True
        # check if the suffix exists in F2I (relevant for test/dev sets)
        if word[-3:] in word2int:
            suffix_embd = lookup[word2int.get(word[-3:])]
            suff = True
        # sum the word vector with any existing prefix/suffix vectors
        if pref and suff:
            sum_embd = dy.esum([prefix_embd, suffix_embd, word_embed])
        elif pref and not suff:
            sum_embd = dy.esum([prefix_embd, word_embed])
        elif suff and not pref:
            sum_embd = dy.esum([suffix_embd, word_embed])
        else:
            sum_embd = dy.esum([word_embed])
        return sum_embd

    lookup = self.params["lookup"]
    sequence = []
    if dataset == "train":
        for word, label in words_sequence:
            char_embed = []
            if word not in vocab:
                # for words not in the vocabulary, sum char embeddings
                word_chars = list(word)
                for ch in word_chars:
                    char_embed.append(lookup[word2int.get(ch)])
                s = dy.esum(char_embed)
                sequence.append(add_sub_words_embd(word, s))
            else:
                word_embed = lookup[word2int.get(word)]
                sequence.append(add_sub_words_embd(word, word_embed))
    else:
        for word in words_sequence:
            char_embed = []
            if word not in vocab:
                # for words not in the vocabulary, sum char embeddings
                word_chars = list(word)
                for ch in word_chars:
                    char_embed.append(lookup[word2int.get(ch)])
                s = dy.esum(char_embed)
                sequence.append(add_sub_words_embd(word, s))
            else:
                word_embed = lookup[word2int.get(word)]
                sequence.append(add_sub_words_embd(word, word_embed))

    # convert the parameters into Expressions (add them to the graph)
    W = dy.parameter(self.params["W"])
    b = dy.parameter(self.params["b"])

    fw_lstm1 = self.fw_builder1.initial_state()
    bw_lstm1 = self.bw_builder1.initial_state()
    fw_lstm2 = self.fw_builder2.initial_state()
    bw_lstm2 = self.bw_builder2.initial_state()

    # get the output vectors of all time steps for the first bi-LSTM
    fw_lstm1_output = fw_lstm1.transduce(sequence)
    bw_lstm1_output = bw_lstm1.transduce(reversed(sequence))

    # concatenate the backward vector to the forward vector for each word
    bi1_output = [
        dy.concatenate([fw1, bw1])
        for fw1, bw1 in zip(fw_lstm1_output, reversed(bw_lstm1_output))
    ]

    # get the output vectors of all time steps for the second bi-LSTM
    fw_lstm2_output = fw_lstm2.transduce(bi1_output)
    bw_lstm2_output = bw_lstm2.transduce(reversed(bi1_output))

    # concatenate the backward vector to the forward vector for each
    # first-biLSTM vector
    bi2_output = [
        dy.concatenate([fw2, bw2])
        for fw2, bw2 in zip(fw_lstm2_output, reversed(bw_lstm2_output))
    ]

    # compute the network output
    net_output = [dy.softmax(W * out + b) for out in bi2_output]
    return net_output
def transduce(self, es: ExpressionSequence) -> ExpressionSequence:
    """
    returns the list of output Expressions obtained by adding the given
    inputs to the current state, one by one, to both the forward and
    backward RNNs, and concatenating.

    Args:
        es: an ExpressionSequence
    """
    es_list = [es]

    for layer_i, (fb, bb) in enumerate(self.builder_layers):
        reduce_factor = self._reduce_factor_for_layer(layer_i)

        if es_list[0].mask is None:
            mask_out = None
        else:
            mask_out = es_list[0].mask.lin_subsampled(reduce_factor)

        if self.downsampling_method == "concat" and len(
                es_list[0]) % reduce_factor != 0:
            raise ValueError(
                f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                f"but got sequence length={len(es_list[0])} for reduce_factor={reduce_factor}. "
                f"Set Batcher's pad_src_to_multiple argument accordingly.")

        fs = fb.transduce(es_list)
        bs = bb.transduce(
            [ReversedExpressionSequence(es_item) for es_item in es_list])

        if layer_i < len(self.builder_layers) - 1:
            if self.downsampling_method == "skip":
                es_list = [
                    ExpressionSequence(expr_list=fs[::reduce_factor],
                                       mask=mask_out),
                    ExpressionSequence(expr_list=bs[::reduce_factor][::-1],
                                       mask=mask_out)
                ]
            elif self.downsampling_method == "concat":
                es_len = len(es_list[0])
                es_list_fwd = []
                es_list_bwd = []
                for i in range(0, es_len, reduce_factor):
                    for j in range(reduce_factor):
                        if i == 0:
                            es_list_fwd.append([])
                            es_list_bwd.append([])
                        es_list_fwd[j].append(fs[i + j])
                        es_list_bwd[j].append(
                            bs[len(es_list[0]) - reduce_factor + j - i])
                es_list = [ExpressionSequence(expr_list=es_list_fwd[j], mask=mask_out) for j in range(reduce_factor)] + \
                          [ExpressionSequence(expr_list=es_list_bwd[j], mask=mask_out) for j in range(reduce_factor)]
            else:
                raise RuntimeError(
                    f"unknown downsampling_method {self.downsampling_method}")
        else:
            # concatenate the final outputs
            ret_es = ExpressionSequence(expr_list=[
                dy.concatenate([f, b])
                for f, b in zip(fs, ReversedExpressionSequence(bs))
            ], mask=mask_out)

    self._final_states = [
        FinalTransducerState(
            dy.concatenate([fb.get_final_states()[0].main_expr(),
                            bb.get_final_states()[0].main_expr()]),
            dy.concatenate([fb.get_final_states()[0].cell_expr(),
                            bb.get_final_states()[0].cell_expr()]))
        for (fb, bb) in self.builder_layers
    ]

    return ret_es
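# A toy illustration (made-up values) of the two downsampling modes used in
# transduce above: with reduce_factor=2, "skip" keeps every other state,
# while "concat" builds reduce_factor parallel sequences that the next layer
# reads side by side, which is why the input length must be a multiple of
# the total reduce factor.
states = ['h0', 'h1', 'h2', 'h3']
print(states[::2])                      # skip   -> ['h0', 'h2']
print([states[0::2], states[1::2]])     # concat -> [['h0', 'h2'], ['h1', 'h3']]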
def __call__(self, a, b, c):
    enc = [
        dy.rectify(self.a_mlp(a)),  # HOTFIX rectify here?
        dy.rectify(self.b_mlp(b)),
        dy.rectify(self.c_mlp(c))
    ]
    enc = [dy.concatenate([dy.scalarInput(1), x]) for x in enc]
    return self.multilinear(*enc)
def getWordEmbeddings(self, sentence, train, options,
                      test_embeddings=defaultdict(lambda: {})):
    if self.elmo:
        sentence_text = " ".join([entry.form for entry in sentence[:-1]])
        elmo_sentence_representation = \
            self.elmo.get_sentence_representation(sentence_text)

    for i, root in enumerate(sentence):
        # all vecs are None by default (possibly a little risky?)
        root.vecs = defaultdict(lambda: None)
        if options.word_emb_size > 0:
            if train:
                word_count = float(self.word_counts.get(root.norm, 0))
                dropFlag = random.random() > word_count / (0.25 +
                                                           word_count)
                root.vecs["word"] = self.word_lookup[
                    self.words.get(root.norm, 0) if not dropFlag else 0]
            else:
                # need to check in test_embeddings at prediction time
                if root.norm in self.words:
                    root.vecs["word"] = self.word_lookup[
                        self.words[root.norm]]
                elif root.norm in test_embeddings["words"]:
                    root.vecs["word"] = dy.inputVector(
                        test_embeddings["words"][root.norm])
                else:
                    root.vecs["word"] = self.word_lookup[0]
        if options.pos_emb_size > 0:
            root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]
        if options.char_emb_size > 0:
            root.vecs["char"] = self.get_char_vector(
                root, train, test_embeddings["chars"])
        if options.tbank_emb_size > 0:
            if options.forced_tbank_emb:
                treebank_id = options.forced_tbank_emb
            elif root.proxy_tbank:
                treebank_id = root.proxy_tbank
            else:
                treebank_id = root.treebank_id
            # this is a bit of a hack for models trained on an old version
            # of the code that used the treebank name rather than its id as
            # the lookup
            if not treebank_id in self.treebanks and treebank_id in utils.reverse_iso_dict and \
                    utils.reverse_iso_dict[treebank_id] in self.treebanks:
                treebank_id = utils.reverse_iso_dict[treebank_id]
            root.vecs["treebank"] = self.treebank_lookup[
                self.treebanks[treebank_id]]
        if self.elmo:
            if i < len(sentence) - 1:
                # don't look up the 'root' word
                root.vecs["elmo"] = elmo_sentence_representation[i]
            else:
                # TODO
                root.vecs["elmo"] = dy.zeros(self.elmo.emb_dim)

        # list(...) is needed under Python 3, where filter returns an
        # iterator rather than a list
        root.vec = dy.concatenate(
            list(filter(None, [
                root.vecs["word"], root.vecs["elmo"], root.vecs["pos"],
                root.vecs["char"], root.vecs["treebank"]
            ])))

    for bilstm in self.bilstms:
        bilstm.set_token_vecs(sentence, train)
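# The training-time word dropout in getWordEmbeddings replaces a word by
# the UNK index with probability 0.25 / (0.25 + count(word)), so rare words
# are dropped often and frequent words almost never. A quick check of that
# schedule:
for count in [0, 1, 4, 100]:
    keep_p = count / (0.25 + count)
    print(count, 1.0 - keep_p)   # drop probability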
def encode_sents(look, fwd, bwd, sents):
    embs = [[look[x] for x in sent] for sent in sents]
    # the backward encoder reads each sentence reversed; its final state is
    # then the encoding of the first token
    return [
        dy.concatenate([
            fwd.transduce(x)[-1],
            bwd.transduce(list(reversed(x)))[-1]
        ]) for x in embs
    ]
def step(self, instances, enable_dropout=True):
    dy.renew_cg()
    if enable_dropout:
        self.l2r_builder.set_dropout(0.5)
        self.r2l_builder.set_dropout(0.5)
        self.dec_builder.set_dropout(0.5)
    else:
        self.l2r_builder.disable_dropout()
        self.r2l_builder.disable_dropout()
        self.dec_builder.disable_dropout()

    W_y = dy.parameter(self.W_y)
    b_y = dy.parameter(self.b_y)
    W1_att_f = dy.parameter(self.W1_att_f)
    W1_att_e = dy.parameter(self.W1_att_e)
    w2_att = dy.parameter(self.w2_att)

    # instances: a list [(src0, tgt0), (src1, tgt1), (src2, tgt2)]
    maxLen = max(map(lambda x: len(x[1]), instances))
    src_sents = []
    src_sents_rev = []
    tgt_sents = []
    # the length of the src sentences, all the same
    srcSenLen = len(instances[0][0]) + 2
    tgtSenLen = maxLen + 1
    # mask for each position: each item in this list is a list of length
    # batch_size
    masks = [[] for i in range(tgtSenLen)]
    num_words = 0

    for item in instances:
        # item[0]: src; item[1]: tgt
        num_words += (len(item[1]) + 1)
        padNum = maxLen - len(item[1])
        for i in range(len(item[1]) + 1):
            masks[i].append(1)
        for i in range(len(item[1]) + 1, tgtSenLen):
            masks[i].append(0)
        thisSrc = [startSymbol] + item[0] + [endSymbol]
        src_sents.append(thisSrc)
        src_sents_rev.append(list(reversed(thisSrc)))
        thisTgt = [startSymbol] + item[1] + [
            endSymbol for i in range(padNum + 1)
        ]
        tgt_sents.append(thisTgt)

    # Bidirectional representations
    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for i in range(srcSenLen):
        batchSrc = dy.lookup_batch(
            self.src_lookup,
            [self.src_token_to_id[x[i]] for x in src_sents])
        batchSrc_rev = dy.lookup_batch(
            self.src_lookup,
            [self.src_token_to_id[x[i]] for x in src_sents_rev])
        l2r_state = l2r_state.add_input(batchSrc)
        r2l_state = r2l_state.add_input(batchSrc_rev)
        l2r_contexts.append(l2r_state.output())
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    # Combine the left and right representations for every word
    h_fs = []
    for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
        h_fs.append(dy.concatenate([l2r_i, r2l_i]))
    h_fs_matrix = dy.concatenate_cols(h_fs)

    losses = []

    # Decoder
    c_t = dy.vecInput(self.hidden_size * 2)
    start = dy.concatenate([
        dy.lookup_batch(self.tgt_lookup,
                        [self.tgt_token_to_id['</S>'] for i in tgt_sents]),
        c_t
    ])
    dec_state = self.dec_builder.initial_state().add_input(start)
    # loss = dy.pickneglogsoftmax_batch(W_y * dec_state.output() + b_y, [self.tgt_token_to_id[tgt_sent[0]] for tgt_sent in tgt_sents])
    # losses.append(loss)
    for i in range(tgtSenLen):
        # cw: item[i]; nw: item[i+1]
        h_e = dec_state.output()
        c_t = self.__attention_mlp(h_fs_matrix, h_e, 1)[0]
        # Get the embedding for the current target word
        embed_t = dy.lookup_batch(
            self.tgt_lookup,
            [self.tgt_token_to_id[tgt_sent[i]] for tgt_sent in tgt_sents])
        # Create the input vector to the decoder
        x_t = dy.concatenate([embed_t, c_t])
        dec_state = dec_state.add_input(x_t)
        o_en = dec_state.output()
        if enable_dropout:
            o_en = dy.dropout(o_en, 0.5)
        loss = dy.pickneglogsoftmax_batch(W_y * o_en + b_y, [
            self.tgt_token_to_id[tgt_sent[i + 1]] for tgt_sent in tgt_sents
        ])
        thisMask = dy.inputVector(masks[i])
        thisMask = dy.reshape(thisMask, (1, ), len(instances))
        losses.append(loss * thisMask)

    return dy.sum_batches(dy.esum(losses)), num_words
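# A toy illustration (hypothetical lengths) of the per-position masks built
# in step() above: a batch with target lengths [3, 1] padded to
# tgtSenLen = 4 yields one 0/1 vector per time step; multiplying each
# batched loss by its mask zeroes out the padded positions before
# dy.sum_batches.
maxLen, lengths = 3, [3, 1]
masks = [[1 if i < l + 1 else 0 for l in lengths] for i in range(maxLen + 1)]
print(masks)   # -> [[1, 1], [1, 1], [1, 0], [1, 0]]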
def translate_sentence(self, sent):
    dy.renew_cg()

    W_y = dy.parameter(self.W_y)
    b_y = dy.parameter(self.b_y)
    W1_att_f = dy.parameter(self.W1_att_f)
    W1_att_e = dy.parameter(self.W1_att_e)
    w2_att = dy.parameter(self.w2_att)

    sent = [startSymbol] + sent + [endSymbol]
    sent_rev = list(reversed(sent))

    # Bidirectional representations
    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for (cw_l2r, cw_r2l) in zip(sent, sent_rev):
        l2r_state = l2r_state.add_input(
            dy.lookup(self.src_lookup, self.src_token_to_id[cw_l2r]))
        r2l_state = r2l_state.add_input(
            dy.lookup(self.src_lookup, self.src_token_to_id[cw_r2l]))
        l2r_contexts.append(l2r_state.output())
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    h_fs = []
    for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
        h_fs.append(dy.concatenate([l2r_i, r2l_i]))
    h_fs_matrix = dy.concatenate_cols(h_fs)

    # Decoder
    trans_sentence = [startSymbol]
    cw = trans_sentence[-1]
    # initial context
    c_t = dy.vecInput(self.hidden_size * 2)
    start = dy.concatenate(
        [dy.lookup(self.tgt_lookup, self.tgt_token_to_id[endSymbol]), c_t])
    dec_state = self.dec_builder.initial_state().add_input(start)
    while len(trans_sentence) < self.max_len:
        h_e = dec_state.output()
        getAttention = self.__attention_mlp(h_fs_matrix, h_e, 0)
        c_t = getAttention[0]
        embed_t = dy.lookup(self.tgt_lookup, self.tgt_token_to_id[cw])
        x_t = dy.concatenate([embed_t, c_t])
        dec_state = dec_state.add_input(x_t)
        y_star = dy.softmax(W_y * dec_state.output() + b_y).vec_value()
        next_wordID = np.argmax(y_star)
        cw = self.tgt_id_to_token[next_wordID]
        cpcw = cw  # store the predicted word for computing the next step
        if cw == unkSymbol:
            # find the source word with the highest attention score
            keyWord = sent[getAttention[1]]
            if self.src_token_to_id[keyWord] == self.src_token_to_id[
                    unkSymbol]:
                # rare source word: simply copy it through to the output
                cw = keyWord
            else:
                # otherwise take the target word with the second-highest
                # probability in y_star
                next_wordID = np.argsort(y_star)[-2]
                cw = self.tgt_id_to_token[next_wordID]
        if cw == endSymbol:
            break
        if cw != startSymbol:
            trans_sentence.append(cw)
        cw = cpcw  # restore the original cw
    return ' '.join(trans_sentence[1:])
def batch_predict_next_best_action(self, config_batched,
                                   prev_action_batched, sentence_batch):
    """
    Greedily predicts the next transition for a batch of configurations,
    given the actions leading to those configurations and the related
    sentences.
    @param config_batched: a list of configurations
    @param prev_action_batched: a list of actions (or None if there are no previous actions)
    @param sentence_batch: a list of sentences
    @return a list of new configurations and a list of the actions generating these new configs
    """
    B = len(config_batched)
    idxes = list(range(B))
    new_configs = [None] * B
    new_actions = [None] * B
    if prev_action_batched is None:
        prev_action_batched = [None] * B

    # (1) sort out the lexical and structural batches
    def is_lexical(config):
        S, F, B, A, prefix_score = config
        return F is None and len(B) > 0

    lexical_idxes = [
        idx for idx in idxes if is_lexical(config_batched[idx])
    ]
    structural_idxes = [
        idx for idx in idxes if not is_lexical(config_batched[idx])
    ]

    # (2) lexical predictions
    if len(lexical_idxes) > 0:

        def make_ref_lex_action(config, sentence):
            S, F, B, A, prefix_score = config
            return (ArcEagerGenerativeParser.GENERATE, sentence[B[0]])

        X = []
        Y = []
        for idx in lexical_idxes:
            x, y = self.make_representation(
                config_batched[idx],
                make_ref_lex_action(config_batched[idx],
                                    sentence_batch[idx]),
                sentence_batch[idx],
                structural=False)
            X.append(x)
            Y.append(y)
        Xt = zip(*X)  # transpose

        if self.tied:
            dy.renew_cg()
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.input_embeddings)
            embeddings = [dy.pick_batch(E, xcol) for xcol in Xt]
            xdense = dy.concatenate(embeddings)
            preds = dy.pickneglogsoftmax_batch(E * dy.tanh(W * xdense),
                                               Y).npvalue()[0]
        else:
            dy.renew_cg()
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.input_embeddings)
            O = dy.parameter(self.output_embeddings)
            embeddings = [dy.pick_batch(E, xcol) for xcol in Xt]
            xdense = dy.concatenate(embeddings)
            preds = dy.pickneglogsoftmax_batch(O * dy.tanh(W * xdense),
                                               Y).npvalue()[0]

        preds = np.atleast_1d(preds)
        for pred_score, idx in zip(preds, lexical_idxes):
            # executes the action
            new_configs[idx] = self.generate(config_batched[idx],
                                             local_score=-pred_score)
            new_actions[idx] = (
                ArcEagerGenerativeParser.GENERATE,
                sentence_batch[idx][config_batched[idx][2][0]])

    # (3) structural predictions
    if len(structural_idxes) > 0:
        action_masks = np.array([
            self.mask_actions(config_batched[idx],
                              prev_action_batched[idx],
                              len(sentence_batch[idx]))
            for idx in structural_idxes
        ])
        X = [
            self.make_representation(config_batched[idx], None,
                                     sentence_batch[idx], structural=True)
            for idx in structural_idxes
        ]
        Xt = zip(*X)  # transpose

        dy.renew_cg()
        W = dy.parameter(self.hidden_weights)
        E = dy.parameter(self.input_embeddings)
        A = dy.parameter(self.action_weights)
        embeddings = [dy.pick_batch(E, xcol) for xcol in Xt]
        xdense = dy.concatenate(embeddings)
        preds = dy.softmax(A * dy.tanh(W * xdense)).npvalue().transpose()

        max_idxes = np.argmax(preds * action_masks, axis=1)
        max_scores = np.log(preds[np.arange(preds.shape[0]), max_idxes])
        for argmax_idx, max_score, idx in zip(max_idxes, max_scores,
                                              structural_idxes):
            # executes the action
            new_configs[idx] = self.actions[argmax_idx](
                config_batched[idx], local_score=max_score)
            new_actions[idx] = self.rev_action_codes[argmax_idx]

    return (new_configs, new_actions)
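# A sketch (hypothetical sizes) of the masked greedy selection used for the
# structural predictions above: invalid actions are zeroed out in
# probability space, so the argmax only ranges over the legal transitions,
# and the log of the surviving probability becomes the action's local score.
import numpy as np

preds = np.array([[0.1, 0.6, 0.3]])   # softmax output, one row per config
mask = np.array([[1, 0, 1]])          # action 1 is illegal here
best = np.argmax(preds * mask, axis=1)
print(best, np.log(preds[np.arange(1), best]))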