def decode_loss(self, src_encodings, tgt_seqs):
    """
    :param tgt_seqs: (tgt_heads, tgt_labels): list (length=batch_size) of (src_len)
    """
    # todo(NOTE): Sentences should start with empty token (as root of dependency tree)!
    tgt_heads, tgt_labels = tgt_seqs

    src_len = len(tgt_heads[0])
    batch_size = len(tgt_heads)
    np_tgt_heads = np.array(tgt_heads).flatten()  # (src_len * batch_size)
    np_tgt_labels = np.array(tgt_labels).flatten()
    s_arc, s_label = self.cal_scores(src_encodings)  # (src_len, src_len, bs), ([(src_len, src_len, bs)])

    s_arc_value = s_arc.npvalue()
    s_arc_choice = np.argmax(s_arc_value, axis=0).transpose().flatten()  # (src_len * batch_size)

    s_pick_labels = [dy.pick_batch(dy.reshape(score, (src_len,), batch_size=src_len * batch_size), s_arc_choice)
                     for score in s_label]
    s_argmax_labels = dy.concatenate(s_pick_labels, d=0)  # n_labels, src_len * batch_size

    reshape_s_arc = dy.reshape(s_arc, (src_len,), batch_size=src_len * batch_size)
    arc_loss = dy.pickneglogsoftmax_batch(reshape_s_arc, np_tgt_heads)
    label_loss = dy.pickneglogsoftmax_batch(s_argmax_labels, np_tgt_labels)

    loss = dy.sum_batches(arc_loss + label_loss) / batch_size
    return loss
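The reshape step above folds the per-word score columns into the batch dimension so one pickneglogsoftmax_batch call covers every (word, sentence) pair. A hypothetical standalone sketch of just that trick, with toy dimensions and random scores instead of the model's cal_scores output:

import dynet as dy
import numpy as np

L, B = 4, 2
dy.renew_cg()
s_arc = dy.inputTensor(np.random.randn(L, L, B), batched=True)   # ((L, L), B) arc scores
flat = dy.reshape(s_arc, (L,), batch_size=L * B)                 # ((L,), L * B)
heads = np.random.randint(0, L, size=L * B)                      # one gold head per column
loss = dy.sum_batches(dy.pickneglogsoftmax_batch(flat, heads)) / B
print(loss.value())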
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()

    # now decode
    all_losses = []

    # Decoder
    # need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the current state into the decoder
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = dy.pickneglogsoftmax_batch(s, next_words)

        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words

    return dy.sum_batches(dy.esum(all_losses)), num_words
def calc_sent_loss(sent, dropout=0.0):
    # Create a computation graph
    dy.renew_cg()

    # The initial history is equal to end of sentence symbols
    hist = [S] * N
    # Step through the sentence, including the end of sentence token
    all_histories = []
    all_targets = []
    for next_word in sent + [S]:
        all_histories.append(list(hist))
        all_targets.append(next_word)
        hist = hist[1:] + [next_word]
    s = calc_score_of_histories(all_histories, dropout=dropout)
    return dy.sum_batches(dy.pickneglogsoftmax_batch(s, all_targets))
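The idea here is to treat every position of a single sentence as one batch element and pick all targets with a single call. A minimal standalone sketch of the same trick, with a toy bigram scorer standing in for this example's calc_score_of_histories:

import dynet as dy

m = dy.ParameterCollection()
E = m.add_lookup_parameters((10, 8))   # toy vocab of 10, embedding dim 8
W = m.add_parameters((10, 8))

dy.renew_cg()
S = 0                                   # end-of-sentence / padding symbol
sent = [3, 7, 2]
hists = [S] + sent                      # previous word at each position
targets = sent + [S]                    # next word at each position
h = dy.lookup_batch(E, hists)           # ((8,), 4) batched embeddings
scores = dy.parameter(W) * h            # ((10,), 4) batched scores
loss = dy.sum_batches(dy.pickneglogsoftmax_batch(scores, targets))
print(loss.value())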
def BuildLMGraph(self, sents):
    dy.renew_cg()
    # initialize the RNN
    init_state = self.builder.initial_state()
    # parameters -> expressions
    R = dy.parameter(self.R)
    bias = dy.parameter(self.bias)

    S = vocab.w2i["<s>"]
    # get the cids and masks for each step
    tot_chars = 0
    cids = []
    masks = []

    for i in range(len(sents[0])):
        cids.append([(vocab.w2i[sent[i]] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_chars += sum(mask)

    # start the rnn with "<s>"
    init_ids = cids[0]
    s = init_state.add_input(dy.lookup_batch(self.lookup, init_ids))

    losses = []

    # feed char vectors into the RNN and predict the next char
    for cid, mask in zip(cids[1:], masks[1:]):
        score = dy.affine_transform([bias, R, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, cid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        cemb = dy.lookup_batch(self.lookup, cid)
        s = s.add_input(cemb)

    return dy.sum_batches(dy.esum(losses)), tot_chars
def calc_loss(self, mlp_dec_state, ref_action):
    scores = self.get_scores(mlp_dec_state)

    if self.label_smoothing == 0.0:
        # single mode
        if not xnmt.batcher.is_batched(ref_action):
            return dy.pickneglogsoftmax(scores, ref_action)
        # minibatch mode
        else:
            return dy.pickneglogsoftmax_batch(scores, ref_action)
    else:
        log_prob = dy.log_softmax(scores)
        if not xnmt.batcher.is_batched(ref_action):
            pre_loss = -dy.pick(log_prob, ref_action)
        else:
            pre_loss = -dy.pick_batch(log_prob, ref_action)

        ls_loss = -dy.mean_elems(log_prob)
        loss = ((1 - self.label_smoothing) * pre_loss) + (self.label_smoothing * ls_loss)
        return loss
def update_batch(self, words_batch, tags_batch):
    dynet.renew_cg()
    length = max(len(words) for words in words_batch)
    word_ids = np.zeros((length, len(words_batch)), dtype='int32')
    for j, words in enumerate(words_batch):
        for i, word in enumerate(words):
            word_ids[i, j] = self.vw.w2i.get(word, self.UNK)
    tag_ids = np.zeros((length, len(words_batch)), dtype='int32')
    for j, tags in enumerate(tags_batch):
        for i, tag in enumerate(tags):
            tag_ids[i, j] = self.vt.w2i.get(tag, self.UNK)

    wembs = [dynet.lookup_batch(self._E, word_ids[i]) for i in range(length)]
    wembs = [dynet.noise(we, 0.1) for we in wembs]

    f_state = self._fwd_lstm.initial_state()
    b_state = self._bwd_lstm.initial_state()

    fw = [x.output() for x in f_state.add_inputs(wembs)]
    bw = [x.output() for x in b_state.add_inputs(reversed(wembs))]

    H = dynet.parameter(self._pH)
    O = dynet.parameter(self._pO)

    errs = []
    for i, (f, b) in enumerate(zip(fw, reversed(bw))):
        f_b = dynet.concatenate([f, b])
        r_t = O * (dynet.tanh(H * f_b))
        err = dynet.pickneglogsoftmax_batch(r_t, tag_ids[i])
        errs.append(dynet.sum_batches(err))

    sum_errs = dynet.esum(errs)
    squared = -sum_errs  # * sum_errs
    losses = sum_errs.scalar_value()
    sum_errs.backward()
    self._sgd.update()
    return losses
def calc_lm_loss(sents):
    dy.renew_cg()
    # initialize the RNN
    f_init = RNN.initial_state()

    # get the wids and masks for each step
    tot_words = 0
    wids = []
    masks = []
    for i in range(len(sents[0])):
        wids.append([(sent[i] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_words += sum(mask)

    # start the rnn by inputting "<s>"
    init_ids = [S] * len(sents)
    s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP, init_ids))

    # feed word vectors into the RNN and predict the next word
    losses = []
    for wid, mask in zip(wids, masks):
        # calculate the softmax and loss
        score = dy.affine_transform([b_exp, W_exp, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, wid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        wemb = dy.lookup_batch(WORDS_LOOKUP, wid)
        s = s.add_input(wemb)

    return dy.sum_batches(dy.esum(losses)), tot_words
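Several of these examples share the same masking pattern: pad shorter sentences with the end-of-sentence symbol and multiply the per-sentence loss by a 0/1 mask so padded positions contribute nothing. A minimal self-contained sketch of that pattern alone (toy vocabulary and parameters, not this example's globals; the batch is assumed sorted by decreasing length):

import dynet as dy

m = dy.ParameterCollection()
E = m.add_lookup_parameters((10, 4))   # toy vocab of 10, dim 4
W = m.add_parameters((10, 4))
sents = [[1, 2, 3], [4, 5]]            # batch sorted by decreasing length
S = 0                                  # </s> id used for padding

dy.renew_cg()
losses, tot_words = [], 0
for i in range(max(len(s) for s in sents)):
    wid = [s[i] if len(s) > i else S for s in sents]
    mask = [1 if len(s) > i else 0 for s in sents]
    tot_words += sum(mask)
    h = dy.lookup_batch(E, wid)                       # stand-in for an RNN state
    loss = dy.pickneglogsoftmax_batch(dy.parameter(W) * h, wid)
    if mask[-1] != 1:                                 # some sentence already ended
        loss = loss * dy.reshape(dy.inputVector(mask), (1,), len(sents))
    losses.append(loss)
print(dy.sum_batches(dy.esum(losses)).value(), tot_words)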
def calc_loss(self, x: dy.Expression, y: Union[int, List[int]]) -> dy.Expression:
    scores = self.calc_scores(x)

    if self.label_smoothing == 0.0:
        # single mode
        if not batchers.is_batched(y):
            loss = dy.pickneglogsoftmax(scores, y)
        # minibatch mode
        else:
            loss = dy.pickneglogsoftmax_batch(scores, y)
    else:
        log_prob = dy.log_softmax(scores)
        if not batchers.is_batched(y):
            pre_loss = -dy.pick(log_prob, y)
        else:
            pre_loss = -dy.pick_batch(log_prob, y)

        ls_loss = -dy.mean_elems(log_prob)
        loss = ((1 - self.label_smoothing) * pre_loss) + (self.label_smoothing * ls_loss)

    return loss
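Both xnmt variants combine the losses as (1 - eps) * (-log p[y]) + eps * mean_k(-log p[k]). A standalone toy check of that combination (random scores; calc_scores and the class fields above are not used here):

import dynet as dy
import numpy as np

dy.renew_cg()
scores = dy.inputTensor(np.random.randn(5))
y, eps = 2, 0.1

log_prob = dy.log_softmax(scores)
smoothed = (1 - eps) * -dy.pick(log_prob, y) + eps * -dy.mean_elems(log_prob)
plain = dy.pickneglogsoftmax(scores, y)
# eps = 0 recovers the plain cross-entropy; eps > 0 also penalizes over-confident distributions.
print(plain.value(), smoothed.value())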
def run(self, word_inputs, lemma_inputs, tag_inputs, pred_golds, rel_targets=None, isTrain=True):
    # inputs, targets: seq_len x batch_size
    def dynet_flatten_numpy(ndarray):
        return np.reshape(ndarray, (-1,), 'F')

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    marker = self._vocab.PAD if self._unified else self._vocab.DUMMY
    mask = np.greater(word_inputs, marker).astype(np.float32)
    num_tokens = int(np.sum(mask))

    word_embs = [dy.lookup_batch(self.word_embs,
                                 np.where(w < self._vocab.words_in_train, w, self._vocab.UNK))
                 for w in word_inputs]
    pre_embs = [dy.lookup_batch(self.pret_word_embs, w) for w in word_inputs]
    flag_embs = [dy.lookup_batch(self.flag_embs, np.array(w == i + 1, dtype=np.int))
                 for i, w in enumerate(pred_golds)]
    lemma_embs = [dy.lookup_batch(self.lemma_embs, lemma) for lemma in lemma_inputs]
    tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

    if isTrain:
        emb_masks = self.generate_emb_mask(seq_len, batch_size)
        emb_inputs = [dy.concatenate([dy.cmult(word, wm), dy.cmult(pre, wm), dy.cmult(flag, wm),
                                      dy.cmult(lemma, wm), dy.cmult(pos, posm)])
                      for word, pre, flag, lemma, pos, (wm, posm)
                      in zip(word_embs, pre_embs, flag_embs, lemma_embs, tag_embs, emb_masks)]
    else:
        emb_inputs = [dy.concatenate([word, pre, flag, lemma, pos])
                      for word, pre, flag, lemma, pos
                      in zip(word_embs, pre_embs, flag_embs, lemma_embs, tag_embs)]

    top_recur = dy.concatenate_cols(
        biLSTM(self.LSTM_builders, emb_inputs, batch_size,
               self.dropout_lstm_input if isTrain else 0.,
               self.dropout_lstm_hidden if isTrain else 0.))
    if isTrain:
        top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

    W_arg, b_arg = dy.parameter(self.mlp_arg_W), dy.parameter(self.mlp_arg_b)
    W_pred, b_pred = dy.parameter(self.mlp_pred_W), dy.parameter(self.mlp_pred_b)
    arg_hidden = leaky_relu(dy.affine_transform([b_arg, W_arg, top_recur]))
    # pred_hidden = leaky_relu(dy.affine_transform([b_pred, W_pred, top_recur]))
    predicates_1D = pred_golds[0]
    pred_recur = dy.pick_batch(top_recur, predicates_1D, dim=1)
    pred_hidden = leaky_relu(dy.affine_transform([b_pred, W_pred, pred_recur]))
    if isTrain:
        arg_hidden = dy.dropout_dim(arg_hidden, 1, self.dropout_mlp)
        # pred_hidden = dy.dropout_dim(pred_hidden, 1, self.dropout_mlp)
        pred_hidden = dy.dropout(pred_hidden, self.dropout_mlp)

    W_rel = dy.parameter(self.rel_W)

    # rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, batch_size,
    #                       num_outputs = self._vocab.rel_size, bias_x = True, bias_y = True)
    # # (#pred x rel_size x #arg) x batch_size
    # flat_rel_logits = dy.reshape(rel_logits, (seq_len, self._vocab.rel_size), seq_len * batch_size)
    # # (#pred x rel_size) x (#arg x batch_size)
    # predicates_1D = dynet_flatten_numpy(pred_golds)
    # partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D)
    # # (rel_size) x (#arg x batch_size)

    rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, 1, batch_size,
                          num_outputs=self._vocab.rel_size, bias_x=True, bias_y=True)
    # (1 x rel_size x #arg) x batch_size
    flat_rel_logits = dy.reshape(rel_logits, (1, self._vocab.rel_size), seq_len * batch_size)
    # (1 x rel_size) x (#arg x batch_size)
    predicates_1D = np.zeros(dynet_flatten_numpy(pred_golds).shape[0])
    partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D)
    # (1 x rel_size) x (#arg x batch_size)

    if isTrain:
        mask_1D = dynet_flatten_numpy(mask)
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = dynet_flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds, targets_1D).astype(np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens
        return rel_accuracy, rel_loss

    # rel_probs = np.transpose(np.reshape(dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
    #                                     (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
    rel_probs = np.transpose(np.reshape(dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                                        (self._vocab.rel_size, 1, seq_len, batch_size), 'F'))
    outputs = []

    # for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T, rel_probs):
    #     msk[0] = 1.
    #     sent_len = int(np.sum(msk))
    #     rel_prob = rel_prob[np.arange(len(pred_gold)), pred_gold]
    #     rel_pred = rel_argmax(rel_prob)
    #     outputs.append(rel_pred[:sent_len])

    for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T, rel_probs):
        msk[0] = 1.
        sent_len = int(np.sum(msk))
        rel_prob = rel_prob[np.arange(len(pred_gold)), 0]
        rel_pred = rel_argmax(rel_prob)
        outputs.append(rel_pred[:sent_len])
    return outputs
def run(self, word_inputs, lengths, tag_inputs, arc_targets=None, rel_targets=None, isTrain=True):
    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    mask = (np.broadcast_to(np.reshape(np.arange(seq_len), (seq_len, 1)),
                            (seq_len, batch_size)) < lengths).astype(np.float32)
    mask[0] = 0.
    num_tokens = int(np.sum(mask))

    if isTrain or arc_targets is not None:
        mask_1D = self.dynet_flatten_numpy(mask)
        # batched here means that the last dim is treated as batch dimension, both in input and output
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

    # TODO: note _words_in_train
    if self.e_ext is not None:
        # sum of the two embeddings, [Expression of dim=((embedding_dim,), batch_size)] * seq_len
        word_embs = [
            dy.lookup_batch(
                self.e_form,
                np.where(w < self.v_train, w, self.vocab_form.stoi["<unk>"])) +
            dy.lookup_batch(self.e_ext, w, update=False) for w in word_inputs
        ]
    else:
        # sum of the two embeddings, [Expression] * seq_len
        word_embs = [
            dy.lookup_batch(
                self.e_form,
                np.where(w < self.v_train, w, self.vocab_form.stoi["<unk>"]))
            for w in word_inputs
        ]
    tag_embs = [dy.lookup_batch(self.e_tag, pos) for pos in tag_inputs]

    if isTrain:
        emb_masks = self.generate_emb_msk(seq_len, batch_size)
        emb_inputs = [
            dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
            for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
        ]
    else:
        emb_inputs = [
            dy.concatenate([w, pos]) for w, pos in zip(word_embs, tag_embs)
        ]

    top_recur = dy.concatenate_cols(
        biLSTM(self.lstm_builders, emb_inputs, batch_size,
               self.dropout_lstm_input if isTrain else 0.,
               self.dropout_lstm_hidden if isTrain else 0.))
    if isTrain:
        # drop some dims of the lstm output for all words, all sentences
        top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

    dep = leaky_relu(dy.affine_transform([self.mlp_dep_b, self.mlp_dep_W, top_recur]))
    head = leaky_relu(dy.affine_transform([self.mlp_head_b, self.mlp_head_W, top_recur]))
    if isTrain:
        dep, head = dy.dropout_dim(dep, 1, self.dropout_mlp), dy.dropout_dim(head, 1, self.dropout_mlp)
        # dropping dim k means it is possible that the whole dim k is set to zeros;
        # for a matrix with batch, ((R, C), B):
        # drop dim 0 means drop some cols, drop dim 1 means drop some rows,
        # drop dim 2 means drop some batches, and it only supports tensors with rank <= 3

    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

    arc_logits = bilinear(dep_arc, self.arc_W, head_arc, self.mlp_arc_size,
                          seq_len, batch_size, num_outputs=1, bias_x=True, bias_y=False)
    # (#head x #dep) x batch_size
    flat_arc_logits = dy.reshape(arc_logits, (seq_len, ), seq_len * batch_size)
    # flatten it to compute loss
    # (#head ) x (#dep x batch_size)
    arc_preds = np.reshape(arc_logits.npvalue().argmax(0), (seq_len, batch_size))
    # seq_len x batch_size
    # here if an Expression's batch size is 1,
    # npvalue() will drop the batch dimension, so add it back if needed

    if isTrain or arc_targets is not None:
        # train it in a neg log likelihood fashion, but enforce the tree constraint when testing
        arc_correct = np.equal(arc_preds, arc_targets).astype(np.float32) * mask
        # mask is used to filter <pad>'s out in summing loss
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = self.dynet_flatten_numpy(arc_targets)
        losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
        arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        arc_probs = np.transpose(
            np.reshape(
                dy.softmax(flat_arc_logits).npvalue(),
                (seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head; transpose reverses all dims, and since the layout has changed, it's totally fine

    rel_logits = bilinear(dep_rel, self.rel_W, head_rel, self.mlp_rel_size,
                          seq_len, batch_size, num_outputs=len(self.vocab_deprel),
                          bias_x=True, bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = dy.reshape(rel_logits, (seq_len, len(self.vocab_deprel)),
                                 seq_len * batch_size)
    # (#head x rel_size) x (#dep x batch_size)
    partial_rel_logits = dy.pick_batch(
        flat_rel_logits,
        targets_1D if isTrain else self.dynet_flatten_numpy(arc_preds))
    # (rel_size) x (#dep x batch_size)

    if isTrain or arc_targets is not None:
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = self.dynet_flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds, targets_1D).astype(np.float32) * mask_1D
        # the shapes here call for the flattened mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        rel_probs = np.transpose(
            np.reshape(
                dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                (len(self.vocab_deprel), seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

    if isTrain or arc_targets is not None:
        loss = arc_loss + rel_loss
        correct = rel_correct * self.dynet_flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens

    if isTrain:
        return arc_accuracy, rel_accuracy, overall_accuracy, loss

    outputs = []
    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs):
        # parse sentences one by one
        msk[0] = 1.
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(
            rel_prob, sent_len, self.vocab_deprel,
            "root" if "root" in self.vocab_deprel.stoi else "ROOT")
        outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))  # w_0 is <root>

    assert (len(outputs) == batch_size)
    if arc_targets is not None:
        return arc_accuracy, rel_accuracy, overall_accuracy, outputs
    return outputs
def loss(self, input_, y):
    if self.batched:
        return dy.pickneglogsoftmax_batch(input_, y)
    return dy.pickneglogsoftmax(input_, y)
def calculate_batch_loss(self, batch):
    dy.renew_cg()

    W_y = dy.parameter(self.params["W_y"])
    b_y = dy.parameter(self.params["b_y"])
    s_lookup = self.params["s_lookup"]
    t_lookup = self.params["t_lookup"]

    s_batch = [x[0] for x in batch]
    t_batch = [x[1] for x in batch]

    wids = []
    for i in range(len(s_batch[0])):
        wids.append([sent[i] for sent in s_batch])
    wids_rev = list(reversed(wids))

    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []

    for wid in wids:
        l2r_state = l2r_state.add_input(dy.lookup_batch(s_lookup, wid))
        l2r_contexts.append(l2r_state.output())
    for wid in wids_rev:
        r2l_state = r2l_state.add_input(dy.lookup_batch(s_lookup, wid))
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    losses = []
    H_f = []
    H_f = [dy.concatenate(list(p)) for p in zip(l2r_contexts, r2l_contexts)]
    H_f_mat = dy.concatenate_cols(H_f)
    W1_att = dy.parameter(self.params["W1_att"])
    w1dt = W1_att * H_f_mat

    t_wids = []
    masks = []
    num_words = 0
    for i in range(len(t_batch[0])):
        t_wids.append([(sent[i] if len(sent) > i else self.t_vocab[EOS]) for sent in t_batch])
        mask = [(1 if len(sent) > i else 0) for sent in t_batch]
        masks.append(mask)
        num_words += sum(mask)

    c_t = dy.vecInput(2 * self.HIDDEN_DIM)
    words = [self.t_vocab[EOS]] * len(t_batch)
    embedding = dy.lookup_batch(t_lookup, words)
    dec_state = self.dec_builder.initial_state()

    for t_wid, mask in zip(t_wids, masks):
        x_t = dy.concatenate([c_t, embedding])
        dec_state = dec_state.add_input(x_t)
        c_t = self.attend(H_f_mat, dec_state, w1dt, len(s_batch[0]), len(wids[0]))

        probs = dy.affine_transform([b_y, W_y, dy.concatenate([c_t, dec_state.output()])])
        loss = dy.pickneglogsoftmax_batch(probs, t_wid)

        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(t_batch))
            loss = loss * mask_expr
        losses.append(loss)
        embedding = dy.lookup_batch(t_lookup, t_wid)

    loss = dy.sum_batches(dy.esum(losses))  # /len(wids[0])
    return loss, num_words
def static_train(self,
                 train_treebank,
                 validation_treebank,
                 lr=0.001,
                 hidden_dropout=0.01,
                 batch_size=64,
                 max_epochs=200,
                 max_lexicon_size=9998,
                 glove_file=None):
    """
    Locally trains a model with a static oracle and a multi-task standard feedforward NN.

    @param train_treebank      : a list of dependency trees
    @param validation_treebank : a list of dependency trees
    @param lr                  : learning rate
    @param hidden_dropout      : dropout on hidden layer
    @param batch_size          : size of mini batches
    @param max_epochs          : max number of epochs
    @param max_lexicon_size    : max number of entries in the lexicon
    @param glove_file          : file where to find pre-trained word embeddings
    """
    print("Encoding dataset from %d trees." % len(train_treebank))

    # (1) build dictionaries
    self.code_symbols(train_treebank, lexicon_size=max_lexicon_size)

    # (2) encode data sets
    lex_train_gen, struct_train_gen = self.make_data_generators(train_treebank, batch_size)
    lex_dev_gen, struct_dev_gen = self.make_data_generators(validation_treebank, batch_size)

    print(self, flush=True)
    print("epochs %d\nstructural training examples  [N] = %d\nlexical training examples  [N] = %d\nBatch size = %d\nDropout = %f\nlearning rate = %f"
          % (max_epochs, struct_train_gen.N, lex_train_gen.N, batch_size, hidden_dropout, lr), flush=True)

    # (3) make network
    self.model = dy.ParameterCollection()
    self.hidden_weights = self.model.add_parameters((self.hidden_size, self.embedding_size * self.input_length))
    self.action_weights = self.model.add_parameters((self.actions_size, self.hidden_size))
    if glove_file is None:
        self.input_embeddings = self.model.add_parameters((self.lexicon_size, self.embedding_size))
    else:
        self.input_embeddings = self.model.parameters_from_numpy(self.read_glove_embeddings(glove_file))
    if not self.tied:
        self.output_embeddings = self.model.add_parameters((self.lexicon_size, self.hidden_size))

    # (4) fitting
    lex_gen = lex_train_gen.next_batch()
    struct_gen = struct_train_gen.next_batch()
    max_batches = max(lex_train_gen.get_num_batches(), struct_train_gen.get_num_batches())
    print(lex_train_gen.get_num_batches(), struct_train_gen.get_num_batches(), flush=True)

    lex_valid_gen = lex_dev_gen.next_batch()
    struct_valid_gen = struct_dev_gen.next_batch()

    min_nll = float('inf')
    trainer = dy.AdamTrainer(self.model, alpha=lr)
    history_log = []
    for e in range(max_epochs):
        struct_loss, lex_loss = 0, 0
        struct_N, lex_N = 0, 0
        start_t = time.time()
        for b in range(max_batches):
            # struct
            X_struct, Y_struct = next(struct_gen)
            # question of proportions: should struct and lex be evenly sampled or not (??):
            # here the parity oversamples approx twice the lexical actions
            dy.renew_cg()
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.input_embeddings)
            A = dy.parameter(self.action_weights)
            batched_X = zip(*X_struct)  # transposes the X matrix
            lookups = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
            xdense = dy.concatenate(lookups)
            ybatch_preds = dy.pickneglogsoftmax_batch(A * dy.dropout(dy.tanh(W * xdense), hidden_dropout), Y_struct)
            loss = dy.sum_batches(ybatch_preds)
            struct_N += len(Y_struct)
            struct_loss += loss.value()
            loss.backward()
            trainer.update()

            # lex
            X_lex, Y_lex = next(lex_gen)
            if self.tied:
                dy.renew_cg()
                W = dy.parameter(self.hidden_weights)
                E = dy.parameter(self.input_embeddings)
                batched_X = zip(*X_lex)  # transposes the X matrix
                lookups = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
                xdense = dy.concatenate(lookups)
                ybatch_preds = dy.pickneglogsoftmax_batch(E * dy.dropout(dy.tanh(W * xdense), hidden_dropout), Y_lex)
                loss = dy.sum_batches(ybatch_preds)
            else:
                dy.renew_cg()
                W = dy.parameter(self.hidden_weights)
                E = dy.parameter(self.input_embeddings)
                O = dy.parameter(self.output_embeddings)
                batched_X = zip(*X_lex)  # transposes the X matrix
                lookups = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
                xdense = dy.concatenate(lookups)
                ybatch_preds = dy.pickneglogsoftmax_batch(O * dy.dropout(dy.tanh(W * xdense), hidden_dropout), Y_lex)
                loss = dy.sum_batches(ybatch_preds)
            lex_N += len(Y_lex)
            lex_loss += loss.value()
            loss.backward()
            trainer.update()
        end_t = time.time()

        # (5) validation
        X_lex_valid, Y_lex_valid = lex_dev_gen.batch_all()
        lex_valid_nll = -sum(self.predict_logprobs(X_lex_valid, Y_lex_valid, structural=False))

        X_struct_valid, Y_struct_valid = struct_dev_gen.batch_all()
        struct_valid_nll = -sum(self.predict_logprobs(X_struct_valid, Y_struct_valid, structural=True))

        history_log.append((e, end_t - start_t,
                            exp(lex_loss / lex_N),
                            exp(struct_loss / struct_N),
                            exp(lex_valid_nll / lex_dev_gen.N),
                            exp(struct_valid_nll / struct_dev_gen.N),
                            exp((lex_valid_nll + struct_valid_nll) / (struct_dev_gen.N + lex_dev_gen.N))))
        print('Epoch %d (%.2f sec.) TRAIN:: PPL_lex = %f, PPL_struct = %f / VALID:: PPL_lex = %f, PPL_struct = %f, PPL_all = %f'
              % tuple(history_log[-1]), flush=True)

        if lex_valid_nll + struct_valid_nll < min_nll:
            df = pd.DataFrame(history_log,
                              columns=['epoch', 'wall_time', 'ppl_lex_train', 'ppl_struct_train',
                                       'ppl_lex_valid', 'ppl_struct_valid', 'ppl_all_valid'])
            self.save_model('best_model_dump', epoch=e, learning_curve=df)

    return pd.DataFrame(history_log,
                        columns=['epoch', 'wall_time', 'ppl_lex_train', 'ppl_struct_train',
                                 'ppl_lex_valid', 'ppl_struct_valid', 'ppl_all_valid'])
def train(args, network, train_batches, dev_batches, log=None):
    """Estimate model parameters on `train_batches` with early stopping on `dev_batches`"""
    # Logger
    log = log or util.Logger(verbose=args.verbose, flush=True)
    # Optimizer
    trainer = dy.AdamTrainer(network.pc, alpha=args.lr)
    # Start training
    log("Starting training")
    best_accuracy = 0
    deadline = 0
    running_nll = n_processed = 0
    report_every = ceil(len(train_batches) / 10)
    for epoch in range(1, args.n_epochs + 1):
        # Time the epoch
        start_time = time.time()
        for batch, y in train_batches:
            # Renew the computation graph
            dy.renew_cg()
            # Initialize layers
            network.init(test=False, update=True)
            # Compute logits
            logits = network(batch)
            # Loss function
            nll = dy.mean_batches(dy.pickneglogsoftmax_batch(logits, y))
            # Backward pass
            nll.backward()
            # Update the parameters
            trainer.update()
            # Keep track of the nll
            running_nll += nll.value() * batch.batch_size
            n_processed += batch.batch_size
            # Print the current loss from time to time
            if train_batches.just_passed_multiple(report_every):
                avg_nll = running_nll / n_processed
                log(f"Epoch {epoch}@{train_batches.percentage_done():.0f}%: "
                    f"NLL={avg_nll:.3f}")
                running_nll = n_processed = 0
        # End of epoch logging
        avg_nll = running_nll / n_processed
        log(f"Epoch {epoch}@100%: NLL={avg_nll:.3f}")
        log(f"Took {time.time()-start_time:.1f}s")
        log("=" * 20)
        # Validate
        accuracy = evaluate(args, network, dev_batches)
        # Print final result
        log(f"Dev accuracy: {accuracy*100:.2f}%")
        # Early stopping
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            dynn.io.save(network.pc, args.model_file)
            deadline = 0
        else:
            if deadline < args.patience:
                dynn.io.populate(network.pc, args.model_file)
                trainer.learning_rate *= args.lr_decay
                deadline += 1
            else:
                log("Early stopping with best accuracy "
                    f"{best_accuracy*100:.2f}%")
                break
    # Load best model
    dynn.io.populate(network.pc, args.model_file)
    return best_accuracy
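The core training step above is independent of the dynn helpers. A minimal, self-contained sketch of the same step with a toy linear model (assumed toy data; `network`, `dynn`, and the batch iterator from the example are not used here):

import dynet as dy
import numpy as np

m = dy.ParameterCollection()
W = m.add_parameters((3, 4))            # 3 classes, 4 features
trainer = dy.AdamTrainer(m, alpha=0.001)

X = np.random.randn(4, 8)               # one batch of 8 examples, column-wise
y = np.random.randint(0, 3, size=8)

for step in range(5):
    dy.renew_cg()
    x = dy.inputTensor(X, batched=True)              # ((4,), 8)
    logits = dy.parameter(W) * x                     # ((3,), 8)
    nll = dy.mean_batches(dy.pickneglogsoftmax_batch(logits, y))
    loss_val = nll.value()
    nll.backward()
    trainer.update()
    print(step, loss_val)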
def run(self, word_inputs, tag_inputs, arc_targets=None, rel_targets=None, isTrain=True):
    # inputs, targets: seq_len x batch_size
    def dynet_flatten_numpy(ndarray):
        return np.reshape(ndarray, (-1, ), 'F')

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
    num_tokens = int(np.sum(mask))

    if isTrain or arc_targets is not None:
        mask_1D = dynet_flatten_numpy(mask)
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

    word_embs = [
        dy.lookup_batch(self.word_embs,
                        np.where(w < self._vocab.words_in_train, w, self._vocab.UNK),
                        update=True)
        #+ dy.lookup_batch(self.pret_word_embs, w, update = False)  # remove 1 line
        for w in word_inputs
    ]
    tag_embs = [
        dy.lookup_batch(self.tag_embs, pos, update=True) for pos in tag_inputs
    ]

    if isTrain:
        emb_masks = self.generate_emb_mask(seq_len, batch_size)
        emb_inputs = [
            dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
            for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
        ]
    else:
        emb_inputs = [
            dy.concatenate([w, pos]) for w, pos in zip(word_embs, tag_embs)
        ]

    top_recur = dy.concatenate_cols(
        biLSTM(self.LSTM_builders, emb_inputs, batch_size,
               self.dropout_lstm_input if isTrain else 0.,
               self.dropout_lstm_hidden if isTrain else 0.))
    if isTrain:
        top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

    W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(self.mlp_dep_b)
    W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(self.mlp_head_b)
    dep = leaky_relu(dy.affine_transform([b_dep, W_dep, top_recur]))
    head = leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
    if isTrain:
        dep, head = dy.dropout_dim(dep, 1, self.dropout_mlp), dy.dropout_dim(head, 1, self.dropout_mlp)

    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

    W_arc = dy.parameter(self.arc_W)
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size, seq_len,
                          batch_size, num_outputs=1, bias_x=True, bias_y=False)
    # (#head x #dep) x batch_size
    flat_arc_logits = dy.reshape(arc_logits, (seq_len, ), seq_len * batch_size)
    # (#head ) x (#dep x batch_size)
    arc_preds = arc_logits.npvalue().argmax(0)
    # seq_len x batch_size

    if isTrain or arc_targets is not None:
        arc_correct = np.equal(arc_preds, arc_targets).astype(np.float32) * mask
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = dynet_flatten_numpy(arc_targets)
        losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
        arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        arc_probs = np.transpose(
            np.reshape(
                dy.softmax(flat_arc_logits).npvalue(),
                (seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head

    W_rel = dy.parameter(self.rel_W)
    #dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
    #head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size, seq_len,
                          batch_size, num_outputs=self._vocab.rel_size,
                          bias_x=True, bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = dy.reshape(rel_logits, (seq_len, self._vocab.rel_size),
                                 seq_len * batch_size)
    # (#head x rel_size) x (#dep x batch_size)
    partial_rel_logits = dy.pick_batch(
        flat_rel_logits,
        targets_1D if isTrain else dynet_flatten_numpy(arc_preds))
    # (rel_size) x (#dep x batch_size)

    if isTrain or arc_targets is not None:
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = dynet_flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds, targets_1D).astype(np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        rel_probs = np.transpose(
            np.reshape(
                dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

    if isTrain or arc_targets is not None:
        loss = arc_loss + rel_loss
        correct = rel_correct * dynet_flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens

    if isTrain:
        return arc_accuracy, rel_accuracy, overall_accuracy, loss

    outputs = []
    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs):
        # parse sentences one by one
        msk[0] = 1.
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(rel_prob, sent_len)
        outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

    if arc_targets is not None:
        return arc_accuracy, rel_accuracy, overall_accuracy, outputs
    return outputs
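Several of these parser examples normalize the loss the same way: flatten the gold heads column-major, weight each column's loss by a 0/1 mask, and divide by the number of real tokens. A standalone toy sketch of just that step (random logits and targets; none of the parser's real tensors are used):

import dynet as dy
import numpy as np

seq_len, batch_size = 5, 3
dy.renew_cg()
flat_arc_logits = dy.inputTensor(np.random.randn(seq_len, seq_len * batch_size), batched=True)
# ((seq_len,), seq_len * batch_size), the shape produced by dy.reshape above
targets_1D = np.random.randint(0, seq_len, size=seq_len * batch_size)
mask = np.ones((seq_len, batch_size), dtype=np.float32)
mask[0] = 0.                                   # never score the <root> position
mask_1D = np.reshape(mask, (-1,), 'F')
mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
arc_loss = dy.sum_batches(losses * mask_1D_tensor) / int(np.sum(mask))
print(arc_loss.value())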
def run(self, word_inputs, lemma_inputs, tag_inputs, pred_golds, rel_targets=None, isTrain=True, syn_mask=None, seq_lens=None): # inputs, targets: seq_len x batch_size def dynet_flatten_numpy(ndarray): return np.reshape(ndarray, (-1, ), 'F') batch_size = word_inputs.shape[1] seq_len = word_inputs.shape[0] mask = np.greater(word_inputs, self._vocab.PAD).astype(np.float32) num_tokens = int(np.sum(mask)) word_embs = [ dy.lookup_batch( self.word_embs, np.where(w < self._vocab.words_in_train, w, self._vocab.UNK)) for w in word_inputs ] if self.use_lm: lm_embs = np.zeros((batch_size, seq_len, self.lm_dims), dtype=float) for idx in range(batch_size): if self._unified: txt = [ self._vocab.id2word(w) for w in word_inputs[1:, idx] if self._vocab.id2word(w) != '<PAD>' ] key = ' '.join(txt) key = self.lm_dict.get(key, None) if key is None: for sidx in range(len(self.lm_sentences)): line = self.lm_sentences[sidx] if len(line) != len(txt): continue found = True for mdx in range(len(line)): if line[mdx] != txt[mdx] and txt[ mdx] != '<UNK>': found = False break if found: key = str(sidx) self.lm_dict[' '.join(txt)] = key break assert key is not None lm_embs[idx, 1:1 + len(txt), :] = self.lm_data[key][...] else: txt = [ self._vocab.id2word(w) for w in word_inputs[:, idx] if self._vocab.id2word(w) != '<PAD>' ] key = ' '.join(txt) key = self.lm_dict.get(key, None) if key is None: for sidx in range(len(self.lm_sentences)): line = self.lm_sentences[sidx] if len(line) != len(txt): continue found = True for mdx in range(len(line)): if line[mdx] != txt[mdx] and txt[ mdx] != '<UNK>': found = False break if found: key = str(sidx) self.lm_dict[' '.join(txt)] = key break assert key is not None lm_embs[idx, :len(txt), :] = self.lm_data[key][...] lm_embs = lm_embs.transpose(1, 2, 0) lm_embs = [dy.inputTensor(e, batched=True) for e in list(lm_embs)] pre_embs = [ dy.lookup_batch(self.pret_word_embs, w) for w in word_inputs ] flag_embs = [ dy.lookup_batch(self.flag_embs, np.array(w == i + 1, dtype=np.int)) for i, w in enumerate(pred_golds) ] if self.use_lemma: lemma_embs = [ dy.lookup_batch(self.lemma_embs, lemma) for lemma in lemma_inputs ] if self.use_pos: tag_embs = [ dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs ] if self.use_lm: if isTrain: emb_masks = self.generate_emb_mask(seq_len, batch_size) if self.use_lemma and self.use_pos: emb_inputs = [ dy.concatenate([ dy.cmult(word, wm), dy.cmult(pre, wm), dy.cmult(flag, wm), dy.cmult(lemma, wm), dy.cmult(lme, wm), dy.cmult(pos, posm) ]) for word, pre, flag, lemma, pos, lme, (wm, posm) in zip(word_embs, pre_embs, flag_embs, lemma_embs, tag_embs, lm_embs, emb_masks) ] elif self.use_lemma: emb_inputs = [ dy.concatenate([ dy.cmult(word, wm), dy.cmult(pre, wm), dy.cmult(flag, wm), dy.cmult(lemma, wm), dy.cmult(lme, wm) ]) for word, pre, flag, lemma, pos, lme, ( wm, posm) in zip(word_embs, pre_embs, flag_embs, lemma_embs, lm_embs, emb_masks) ] elif self.use_pos: emb_inputs = [ dy.concatenate([ dy.cmult(word, wm), dy.cmult(pre, wm), dy.cmult(flag, wm), dy.cmult(lme, wm), dy.cmult(pos, posm) ]) for word, pre, flag, pos, lme, ( wm, posm) in zip(word_embs, pre_embs, flag_embs, tag_embs, lm_embs, emb_masks) ] else: emb_inputs = [ dy.concatenate([ dy.cmult(word, wm), dy.cmult(pre, wm), dy.cmult(flag, wm), dy.cmult(lme, wm) ]) for word, pre, flag, lme, (wm, posm) in zip( word_embs, pre_embs, flag_embs, lm_embs, emb_masks) ] else: if self.use_lemma and self.use_pos: emb_inputs = [ dy.concatenate([word, pre, flag, lemma, lme, pos]) for word, pre, flag, lemma, lme, pos in zip( 
word_embs, pre_embs, flag_embs, lemma_embs, lm_embs, tag_embs) ] elif self.use_lemma: emb_inputs = [ dy.concatenate([word, pre, flag, lme, pos]) for word, pre, flag, lemma, lme, pos in zip( word_embs, pre_embs, flag_embs, lm_embs, tag_embs) ] elif self.use_pos: emb_inputs = [ dy.concatenate([word, pre, flag, lemma, lme]) for word, pre, flag, lemma, lme in zip( word_embs, pre_embs, flag_embs, lemma_embs, lm_embs) ] else: emb_inputs = [ dy.concatenate([word, pre, flag, lme]) for word, pre, flag, lme in zip( word_embs, pre_embs, flag_embs, lm_embs) ] else: if isTrain: emb_masks = self.generate_emb_mask(seq_len, batch_size) if self.use_lemma and self.use_pos: emb_inputs = [ dy.concatenate([ dy.cmult(word, wm), dy.cmult(pre, wm), dy.cmult(flag, wm), dy.cmult(lemma, wm), dy.cmult(pos, posm) ]) for word, pre, flag, lemma, pos, ( wm, posm) in zip(word_embs, pre_embs, flag_embs, lemma_embs, tag_embs, emb_masks) ] elif self.use_lemma: emb_inputs = [ dy.concatenate([ dy.cmult(word, wm), dy.cmult(pre, wm), dy.cmult(flag, wm), dy.cmult(lemma, wm) ]) for word, pre, flag, lemma, ( wm, posm) in zip(word_embs, pre_embs, flag_embs, lemma_embs, emb_masks) ] elif self.use_pos: emb_inputs = [ dy.concatenate([ dy.cmult(word, wm), dy.cmult(pre, wm), dy.cmult(flag, wm), dy.cmult(pos, posm) ]) for word, pre, flag, pos, ( wm, posm) in zip(word_embs, pre_embs, flag_embs, tag_embs, emb_masks) ] else: emb_inputs = [ dy.concatenate([ dy.cmult(word, wm), dy.cmult(pre, wm), dy.cmult(flag, wm) ]) for word, pre, flag, (wm, posm) in zip( word_embs, pre_embs, flag_embs, emb_masks) ] else: if self.use_lemma and self.use_pos: emb_inputs = [ dy.concatenate([word, pre, flag, lemma, pos]) for word, pre, flag, lemma, pos in zip( word_embs, pre_embs, flag_embs, lemma_embs, tag_embs) ] elif self.use_lemma: emb_inputs = [ dy.concatenate([word, pre, flag, lemma]) for word, pre, flag, lemma in zip( word_embs, pre_embs, flag_embs, lemma_embs) ] elif self.use_pos: emb_inputs = [ dy.concatenate([word, pre, flag, pos]) for word, pre, flag, pos in zip( word_embs, pre_embs, flag_embs, tag_embs) ] else: emb_inputs = [ dy.concatenate([word, pre, flag]) for word, pre, flag in zip( word_embs, pre_embs, flag_embs) ] if self.encoder_type == 'rnn': top_recur = dy.concatenate_cols( biLSTM(self.LSTM_builders, emb_inputs, batch_size, self.dropout_lstm_input if isTrain else 0., self.dropout_lstm_hidden if isTrain else 0.)) else: emb_inputs = dy.concatenate_cols(emb_inputs) emb_inputs = emb_inputs * math.sqrt(self.input_dims) emb_inputs = emb_inputs + dy.transpose( dy.inputTensor(self.pe[:seq_len])) emb_inputs = dy.transpose(emb_inputs) encoder_outputs = self.transformer(emb_inputs, src_len=seq_lens, train=isTrain) top_recur = encoder_outputs.output top_recur = dy.concatenate_cols(top_recur) #print(top_recur.dim()) if isTrain: top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp) W_arg, b_arg = self.mlp_arg_W.expr(), self.mlp_arg_b.expr( ) #dy.parameter(self.mlp_arg_W), dy.parameter(self.mlp_arg_b) W_pred, b_pred = dy.parameter(self.mlp_pred_W), dy.parameter( self.mlp_pred_b) arg_hidden = leaky_relu(dy.affine_transform([b_arg, W_arg, top_recur])) # pred_hidden = leaky_relu(dy.affine_transform([b_pred, W_pred, top_recur])) predicates_1D = pred_golds[0] pred_recur = dy.pick_batch(top_recur, predicates_1D, dim=1) pred_hidden = leaky_relu( dy.affine_transform([b_pred, W_pred, pred_recur])) if isTrain: arg_hidden = dy.dropout_dim(arg_hidden, 1, self.dropout_mlp) # pred_hidden = dy.dropout_dim(pred_hidden, 1, self.dropout_mlp) pred_hidden = 
dy.dropout(pred_hidden, self.dropout_mlp) W_rel = dy.parameter(self.rel_W) # rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, batch_size, # num_outputs = self._vocab.rel_size, bias_x = True, bias_y = True) # # (#pred x rel_size x #arg) x batch_size # flat_rel_logits = dy.reshape(rel_logits, (seq_len, self._vocab.rel_size), seq_len * batch_size) # # (#pred x rel_size) x (#arg x batch_size) # predicates_1D = dynet_flatten_numpy(pred_golds) # partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D) # # (rel_size) x (#arg x batch_size) if self.use_si_droput and syn_mask is not None: syn_mask = np.expand_dims(syn_mask, axis=0) # (1, seq_len, batch_size) arg_hidden = dy.cmult(arg_hidden, dy.inputTensor(syn_mask, batched=True)) rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, 1, batch_size, num_outputs=self._vocab.rel_size, bias_x=True, bias_y=True) # if self.use_biaffine: # rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, 1, batch_size, # num_outputs = self._vocab.rel_size, bias_x = True, bias_y = True) # else: # pred_hidden = dy.reshape(pred_hidden, (self.mlp_size, 1), batch_size) # preds_hidden = [pred_hidden for _ in xrange(seq_len)] # preds_hidden = dy.concatenate(preds_hidden, d=1) # rel_hidden = dy.concatenate([preds_hidden, arg_hidden], d=0) # (2*mlp_size x seq_len) x batch_size # flat_rel_hidden = dy.reshape(rel_hidden, (self.mlp_size*2, ), seq_len * batch_size) # W_ffn_layer1 = dy.parameter(self.ffn_layer1_W) # b_ffn_layer1 = dy.parameter(self.ffn_layer1_b) # W_ffn_layer2 = dy.parameter(self.ffn_layer2_W) # b_ffn_layer2 = dy.parameter(self.ffn_layer2_b) # flat_rel_hidden = leaky_relu(dy.affine_transform([b_ffn_layer1, W_ffn_layer1, flat_rel_hidden])) # flat_rel_hidden = leaky_relu(dy.affine_transform([b_ffn_layer2, W_ffn_layer2, flat_rel_hidden])) # flat_rel_hidden = W_rel * flat_rel_hidden # rel_logits = dy.reshape(flat_rel_hidden, (1, self._vocab.rel_size, seq_len), batch_size) # (1 x rel_size x #arg) x batch_size flat_rel_logits = dy.reshape(rel_logits, (1, self._vocab.rel_size), seq_len * batch_size) # (1 x rel_size) x (#arg x batch_size) predicates_1D = np.zeros(dynet_flatten_numpy(pred_golds).shape[0]) partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D) # (1 x rel_size) x (#arg x batch_size) if isTrain: mask_1D = dynet_flatten_numpy(mask) mask_1D_tensor = dy.inputTensor(mask_1D, batched=True) rel_preds = partial_rel_logits.npvalue().argmax(0) targets_1D = dynet_flatten_numpy(rel_targets) rel_correct = np.equal(rel_preds, targets_1D).astype( np.float32) * mask_1D rel_accuracy = np.sum(rel_correct) / num_tokens losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D) rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens return rel_accuracy, rel_loss # rel_probs = np.transpose(np.reshape(dy.softmax(dy.transpose(flat_rel_logits)).npvalue(), # (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F')) rel_probs = np.transpose( np.reshape( dy.softmax(dy.transpose(flat_rel_logits)).npvalue(), (self._vocab.rel_size, 1, seq_len, batch_size), 'F')) outputs = [] # for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T, rel_probs): # msk[0] = 1. # sent_len = int(np.sum(msk)) # rel_prob = rel_prob[np.arange(len(pred_gold)), pred_gold] # rel_pred = rel_argmax(rel_prob) # outputs.append(rel_pred[:sent_len]) for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T, rel_probs): msk[0] = 1. 
sent_len = int(np.sum(msk)) rel_prob = rel_prob[np.arange(len(pred_gold)), 0] rel_pred = rel_argmax(rel_prob) outputs.append(rel_pred[:sent_len]) return outputs
def __call__(self, inputs, masks, truth, is_train=True, is_tree=True):
    sent_len = len(inputs)
    batch_size = inputs[0].dim()[1]
    flat_len = sent_len * batch_size

    # H -> hidden size, L -> sentence length, B -> batch size
    # ((H, L), B)
    X = dy.concatenate_cols(inputs)
    if is_train:
        X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)
    # M_H -> MLP hidden size
    # ((M_H, L), B)
    # head_mat = leaky_relu(self.head_MLP(X, is_train))
    head_mat = self.head_MLP(X, is_train)
    # ((M_H, L), B)
    dept_mat = self.dept_MLP(X, is_train)
    if is_train:
        total_token = sum(masks['flat'].tolist())
        head_mat = dy.dropout_dim(head_mat, 1, self.cfg.MLP_DROP)
        dept_mat = dy.dropout_dim(dept_mat, 1, self.cfg.MLP_DROP)

    # A_H -> Arc hidden size, R_H -> Label hidden size, A_H + R_H = M_H
    head_arc = head_mat[:self.arc_size]   # ((A_H, L), B)
    dept_arc = dept_mat[:self.arc_size]   # ((A_H, L), B)
    head_rel = head_mat[self.arc_size:]   # ((R_H, L), B)
    dept_rel = dept_mat[self.arc_size:]   # ((R_H, L), B)

    # ((L, L), B)
    masks_2D = dy.inputTensor(masks['2D'], True)
    # (1, L*B)
    masks_flat = dy.inputTensor(masks['flat'], True)

    gnn_losses = []
    for k in range(self.cfg.GRAPH_LAYERS):
        # Graph Weights
        # ((L, L), B)
        arc_mat = self.arc_attn_mat[k](head_arc, dept_arc) - 1e9 * (1 - masks_2D)
        arc_prob = dy.softmax(arc_mat)
        # Layer-wise Loss
        if is_train:
            # ((L,), L*B)
            arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
            # ((1,), L*B)
            arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token
            gnn_losses.append(arc_loss)

        # Aggregation Function
        # Fusion head and dept representation
        # ((A_H, L), B)
        HX = head_arc * arc_prob
        DX = dept_arc * dy.transpose(arc_prob)
        FX = HX + DX

        # Async Update Function
        # Head-first
        # ((A_H, L), B)
        head_arc = self.head_gnn(FX, head_arc)
        FX_new = head_arc * arc_prob + DX
        dept_arc = self.dept_gnn(FX_new, dept_arc)

    # ((L, L), B)
    arc_mat = self.arc_attn_mat[-1](head_arc, dept_arc) - 1e9 * (1 - masks_2D)
    # ((L,), L*B)
    arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)

    # Predict Relation
    # (R_H, L*B)
    head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
    # ((R_H,), L*B)
    dept_rel = dy.reshape(dept_rel, (self.rel_size, ), flat_len)

    if is_train:
        # ((1,), L*B)
        arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
        # (1,)
        arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token
        # ((R_H,), L*B)
        truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1)
        # R -> Relation Set Size
        # ((R,), L*B)
        rel_mat = self.rel_attn(dept_rel, truth_rel)
    else:
        if is_tree:
            # MST Inference, Achieve Tree Edge.
            arc_probs = dy.softmax(arc_mat).npvalue()
            arc_probs = np.reshape(arc_probs, (sent_len, sent_len, batch_size), 'F')
            arc_probs = np.transpose(arc_probs)
            # Mask PAD
            arc_masks = [
                np.array(masks['flat'][i:i + sent_len])
                for i in range(0, flat_len, sent_len)
            ]
            arc_pred = []
            # Inference One By One.
            for msk, arc_prob in zip(arc_masks, arc_probs):
                msk[0] = 1
                seq_len = int(np.sum(msk))
                tmp_pred = MST_inference(arc_prob, seq_len, msk)
                tmp_pred[0] = 0
                arc_pred.extend(tmp_pred)
        else:
            # Greedy Inference (argmax)
            arc_pred = np.argmax(arc_mat.npvalue(), 0)
        # Pick Predicted Edge's <Head, Dept> pair.
        flat_pred = [
            j + (i // sent_len) * sent_len for i, j in enumerate(arc_pred)
        ]
        pred_rel = dy.pick_batch(head_rel, flat_pred, 1)
        # Predict Relation (mask ROOT)
        rel_mat = self.rel_attn(dept_rel, pred_rel)
        rel_mask = dy.inputTensor(self.rel_mask)
        rel_mat = rel_mat - 1e9 * rel_mask

    if is_train:
        # Calculate Relation Classification Loss
        # ((1,), L*B)
        rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel'])
        # (1,)
        rel_loss = dy.sum_batches(rel_losses * masks_flat) / total_token

        # Final Total Loss with Layer-wise
        losses = (rel_loss + arc_loss) * self.cfg.LAMBDA2
        if gnn_losses:
            losses += dy.esum(gnn_losses) * self.cfg.LAMBDA1
        losses_list = gnn_losses + [arc_loss, rel_loss]
        return losses, losses_list
    else:
        rel_mat = dy.reshape(rel_mat, (self.rel_num, )).npvalue()
        rel_pred = np.argmax(rel_mat, 0)
        pred = {}
        pred['head'], pred['rel'] = arc_pred, rel_pred
        return pred
def __call__(self, inputs, masks, truth, iters, is_train=True, is_tree=True): if type(inputs) == list: sent_len = len(inputs) batch_size = inputs[0].dim()[1] X = dy.concatenate_cols(inputs) else: sent_len = inputs.dim()[0][0] batch_size = inputs.dim()[1] X = dy.transpose(inputs, [1, 0]) flat_len = sent_len * batch_size #sent_len = len(inputs) #batch_size = inputs[0].dim()[1] #flat_len = sent_len * batch_size # H -> hidden size, L -> sentence length, B -> batch size # ((H, L), B) #X = dy.concatenate_cols(inputs) if is_train: X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP) # A_H -> ARC MLP hidden size, R_H -> REL MLP hidden size # ((A_H, L), B) head_arc = self.head_arc_MLP(X, is_train) dept_arc = self.dept_arc_MLP(X, is_train) # ((R_H, L), B) head_rel = self.head_rel_MLP(X, is_train) dept_rel = self.dept_rel_MLP(X, is_train) if is_train: total_token = sum(masks['flat'].tolist()) head_arc = dy.dropout_dim(head_arc, 1, self.cfg.MLP_DROP) head_rel = dy.dropout_dim(head_rel, 1, self.cfg.MLP_DROP) dept_arc = dy.dropout_dim(dept_arc, 1, self.cfg.MLP_DROP) dept_rel = dy.dropout_dim(dept_rel, 1, self.cfg.MLP_DROP) # ((L, L), B) masks_2D = 1e9*(1-dy.inputTensor(masks['2D'], True)) # (1, L*B) masks_flat = dy.inputTensor(masks['flat'], True) gnn_losses = [] arc_norm = math.sqrt(self.arc_size) rel_norm = math.sqrt(self.rel_size) for k in range(self.cfg.GRAPH_LAYERS): # Graph Weights # ((L, L), B) arc_mat = self.arc_attn_mat[k](head_arc, dept_arc)/arc_norm-masks_2D arc_prob = dy.softmax(arc_mat) # Layer-wise Loss if is_train: arc_prob = dy.dropout(arc_prob, self.cfg.ARC_DROP) # ((L,), L*B) arc_mat = dy.reshape(arc_mat, (sent_len,), flat_len) # ((1,), L*B) arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head']) # (1,) arc_loss = dy.sum_batches(arc_loss*masks_flat)/total_token gnn_losses.append(arc_loss) # Aggregation Function # Fusion head and dept representation # ((A_H, L), B) HX = head_arc * arc_prob DX = dept_arc * dy.transpose(arc_prob) FX = HX + DX # Async Update Function # Head-first # ((A_H, L), B) head_arc = self.head_gnn(FX, head_arc) FX_new = head_arc * arc_prob + DX dept_arc = self.dept_gnn(FX_new, dept_arc) # Relation Aggregation Function # Sync update # ((R_H, L), B) HR = head_rel * arc_prob DR = dept_rel * dy.transpose(arc_prob) FX = HR+DR head_rel = self.head_rel_gnn(FX, head_rel) + head_rel dept_rel = self.dept_rel_gnn(FX, dept_rel) + dept_rel # ((L, L), B) arc_mat = self.arc_attn_mat[-1](head_arc, dept_arc)/arc_norm-masks_2D is_tree_computed_val = is_tree_computed(arc_mat.npvalue()) print(is_tree_computed_val) exit(0) # ((L,), L*B) arc_mat = dy.reshape(arc_mat, (sent_len,), flat_len) # Predict Relation # (R_H, L*B) head_rel = dy.reshape(head_rel, (self.rel_size, flat_len)) # ((R_H,), L*B) dept_rel = dy.reshape(dept_rel, (self.rel_size,), flat_len) if is_train: # print(arc_mat.dim()) # ((3,), 300) # arc_pred = np.argmax(arc_mat.npvalue(), 0) # print(arc_pred.shape) # (300,) # print(arc_pred) # all 0's and 1's # ((1,), L*B) arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head']) # (1,) arc_loss = dy.sum_batches(arc_losses*masks_flat)/total_token # ((R_H,), L*B) truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1) # R -> Relation Set Size # ((R,), L*B) rel_mask = 1e9*dy.inputTensor(self.rel_mask) rel_mat = self.rel_attn(dept_rel, truth_rel)/rel_norm - rel_mask # Calculate Relation Classification Loss # ((1,), L*B) rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel']) # (1,) rel_loss = dy.sum_batches(rel_losses*masks_flat) / total_token # Final Total Loss with 
Layer-wise warm = [int(iters>=x) for x in self.warm_list] losses = rel_loss*self.cfg.LAMBDA2*warm[-1]+arc_loss*self.cfg.LAMBDA2*warm[-1] if gnn_losses: for i in range(self.cfg.GRAPH_LAYERS): gnn_losses[i] *= warm[i] losses += dy.esum(gnn_losses)*self.cfg.LAMBDA1 losses_list = gnn_losses + [arc_loss, rel_loss] return losses, losses_list else: if is_tree: # MST Inference, Achieve Tree Edge. arc_probs = dy.softmax(arc_mat).npvalue() arc_probs = np.reshape(arc_probs, (sent_len, sent_len, batch_size), 'F') arc_probs = np.transpose(arc_probs) # Mask PAD arc_masks = [np.array(masks['flat'][i:i+sent_len]) for i in range(0, flat_len, sent_len)] arc_pred = [] # Inference One By One. for msk, arc_prob in zip(arc_masks, arc_probs): msk[0] = 1 seq_len = int(np.sum(msk)) tmp_pred = MST_inference(arc_prob, seq_len, msk) tmp_pred[0] = 0 arc_pred.extend(tmp_pred) else: # Greedy Inference (argmax) arc_pred = np.argmax(arc_mat.npvalue(), 0) # Pick Predicted Edge's <Head, Dept> pair. flat_pred = [j+(i//sent_len)*sent_len for i, j in enumerate(arc_pred)] pred_rel = dy.pick_batch(head_rel, flat_pred, 1) # Predict Relation (mask ROOT) rel_mask = 1e9*dy.inputTensor(self.rel_mask) rel_mat = self.rel_attn(dept_rel, pred_rel)/rel_norm-rel_mask rel_mat = dy.reshape(rel_mat, (self.rel_num,)).npvalue() rel_pred = np.argmax(rel_mat, 0) pred = {} pred['head'], pred['rel'] = arc_pred, rel_pred return pred
def loss(self, input_, y):
    if self.batched:
        return dy.pickneglogsoftmax_batch(input_, y)
    return dy.pickneglogsoftmax(input_, y)
# regular lookup
a = lp[1].npvalue()
b = lp[2].npvalue()
c = lp[3].npvalue()

# batch lookup instead of single elements.
# two ways of doing this.
abc1 = dy.lookup_batch(lp, [1, 2, 3])
print(abc1.npvalue())

abc2 = lp.batch([1, 2, 3])
print(abc2.npvalue())

print(np.hstack([a, b, c]))


# use pick and pickneglogsoftmax in batch mode
# (must be used in conjunction with lookup_batch):
print("\nPick")

W = dy.parameter(m.add_parameters((5, 10)))
h = W * lp.batch([1, 2, 3])
print(h.npvalue())
print(dy.pick_batch(h, [1, 2, 3]).npvalue())
print(dy.pick(W * lp[1], 1).value(), dy.pick(W * lp[2], 2).value(), dy.pick(W * lp[3], 3).value())

# using pickneglogsoftmax_batch
print("\nPick neg log softmax")
print((-dy.log(dy.softmax(h))).npvalue())
print(dy.pickneglogsoftmax_batch(h, [1, 2, 3]).npvalue())
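A possible continuation of the toy snippet above (it assumes the same `m`, `lp`, and `h`): before calling backward() the batched loss has to be reduced to a scalar, which is what dy.sum_batches does in the other examples.

loss = dy.sum_batches(dy.pickneglogsoftmax_batch(h, [1, 2, 3]))
print(loss.value())        # a single scalar: the summed negative log-likelihood
trainer = dy.SimpleSGDTrainer(m)
loss.backward()
trainer.update()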
def run_parser(self, word_inputs, common_top_recur, private_top_recur, arc_targets=None, rel_targets=None, isTrain=True):
    # inputs, targets: seq_len x batch_size
    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
    num_tokens = int(np.sum(mask))
    top_recur = dy.concatenate([common_top_recur, private_top_recur])

    if isTrain or arc_targets is not None:
        mask_1D = self.dynet_flatten_numpy(mask)
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

    W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(self.mlp_dep_b)
    W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(self.mlp_head_b)
    dep = leaky_relu(dy.affine_transform([b_dep, W_dep, top_recur]))
    head = leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
    if isTrain:
        dep = dy.dropout_dim(dep, 1, self.dropout_mlp)
        head = dy.dropout_dim(head, 1, self.dropout_mlp)

    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

    W_arc = dy.parameter(self.arc_W)
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size, seq_len,
                          batch_size, num_outputs=1, bias_x=True, bias_y=False)
    # (#head x #dep) x batch_size
    flat_arc_logits = dy.reshape(arc_logits, (seq_len, ), seq_len * batch_size)
    # (#head ) x (#dep x batch_size)
    arc_preds = arc_logits.npvalue().argmax(0)
    # seq_len x batch_size

    if isTrain or arc_targets is not None:
        arc_correct = np.equal(arc_preds, arc_targets).astype(np.float32) * mask
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = self.dynet_flatten_numpy(arc_targets)
        losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
        arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        arc_probs = np.transpose(
            np.reshape(
                dy.softmax(flat_arc_logits).npvalue(),
                (seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head

    W_rel = dy.parameter(self.rel_W)
    # dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
    # head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size, seq_len,
                          batch_size, num_outputs=self._vocab.rel_size,
                          bias_x=True, bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = dy.reshape(rel_logits, (seq_len, self._vocab.rel_size),
                                 seq_len * batch_size)
    # (#head x rel_size) x (#dep x batch_size)
    partial_rel_logits = dy.pick_batch(
        flat_rel_logits,
        targets_1D if isTrain else self.dynet_flatten_numpy(arc_preds))
    # (rel_size) x (#dep x batch_size)

    if isTrain or arc_targets is not None:
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = self.dynet_flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds, targets_1D).astype(np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        rel_probs = np.transpose(
            np.reshape(
                dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

    if isTrain or arc_targets is not None:
        loss = arc_loss + rel_loss
        correct = rel_correct * self.dynet_flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens

    if isTrain:
        return arc_accuracy, rel_accuracy, overall_accuracy, loss

    outputs = []
    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs):
        # parse sentences one by one
        msk[0] = 1.
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(rel_prob, sent_len)
        outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

    if arc_targets is not None:
        return arc_accuracy, rel_accuracy, overall_accuracy, outputs
    return outputs
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    # get the outputs of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()])
                   for x, y in LSTM_SRC.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])]
    src_output = src_outputs[-1]

    # gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    # now decode
    all_losses = []

    # Decoder
    # need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the current state into the decoder
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        att_output, _ = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = dy.pickneglogsoftmax_batch(s, next_words)

        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words

    return dy.sum_batches(dy.esum(all_losses)), num_words
def run(self, char_vocab, cased_word_inputs, word_inputs, tag_inputs, arc_targets=None, rel_targets=None, is_train=True): """ Train or test :param char_vocab: :param cased_word_inputs: seq_len x batch_size :param word_inputs: seq_len x batch_size :param tag_inputs: seq_len x batch_size :param arc_targets: seq_len x batch_size :param rel_targets: seq_len x batch_size :param is_train: is training or test :return: """ def flatten_numpy(ndarray): """ Flatten nd-array to 1-d column vector :param ndarray: :return: """ return np.reshape(ndarray, (-1, ), 'F') batch_size = word_inputs.shape[1] seq_len = word_inputs.shape[0] mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32) num_tokens = int(np.sum(mask)) # non padding, non root token number if is_train or arc_targets is not None: mask_1D = flatten_numpy(mask) mask_1D_tensor = dy.inputTensor(mask_1D, batched=True) # if batched=True, the last dimension is used as a batch dimension if arr is a list of numpy ndarrays if self.char_lstm: # Subword model char_w = dy.parameter(self.char_w) def LSTM_attention(lstm, inputs, dropout_x=0., dropout_h=0.): ss = LSTM(lstm, inputs, None, dropout_x, dropout_h) hs = [s.h()[0] for s in ss] return dy.concatenate([attention(hs, char_w), ss[-1].s()[0]]) subword_embs = [] for char_ids in char_vocab: char_inputs = [ dy.lookup(self.char_embs, char) for char in char_ids ] subword_embs.append( LSTM_attention( self.char_lstm, char_inputs, self.dropout_lstm_input if is_train else 0., self.dropout_lstm_hidden if is_train else 0.)) subword_embs = dy.concatenate_cols(subword_embs) word_embs = [ dy.lookup_batch( self.word_embs, np.where(w < self._vocab.words_in_train, w, self._vocab.UNK)) + subword_embs * dy.inputTensor(one_hot(cw, len(char_vocab)).T, batched=True) + 0 if self.pret_word_embs is None else dy.lookup_batch( self.pret_word_embs, w, update=False) for cw, w in zip(cased_word_inputs, word_inputs) ] else: word_embs = [ dy.lookup_batch( self.word_embs, np.where(w < self._vocab.words_in_train, w, self._vocab.UNK)) + 0 if self.pret_word_embs is None else dy.lookup_batch( self.pret_word_embs, w, update=False) for w in word_inputs ] tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs] # Dropout if is_train: emb_masks = self.generate_emb_mask(seq_len, batch_size) emb_inputs = [ dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)]) for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks) ] else: emb_inputs = [ dy.concatenate([w, pos]) for w, pos in zip(word_embs, tag_embs) ] # seq_len x batch_size top_recur = dy.concatenate_cols( biLSTM(self.LSTM_builders, emb_inputs, batch_size, self.dropout_lstm_input if is_train else 0., self.dropout_lstm_hidden if is_train else 0.)) if is_train: top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp) W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter( self.mlp_dep_b) W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter( self.mlp_head_b) dep, head = leaky_relu(dy.affine_transform([ b_dep, W_dep, top_recur ])), leaky_relu(dy.affine_transform([b_head, W_head, top_recur])) if is_train: dep, head = dy.dropout_dim(dep, 1, self.dropout_mlp), dy.dropout_dim( head, 1, self.dropout_mlp) dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:] head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:] W_arc = dy.parameter(self.arc_W) arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size, seq_len, batch_size, num_outputs=1, bias_x=True, bias_y=False) # (#head x #dep) x batch_size flat_arc_logits = 
dy.reshape(arc_logits, (seq_len, ), seq_len * batch_size) # (#head ) x (#dep x batch_size) arc_preds = arc_logits.npvalue().argmax(0) if len(arc_preds.shape) == 1: # dynet did unnecessary jobs arc_preds = np.expand_dims(arc_preds, axis=1) # seq_len x batch_size if is_train or arc_targets is not None: arc_correct = np.equal(arc_preds, arc_targets).astype( np.float32) * mask arc_accuracy = np.sum(arc_correct) / num_tokens targets_1D = flatten_numpy(arc_targets) losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D) arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens if not is_train: arc_probs = np.transpose( np.reshape( dy.softmax(flat_arc_logits).npvalue(), (seq_len, seq_len, batch_size), 'F')) # #batch_size x #dep x #head W_rel = dy.parameter(self.rel_W) # dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))]) # head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))]) rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size, seq_len, batch_size, num_outputs=self._vocab.rel_size, bias_x=True, bias_y=True) # (#head x rel_size x #dep) x batch_size flat_rel_logits = dy.reshape(rel_logits, (seq_len, self._vocab.rel_size), seq_len * batch_size) # (#head x rel_size) x (#dep x batch_size) partial_rel_logits = dy.pick_batch( flat_rel_logits, targets_1D if is_train else flatten_numpy(arc_preds)) # (rel_size) x (#dep x batch_size) if is_train or arc_targets is not None: rel_preds = partial_rel_logits.npvalue().argmax(0) targets_1D = flatten_numpy(rel_targets) rel_correct = np.equal(rel_preds, targets_1D).astype( np.float32) * mask_1D rel_accuracy = np.sum(rel_correct) / num_tokens losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D) rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens if not is_train: rel_probs = np.transpose( np.reshape( dy.softmax(dy.transpose(flat_rel_logits)).npvalue(), (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F')) # batch_size x #dep x #head x #nclasses if is_train or arc_targets is not None: loss = arc_loss + rel_loss correct = rel_correct * flatten_numpy(arc_correct) overall_accuracy = np.sum(correct) / num_tokens if is_train: return arc_accuracy * 100., rel_accuracy * 100., overall_accuracy * 100., loss outputs = [] for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs): # parse sentences one by one msk[0] = 1. sent_len = int(np.sum(msk)) arc_pred = arc_argmax(arc_prob, sent_len, msk) rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred] rel_pred = rel_argmax(rel_prob, sent_len) outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len])) if arc_targets is not None: return arc_accuracy * 100., rel_accuracy * 100., overall_accuracy * 100., outputs return outputs
def create_network_return_loss(self, inputs, expected_output, dropout=False):
    out = self(inputs, dropout)
    loss = dy.pickneglogsoftmax_batch(out, expected_output)
    # loss = -dy.log(dy.pick(out, expected_output))
    return loss
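The commented-out line above spells out what the fused call computes. A tiny sketch with made-up scores showing that pickneglogsoftmax_batch is the batched form of -log(softmax(out))[gold]:

import dynet as dy
import numpy as np

dy.renew_cg()
# 3 classes, batch of 4 (toy values)
scores = dy.inputTensor(np.random.randn(3, 4).astype(np.float32), batched=True)
gold = [0, 2, 1, 2]

fused = dy.pickneglogsoftmax_batch(scores, gold)           # one loss per batch element
manual = -dy.log(dy.pick_batch(dy.softmax(scores), gold))  # the same, spelled out

print(dy.sum_batches(fused).value(), dy.sum_batches(manual).value())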
def batch_predict_next_best_action(self,config_batched,prev_action_batched,sentence_batch): """ Predicts greedily the next transition for a batch of configs, actions leading to that config,and related sentences @param config_batched: a list of configurations @param prev_action_batched: a list of actions (or None if no prev actions) @param sentence_batch: a list of sentences @return a list of new configurations, a list of actions generating these new configs """ B = len(config_batched) idxes = list(range(B)) new_configs = [None] * B new_actions = [None] * B if prev_action_batched is None: prev_action_batched = [None]*B #(1) sort out the lexical and structural batches def is_lexical(config): S,F,B,A,prefix_score = config return F is None and len(B) > 0 lexical_idxes = [idx for idx in idxes if is_lexical(config_batched[idx])] structural_idxes = [idx for idx in idxes if not is_lexical(config_batched[idx])] #(2) lexical predictions if len(lexical_idxes) > 0: def make_ref_lex_action(config,sentence): S,F,B,A,prefix_score = config return (ArcEagerGenerativeParser.GENERATE,sentence[B[0]]) X = [] Y = [] for idx in lexical_idxes: x,y = self.make_representation(config_batched[idx],make_ref_lex_action(config_batched[idx],sentence_batch[idx]),sentence_batch[idx],structural=False) X.append(x) Y.append(y) Xt = zip(*X) #transpose if self.tied: dy.renew_cg() W = dy.parameter(self.hidden_weights) E = dy.parameter(self.input_embeddings) embeddings = [dy.pick_batch(E, xcol) for xcol in Xt] xdense = dy.concatenate(embeddings) preds = dy.pickneglogsoftmax_batch(E * dy.tanh( W * xdense ),Y).npvalue()[0] else: dy.renew_cg() W = dy.parameter(self.hidden_weights) E = dy.parameter(self.input_embeddings) O = dy.parameter(self.output_embeddings) embeddings = [dy.pick_batch(E, xcol) for xcol in Xt] xdense = dy.concatenate(embeddings) preds = dy.pickneglogsoftmax_batch(O * dy.tanh( W * xdense ),Y).npvalue()[0] preds = np.atleast_1d(preds) for pred_score,idx in zip(preds,lexical_idxes): new_configs[idx] = self.generate(config_batched[idx],local_score= -pred_score)# execs the actions new_actions[idx] = (ArcEagerGenerativeParser.GENERATE,sentence_batch[idx][config_batched[idx][2][0]]) #(3) structural predictions if len(structural_idxes) > 0 : action_masks = np.array([self.mask_actions(config_batched[idx],prev_action_batched[idx],len(sentence_batch[idx])) for idx in structural_idxes]) X = [self.make_representation(config_batched[idx],None,sentence_batch[idx],structural=True) for idx in structural_idxes] Xt = zip(*X) #transpose dy.renew_cg() W = dy.parameter(self.hidden_weights) E = dy.parameter(self.input_embeddings) A = dy.parameter(self.action_weights) embeddings = [dy.pick_batch(E, xcol) for xcol in Xt] xdense = dy.concatenate(embeddings) preds = dy.softmax(A * dy.tanh( W * xdense )).npvalue().transpose() max_idxes = np.argmax(preds * action_masks,axis=1) max_scores = np.log(preds[np.arange(preds.shape[0]),max_idxes]) for argmax_idx,max_score,idx in zip(max_idxes,max_scores,structural_idxes): new_configs[idx] = self.actions[argmax_idx](config_batched[idx],local_score=max_score) #execs the actions new_actions[idx] = self.rev_action_codes[argmax_idx] return (new_configs, new_actions)
def compute_decoder_batch_loss(self, encoded_inputs, input_masks, output_word_ids, output_masks, batch_size): self.readout = dn.parameter(self.params['readout']) self.bias = dn.parameter(self.params['bias']) self.w_c = dn.parameter(self.params['w_c']) self.u_a = dn.parameter(self.params['u_a']) self.v_a = dn.parameter(self.params['v_a']) self.w_a = dn.parameter(self.params['w_a']) # initialize the decoder rnn s_0 = self.decoder_rnn.initial_state() # initial "input feeding" vectors to feed decoder - 3*h init_input_feeding = dn.lookup_batch(self.init_lookup, [0] * batch_size) # initial feedback embeddings for the decoder, use begin seq symbol embedding init_feedback = dn.lookup_batch(self.output_lookup, [self.y2int[common.BEGIN_SEQ]] * batch_size) # init decoder rnn decoder_init = dn.concatenate([init_feedback, init_input_feeding]) s = s_0.add_input(decoder_init) # loss per timestep losses = [] # run the decoder through the output sequences and aggregate loss for i, step_word_ids in enumerate(output_word_ids): # returns h x batch size matrix decoder_rnn_output = s.output() # compute attention context vector for each sequence in the batch (returns 2h x batch size matrix) attention_output_vector, alphas = self.attend(encoded_inputs, decoder_rnn_output, input_masks) # compute output scores (returns vocab_size x batch size matrix) # h = readout * attention_output_vector + bias h = dn.affine_transform([self.bias, self.readout, attention_output_vector]) # encourage diversity by punishing highly confident predictions # TODO: support batching - esp. w.r.t. scalar inputs if self.diverse: soft = dn.softmax(dn.tanh(h)) batch_loss = dn.pick_batch(-dn.log(soft), step_word_ids) \ - dn.log(dn.scalarInput(1) - dn.pick_batch(soft, step_word_ids)) - dn.log(dn.scalarInput(4)) else: # get batch loss for this timestep batch_loss = dn.pickneglogsoftmax_batch(h, step_word_ids) # mask the loss if at least one sentence is shorter if output_masks and output_masks[i][-1] != 1: mask_expr = dn.inputVector(output_masks[i]) # noinspection PyArgumentList mask_expr = dn.reshape(mask_expr, (1,), batch_size) batch_loss = batch_loss * mask_expr # input feeding approach - input h (attention_output_vector) to the decoder # prepare for the next iteration - "feedback" feedback_embeddings = dn.lookup_batch(self.output_lookup, step_word_ids) decoder_input = dn.concatenate([feedback_embeddings, attention_output_vector]) s = s.add_input(decoder_input) losses.append(batch_loss) # sum the loss over the time steps and batch total_batch_loss = dn.sum_batches(dn.esum(losses)) return total_batch_loss
def run(self, word_inputs, tag_inputs, arc_targets=None, rel_targets=None, isTrain=True):
    # inputs, targets: seq_len x batch_size
    def dynet_flatten_numpy(ndarray):
        return np.reshape(ndarray, (-1, ), 'F')

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
    num_tokens = int(np.sum(mask))

    if isTrain or arc_targets is not None:
        mask_1D = dynet_flatten_numpy(mask)
        # batched here means that the last dim is treated as batch dimension, both in input and output
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

    # TODO: note the handling of _words_in_train
    # sum of the two embeddings: [Expression of dim=((embedding_dim,), batch_size)] * seq_len
    if self.pre_train_emb:
        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w, self._vocab.UNK))
            + dy.lookup_batch(self.pret_word_embs, w, update=False)
            for w in word_inputs
        ]  # sum of the two embeddings: [Expression] * seq_len
    else:
        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w, self._vocab.UNK))
            for w in word_inputs
        ]
    tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

    if isTrain:
        emb_masks = self.generate_emb_mask(seq_len, batch_size)
        emb_inputs = [
            dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
            for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
        ]
    else:
        emb_inputs = [
            dy.concatenate([w, pos]) for w, pos in zip(word_embs, tag_embs)
        ]

    top_recur = dy.concatenate_cols(
        biLSTM(self.LSTM_builders, emb_inputs, batch_size,
               self.dropout_lstm_input if isTrain else 0.,
               self.dropout_lstm_hidden if isTrain else 0.))
    if isTrain:
        top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

    W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(self.mlp_dep_b)
    W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(self.mlp_head_b)
    dep = leaky_relu(dy.affine_transform([b_dep, W_dep, top_recur]))
    head = leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
    if isTrain:
        # dropping along dim 1 can zero out whole rows (dim=0 drops columns,
        # dim=1 drops rows; the third dimension is the batch)
        dep = dy.dropout_dim(dep, 1, self.dropout_mlp)
        head = dy.dropout_dim(head, 1, self.dropout_mlp)

    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

    W_arc = dy.parameter(self.arc_W)
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size, seq_len,
                          batch_size, num_outputs=1, bias_x=True, bias_y=False)
    # (#head x #dep) x batch_size
    # flattened this way so the loss can be computed per dependent
    flat_arc_logits = dy.reshape(arc_logits, (seq_len, ), seq_len * batch_size)
    # (#head) x (#dep x batch_size)

    arc_preds = arc_logits.npvalue().argmax(0)
    # seq_len x batch_size

    if isTrain or arc_targets is not None:
        # the highest-scoring arc is compared against the gold one here; it is not
        # necessarily kept as the decoding result, but its loss still has to be reduced
        arc_correct = np.equal(arc_preds, arc_targets).astype(np.float32) * mask  # the mask is still needed here
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = dynet_flatten_numpy(arc_targets)
        losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
        arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        arc_probs = np.transpose(
            np.reshape(
                dy.softmax(flat_arc_logits).npvalue(),
                (seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head

    W_rel = dy.parameter(self.rel_W)
    # dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    # head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size, seq_len,
                          batch_size, num_outputs=self._vocab.rel_size, bias_x=True, bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = dy.reshape(rel_logits, (seq_len, self._vocab.rel_size), seq_len * batch_size)
    # (#head x rel_size) x (#dep x batch_size)
    partial_rel_logits = dy.pick_batch(
        flat_rel_logits, targets_1D if isTrain else dynet_flatten_numpy(arc_preds))
    # (rel_size) x (#dep x batch_size)

    if isTrain or arc_targets is not None:
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = dynet_flatten_numpy(rel_targets)
        # given the shape here, the 1-D mask is the one to use
        rel_correct = np.equal(rel_preds, targets_1D).astype(np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        rel_probs = np.transpose(
            np.reshape(
                dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

    if isTrain or arc_targets is not None:
        loss = arc_loss + rel_loss
        correct = rel_correct * dynet_flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens

    if isTrain:
        return arc_accuracy, rel_accuracy, overall_accuracy, loss

    outputs = []
    # parse sentences one by one; this decoding step cannot be batched
    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs):
        msk[0] = 1.
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(rel_prob, sent_len)
        # index 0 really is the ROOT token, so it is stripped from the output
        outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

    if arc_targets is not None:
        return arc_accuracy, rel_accuracy, overall_accuracy, outputs
    return outputs
def decode_sentences(dec_lstm, vectors, outputs): # Takes in [l*(2hE+e)*n] as input and returns [l*2hD*n] as output w = dy.parameter(decoder_w) b = dy.parameter(decoder_b) w1 = dy.parameter(attention_w1) w1_array = [] # Concatenate the columns of the BiLSTM encodings bidirectional_vectors = [] for (i,v) in enumerate(vectors): bidirectional_vectors.append(dy.concatenate_cols(v)) w1_array.append(w1) # Repeat w1 and make it a tensor w1_repeated = [[w1]] * len(vectors) if debug_dimensions: print "In Decoder" print " The dimensions of w1: ", get_tensor_size(w1.value()) print " The dimensions of w1 array: " , get_tensor_size(w1_array) print " The dimensions of v are: ", get_tensor_size(v) print " The dimensions of the bidirectional encodings is ", get_tensor_size(bidirectional_vectors) print " The dimensions of w1_repeated: ", get_tensor_size(w1_repeated) print " The dimensions of the first dimension of the w1 is : ", get_matrix_size(w1_repeated[2]) dots = char2int[EOS] dots_batch = [[dots]]*len(bidirectional_vectors) last_output_embeddings = dy.lookup(output_lookup, dots) last_output_embeddings_batch = [[last_output_embeddings]]*len(outputs[0]) concatenated_stuff = [[dy.concatenate([dy.vecInput(STATE_SIZE*2), last_output_embeddings])]] * len(outputs[0]) dec_state_array = dec_lstm.initial_state().add_inputs((k[0] for k in concatenated_stuff)) if debug_dimensions: print " The dimensions of last output embeddings batch: ", get_tensor_size(last_output_embeddings_batch) print " The dimensions of the concatenated stuff: ", get_tensor_size(concatenated_stuff) loss = 0 decodings_transposed = [] masks = [] for i in range(len(outputs[0])): decodings_transposed.append([(sent[i] if len(sent)> i else output_lookup[0]) for sent in outputs]) mask = [(1 if len(sent)>i else 0) for sent in outputs] masks.append(mask) if debug: print "Transposed Decodings: ", decodings_transposed # Get w1dt w1dt_array = [] for (w,b) in zip(w1_array, bidirectional_vectors): w1dt_array.append(w*b) batch_loss = [] for (y_batch, mask) in zip(decodings_transposed, masks): attention_output = attend(bidirectional_vectors, dec_state_array, w1dt_array) if debug_dimensions: print "Back in Decoder" print " The dimensions of the concatenated stuff: ", get_tensor_size(concatenated_stuff) print " The dimensions of the attention output: ", get_tensor_size(attention_output) #print " The dimensions of Dec state : ", get_tensor_size(dec_state) print dec_state_array vector_array = [] for (a,b) in zip(attention_output, concatenated_stuff): vector = dy.concatenate([a,b[0]]) vector_array.append(vector) for (dec_state, vector) in zip(dec_state_array, vector_array): dec_state.add_input(vector) out_vectors_array = [] for dec_state in dec_state_array: out_vectors = w * dec_state.output() out_vectors_array.append(out_vectors) print out_vectors_array loss = dy.pickneglogsoftmax_batch(out_vectors, y_batch) batch_loss.append(loss) return dy.esum(batch_loss)
def train_nn_lm(self,\ train_sentences,\ validation_sentences,\ lr=0.001,\ hidden_dropout=0.1,\ batch_size=100,\ max_epochs=100,\ glove_file=None): """ Locally trains a model with a static oracle and a standard feedforward NN. @param train_sentences : a list of sentences @param validation_sentences : a list of sentences @return learning curves for various metrics as a pandas dataframe """ #(1) build dictionaries self.code_symbols(train_sentences) print("Dictionaries built.") #(2) read off treebank and builds data set print("Encoding dataset from %d sentences." % len(train_sentences)) training_generator = self.make_data_generator(train_sentences, batch_size) validation_generator = self.make_data_generator( validation_sentences, batch_size) print(self, flush=True) print( "max_epochs = %d\ntraining examples [N] = %d\nBatch size = %d\nDropout = %f\nlearning rate = %f" % (max_epochs, training_generator.N, batch_size, hidden_dropout, lr), flush=True) #(3) Model structure self.model = dy.ParameterCollection() self.hidden_weights = self.model.add_parameters( (self.hidden_size, self.embedding_size * self.input_length)) if glove_file is None: self.embedding_matrix = self.model.add_parameters( (self.lexicon_size, self.embedding_size)) else: self.embedding_matrix = self.model.parameters_from_numpy( self.read_glove_embeddings(glove_file)) if not self.tied: self.output_weights = self.model.add_parameters( (self.lexicon_size, self.hidden_size)) #fitting xgen = training_generator.next_batch() trainer = dy.AdamTrainer(self.model, alpha=lr) min_nll = float('inf') history_log = [] for e in range(max_epochs): L = 0 N = 0 start_t = time.time() for b in range(training_generator.get_num_batches()): X, Y = next(xgen) if self.tied: dy.renew_cg() W = dy.parameter(self.hidden_weights) E = dy.parameter(self.embedding_matrix) batched_X = zip(*X) #transposes the X matrix lookups = [ dy.pick_batch(E, xcolumn) for xcolumn in batched_X ] xdense = dy.concatenate(lookups) ybatch_preds = dy.pickneglogsoftmax_batch( E * dy.dropout(dy.tanh(W * xdense), hidden_dropout), Y) loss = dy.sum_batches(ybatch_preds) else: dy.renew_cg() O = dy.parameter(self.output_weights) W = dy.parameter(self.hidden_weights) E = dy.parameter(self.embedding_matrix) batched_X = zip(*X) #transposes the X matrix lookups = [ dy.pick_batch(E, xcolumn) for xcolumn in batched_X ] xdense = dy.concatenate(lookups) ybatch_preds = dy.pickneglogsoftmax_batch( O * dy.dropout(dy.tanh(W * xdense), hidden_dropout), Y) loss = dy.sum_batches(ybatch_preds) N += len(Y) L += loss.value() loss.backward() trainer.update() end_t = time.time() #validation and auto-saving Xvalid, Yvalid = validation_generator.batch_all() valid_nll = -sum(self.predict_logprobs(Xvalid, Yvalid)) valid_ppl = exp(valid_nll / len(Yvalid)) history_log.append( (e, end_t - start_t, L, exp(L / N), valid_nll, valid_ppl)) print( 'Epoch %d (%.2f sec.) NLL (train) = %f, PPL (train) = %f, NLL(valid) = %f, PPL(valid) = %f' % tuple(history_log[-1]), flush=True) if valid_nll == min(valid_nll, min_nll): min_nll = valid_nll lc = pd.DataFrame(history_log, columns=[ 'epoch', 'wall_time', 'NLL(train)', 'PPL(train)', 'NLL(dev)', 'PPL(dev)' ]) self.save_model('best_model_dump', epoch=e, learning_curve=lc) return pd.DataFrame(history_log, columns=[ 'epoch', 'wall_time', 'NLL(train)', 'PPL(train)', 'NLL(dev)', 'PPL(dev)' ])
def _loss(outputs, labels):
    losses = [dy.pickneglogsoftmax_batch(out, label)
              for out, label in zip(outputs, labels)]
    loss = dy.mean_batches(dy.average(losses))
    return loss
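The examples in this collection aggregate the per-step batched losses in two ways: summing over steps and batch (sum_batches over esum, as in the translation examples) or averaging (mean_batches over average, as in _loss above). A short sketch with illustrative names contrasting the two:

import dynet as dy
import numpy as np

dy.renew_cg()
n_steps, n_classes, batch_size = 3, 5, 4

# toy per-step scores and labels (stand-ins for real decoder outputs)
step_scores = [dy.inputTensor(np.random.randn(n_classes, batch_size).astype(np.float32), batched=True)
               for _ in range(n_steps)]
step_labels = [np.random.randint(n_classes, size=batch_size).tolist() for _ in range(n_steps)]

losses = [dy.pickneglogsoftmax_batch(s, l) for s, l in zip(step_scores, step_labels)]

total_loss = dy.sum_batches(dy.esum(losses))     # summed over steps and batch
mean_loss = dy.mean_batches(dy.average(losses))  # averaged over steps and batch
print(total_loss.value(), mean_loss.value())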
def BuildLMGraph_batch(self, sents, sent_args=None): dynet.renew_cg() init_state = self.rnn.initial_state() mb_size = len(sents) #MASK SENTENCES wids = [] # Dimension: maxSentLength * minibatch_size # List of lists to store whether an input is # present(1)/absent(0) for an example at a time step masks = [] # Dimension: maxSentLength * minibatch_size #No of words processed in this batch tot_words = 0 maxSentLength = max([len(sent) for sent in sents]) for k in range(maxSentLength): wids.append([(self.vocab.s2t[sent[k]] if len(sent) > k else self.vocab.END_TOK) for sent in sents]) mask = [(1 if len(sent) > k else 0) for sent in sents] masks.append(mask) tot_words += sum(mask) R = dynet.parameter(self.R) bias = dynet.parameter(self.bias) losses = [] # will hold losses state = init_state spellings = [] # list of lists containing spellings of the word for (mask, curr_words, next_words) in zip(masks, wids, wids[1:]): # print curr_words # print next_words maxWordLen = max([len(word.s) for word in curr_words]) wordLengths = [len(word.s) for word in curr_words] for k in range(maxWordLen): spellings.append([ (self.s2s.src_vocab[word.s[k].upper()].i if len(word.s) > k else self.s2s.src_vocab.END_TOK.i) for word in curr_words ]) spellings_rev = list(reversed(spellings)) embedded_spellings = self.s2s.embed_batch_seq(spellings) embedded_spellings_rev = self.s2s.embed_batch_seq(spellings_rev) pron_vectors = self.s2s.encode_batch_seq(embedded_spellings, embedded_spellings_rev, wordLengths)[-1] fpv = dynet.nobackprop(pron_vectors) curr_words_idx = [word.i for word in curr_words] curr_words_lookup = dynet.lookup_batch(self.lookup, curr_words_idx) temp = dynet.concatenate([curr_words_lookup, fpv]) x_t = temp state = state.add_input(x_t) y_t = state.output() r_t = bias + (R * y_t) next_words_idx = [word.i for word in next_words] loss = dynet.pickneglogsoftmax_batch(r_t, next_words_idx) # loss is a list of losses # mask the loss if at least one sentence is shorter if 0 in mask: mask_expr = dynet.inputVector(mask) mask_expr = dynet.reshape(mask_expr, (1, ), mb_size) loss = loss * mask_expr losses.append(loss) netloss = dynet.sum_batches(dynet.esum(losses)) return netloss
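The padding-mask idiom above (only build and multiply in the mask when at least one sentence in the batch has already ended) is shared by most of the sequence examples here. A minimal sketch of a single masked timestep, with toy values:

import dynet as dy
import numpy as np

dy.renew_cg()
batch_size, vocab_size = 3, 7

# toy next-token scores for one timestep, ((vocab_size,), batch_size)
scores = dy.inputTensor(np.random.randn(vocab_size, batch_size).astype(np.float32), batched=True)
next_ids = [2, 5, 0]   # gold next token per sentence (illustrative)
mask = [1, 1, 0]       # the third sentence has already ended at this step

loss = dy.pickneglogsoftmax_batch(scores, next_ids)
if 0 in mask:  # only reshape/multiply when some sentence is finished
    mask_expr = dy.reshape(dy.inputVector(mask), (1,), batch_size)
    loss = loss * mask_expr   # zero out the loss of finished sentences
print(dy.sum_batches(loss).value())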
def train_model( net, model_name, max_sentence_length, parsed = False, restart = None, ): print("Maximum sentence length is set to "+str(max_sentence_length)) # load pre-parsed data if parsed: with open("data/snli2/training_parsed.pkl", "rb") as fin: training_data = pickle.load(fin) training_total = len(training_data) training_data = [(l, s1, s2) for l, s1, s2, ls1, ls2 in training_data if ls1 <= max_sentence_length and ls1 > 1 and ls2 <= max_sentence_length and ls2 > 1] with open("data/snli2/dev_parsed.pkl", "rb") as fin: dev_data = pickle.load(fin) num_batches = len(training_data) print("Training data contains "+str(num_batches) + " batches (originally "+str(training_total)+") of size 1") # or load raw data else: with open("data/snli2/training.pkl", "rb") as fin: training_data = pickle.load(fin) training_total = len(training_data) training_data = [(l, s1, s2) for l, s1, s2 in training_data if len(s1) <= max_sentence_length and len(s1) > 1 and len(s2) <= max_sentence_length and len(s2) > 1] with open("data/snli2/dev.pkl", "rb") as fin: dev_data = pickle.load(fin) dev_data = [(l, s1, s2) for l, s1, s2 in dev_data if len(s1) <= max_sentence_length and len(s1) > 1 and len(s2) <= max_sentence_length and len(s2) > 1] num_batches = len(training_data) batch_size = len(training_data[0][0]) print("Training data contains "+str(num_batches) + " batches (originally "+str(training_total)+") of size "+str(batch_size)) classifier = networks.SNLIClassifier(model, net.hidden_dim) trainer = dy.SimpleSGDTrainer(model, e0=0.01) # hyperparameters report_frequency = 500 validate_frequency = num_batches // 10 if parsed: report_frequency = 500 * 16 start_time = time() last_validated = None last_reported = None best_validation = 0 validations = [] validation_means = [] avg_window_size = 5 patience = 12 frustration = 0 early_stop = False epoch = 0 batches_seen = 0 if isinstance(restart, int): model.load(model_name) epoch = restart batches_seen = epoch * num_batches print("Restarting interrupted training from epoch "+str(epoch)) while True: print("Start of epoch #"+str(epoch)) for batch_num, data in enumerate(training_data): dy.renew_cg() ls, s1, s2 = data if parsed: output1 = net.do_parse_tree(s1) output2 = net.do_parse_tree(s2) else: output1, _ = net(s1) output2, _ = net(s2) predicted_labels = classifier(output1, output2) if parsed: loss = dy.pickneglogsoftmax(predicted_labels, ls) else: loss = dy.sum_batches(dy.pickneglogsoftmax_batch(predicted_labels, ls)) # optimise loss.forward() loss.backward() trainer.update() # Evaluate on development data if batches_seen % validate_frequency == 0 and last_validated != batches_seen: last_validated = batches_seen acc = eval_nli_dataset(net, classifier, dev_data, parsed) validations.append(acc) validation_means.append(np.mean(validations[-avg_window_size:])) print("Validation: accuracy "+str(acc)+", moving average "+str(validation_means[-1])) if acc >= best_validation: best_validation = acc model.save(model_name) print("(model saved)") frustration = 0 # Write to log file with open(model_name+".log", "a") as flog: prog = batches_seen if parsed: prog = batches_seen / 16 flog.write(str(prog)+"\t"+str(acc)+"\n") # Decide if it's time to stop if len(validation_means) > patience and validation_means[-1] <= np.array(validation_means[:-patience]).max(): frustration += 1 if frustration > patience: print("Early stop!") early_stop = True break else: frustration = 0 # Report progress if batches_seen % report_frequency == 0 and last_reported != batches_seen: last_reported = batches_seen 
fraction_done = batch_num / num_batches elapsed_minutes = (time() - start_time)/60.0 # Update temperature if isinstance(net, networks.CYK): net.inv_temp = (float(epoch) + fraction_done)*100.0 + 1.0 # max(1.0 / pow(2.0, float(epoch) + fraction_done), 0.005) print( "Processed "+str(round(fraction_done*100,2))+"% "+ "of epoch #"+str(epoch)+ " after "+str(round(elapsed_minutes))+" mins"+ (", inv. temp. "+str(net.inv_temp) if isinstance(net, networks.CYK) else "") ) batches_seen += 1 if early_stop: break epoch += 1 print("Training "+str(model_name)+" finished.")
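train_model above switches between the single-instance and the batched loss call depending on whether the data is pre-parsed. A small sketch of the two calls with toy scores (the labels are illustrative):

import dynet as dy
import numpy as np

dy.renew_cg()

# one example, one integer label
scores_single = dy.inputTensor(np.random.randn(3).astype(np.float32))
loss_single = dy.pickneglogsoftmax(scores_single, 1)

# a batch of four examples, one label per batch element
scores_batched = dy.inputTensor(np.random.randn(3, 4).astype(np.float32), batched=True)
labels = [0, 2, 1, 1]
loss_batched = dy.sum_batches(dy.pickneglogsoftmax_batch(scores_batched, labels))

print(loss_single.value(), loss_batched.value())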
def step(self, instances): dy.renew_cg() W_y = dy.parameter(self.W_y) b_y = dy.parameter(self.b_y) W1_att_f = dy.parameter(self.W1_att_f) W1_att_e = dy.parameter(self.W1_att_e) w2_att = dy.parameter(self.w2_att) #instances : a list [(src0,tgt0),(src1,tgt1),(src2,tgt2)] maxLen = max(map(lambda x: len(x[1]), instances)) src_sents = [] src_sents_rev = [] tgt_sents = [] srcSenLen = len( instances[0][0]) + 2 #the length of the src sentence, all the same tgtSenLen = maxLen + 1 masks = [ [] for i in range(tgtSenLen) ] #mask for each position. each item in this list is a list with length=batchsize num_words = 0 for item in instances: #item[0]:src ; item[1]:tgt num_words += (len(item[1]) + 1) padNum = maxLen - len(item[1]) for i in range(len(item[1]) + 1): masks[i].append(1) for i in range(len(item[1]) + 1, tgtSenLen): masks[i].append(0) thisSrc = [startSymbol] + item[0] + [endSymbol] src_sents.append(thisSrc) src_sents_rev.append(list(reversed(thisSrc))) thisTgt = item[1] + [endSymbol for i in range(padNum + 1)] tgt_sents.append(thisTgt) # Bidirectional representations l2r_state = self.l2r_builder.initial_state() r2l_state = self.r2l_builder.initial_state() l2r_contexts = [] r2l_contexts = [] for i in range(srcSenLen): batchSrc = dy.lookup_batch( self.src_lookup, [self.src_token_to_id[x[i]] for x in src_sents]) batchSrc_rev = dy.lookup_batch( self.src_lookup, [self.src_token_to_id[x[i]] for x in src_sents_rev]) l2r_state = l2r_state.add_input(batchSrc) r2l_state = r2l_state.add_input(batchSrc_rev) l2r_contexts.append(l2r_state.output()) r2l_contexts.append(r2l_state.output()) r2l_contexts.reverse() # Combine the left and right representations for every word h_fs = [] for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts): h_fs.append(dy.concatenate([l2r_i, r2l_i])) h_fs_matrix = dy.concatenate_cols(h_fs) losses = [] # Decoder c_t = dy.vecInput(self.hidden_size * 2) start = dy.concatenate([ dy.lookup_batch(self.tgt_lookup, [self.tgt_token_to_id['<S>'] for i in tgt_sents]), c_t ]) dec_state = self.dec_builder.initial_state().add_input(start) loss = dy.pickneglogsoftmax_batch( W_y * dec_state.output() + b_y, [self.tgt_token_to_id[tgt_sent[0]] for tgt_sent in tgt_sents]) losses.append(loss) for i in range(tgtSenLen - 1): #cw : item[i] nw:item[i+1] h_e = dec_state.output() c_t = self.__attention_mlp(h_fs_matrix, h_e)[0] # Get the embedding for the current target word embed_t = dy.lookup_batch( self.tgt_lookup, [self.tgt_token_to_id[tgt_sent[i]] for tgt_sent in tgt_sents]) # Create input vector to the decoder x_t = dy.concatenate([embed_t, c_t]) dec_state = dec_state.add_input(x_t) loss = dy.pickneglogsoftmax_batch(W_y * dec_state.output() + b_y, [ self.tgt_token_to_id[tgt_sent[i + 1]] for tgt_sent in tgt_sents ]) thisMask = dy.inputVector(masks[i + 1]) thisMask = dy.reshape(thisMask, (1, ), len(instances)) losses.append(loss * thisMask) return dy.sum_batches(dy.esum(losses)), num_words
def __step_batch(self, batch): dy.renew_cg() W_s = dy.parameter(self.W_s) b_s = dy.parameter(self.b_s) W_y = dy.parameter(self.W_y) b_y = dy.parameter(self.b_y) W_m = dy.parameter(self.W_m) b_m = dy.parameter(self.b_m) W1_att_f = dy.parameter(self.W1_att_f) w2_att = dy.parameter(self.w2_att) src_batch = [x[0] for x in batch] tgt_batch = [x[1] for x in batch] batch_size = len(src_batch) attended_batch = [] for src_sent in src_batch: attended = [] c_t_sense = dy.vecInput(self.embed_size) sense_start = dy.concatenate([ self.lookup_frozen(self.src_lookup, self.src_token_to_id['<S>'][0]), dy.tanh(c_t_sense) ]) sense_state = self.sense_builder.initial_state().add_input( sense_start) for cw in src_sent: cw_sense_ids = self.src_token_to_id[cw] cw_senses = [ self.lookup_frozen(self.src_lookup, sense_id) for sense_id in cw_sense_ids ] h_senses = dy.concatenate_cols(cw_senses) h_m = sense_state.output() c_t_sense = self.__sense_attention_mlp(h_senses, h_m) sense_state = sense_state.add_input( dy.concatenate([c_t_sense, dy.tanh(c_t_sense)])) attended.append(c_t_sense) attended_batch.append(attended) attended_batch_rev = [list(reversed(sent)) for sent in attended_batch] # Encoder src_cws_l2r = [] src_cws_r2l = [] src_len = [len(sent) for sent in attended_batch] max_src_len = np.max(src_len) for i in range(max_src_len): src_cws_l2r.append([sent[i] for sent in attended_batch]) src_cws_r2l.append([sent[i] for sent in attended_batch_rev]) l2r_state = self.l2r_builder.initial_state() r2l_state = self.r2l_builder.initial_state() l2r_contexts = [] r2l_contexts = [] for i, (cws_l2r, cws_r2l) in enumerate(zip(src_cws_l2r, src_cws_r2l)): l2r_batch = dy.reshape(dy.concatenate_cols(cws_l2r), (self.embed_size, ), batch_size=batch_size) l2r_state = l2r_state.add_input(l2r_batch) r2l_batch = dy.reshape(dy.concatenate_cols(cws_r2l), (self.embed_size, ), batch_size=batch_size) r2l_state = r2l_state.add_input(r2l_batch) l2r_contexts.append(l2r_state.output()) r2l_contexts.append(r2l_state.output()) r2l_contexts.reverse() h_fs = [] for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts): h_fs.append(dy.concatenate([l2r_i, r2l_i])) h_fs_matrix = dy.concatenate_cols(h_fs) fixed_attentional_component = W1_att_f * h_fs_matrix losses = [] num_words = 0 # Decoder tgt_cws = [] tgt_len = [len(sent) for sent in tgt_batch] max_tgt_len = np.max(tgt_len) masks = [] for i in range(max_tgt_len): tgt_cws.append([ self.tgt_token_to_id[sent[i]] if len(sent) > i else self.tgt_token_to_id['</S>'] for sent in tgt_batch ]) mask = [(1 if len(sent) > i else 0) for sent in tgt_batch] masks.append(mask) num_words += sum(mask) c_t = dy.vecInput(self.hidden_size * 2) start_state = dy.affine_transform([b_s, W_s, h_fs[-1]]) dec_state = self.word_dec_builder.initial_state().set_s( [start_state, dy.tanh(start_state)]) for i, (cws, nws, mask) in enumerate(zip(tgt_cws, tgt_cws[1:], masks)): embed_t = dy.lookup_batch(self.tgt_lookup, cws) x_t = dy.concatenate([embed_t, c_t]) dec_state = dec_state.add_input(x_t) h_e = dec_state.output() c_t = self.__word_attention_mlp(h_fs_matrix, h_e, fixed_attentional_component) m_t = dy.tanh( dy.affine_transform([b_m, W_m, dy.concatenate([h_e, c_t])])) y_star = dy.affine_transform([b_y, W_y, m_t]) loss = dy.pickneglogsoftmax_batch(y_star, nws) mask_expr = dy.inputVector(mask) mask_expr = dy.reshape(mask_expr, (1, ), len(batch)) mask_loss = loss * mask_expr losses.append(mask_loss) return dy.sum_batches(dy.esum(losses)), num_words
def __call__(self, inputs, masks, truth, iters, is_train=True, is_tree=True): sent_len = len(inputs) batch_size = inputs[0].dim()[1] flat_len = sent_len * batch_size print('===Vào call===') print('input length: ', inputs.__len__()) # input length: 46 print('input dim: ', inputs[1].dim()) # input dim: ((400,), 2) print('sent_len', sent_len) # sent_len 46 print('batch_size', batch_size) # batch_size 2 print('flat_len', flat_len) # flat_len 92 # H -> hidden size, L -> sentence length, B -> batch size # ((H, L), B) X = dy.concatenate_cols(inputs) print('X dim: ', X.dim()) # X dim: ((400, 46), 2) if is_train: X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP) # A_H -> ARC MLP hidden size, R_H -> REL MLP hidden size # ((A_H, L), B) head_arc = self.head_arc_MLP(X, is_train) dept_arc = self.dept_arc_MLP(X, is_train) print('head_arc dim: ', head_arc.dim()) print('dept_arc dim: ', dept_arc.dim()) # head_arc dim: ((300, 46), 2) # dept_arc dim: ((300, 46), 2) # ((R_H, L), B) head_rel = self.head_rel_MLP(X, is_train) dept_rel = self.dept_rel_MLP(X, is_train) print('head_rel dim: ', head_rel.dim()) print('dept_rel dim: ', dept_rel.dim()) # head_rel dim: ((100, 46), 2) # dept_rel dim: ((100, 46), 2) if is_train: total_token = sum(masks['flat'].tolist()) head_arc = dy.dropout_dim(head_arc, 1, self.cfg.MLP_DROP) head_rel = dy.dropout_dim(head_rel, 1, self.cfg.MLP_DROP) dept_arc = dy.dropout_dim(dept_arc, 1, self.cfg.MLP_DROP) dept_rel = dy.dropout_dim(dept_rel, 1, self.cfg.MLP_DROP) # ((L, L), B) masks_2D = 1e9 * (1 - dy.inputTensor(masks['2D'], True)) masks_flat = dy.inputTensor(masks['flat'], True) gnn_losses = [] arc_norm = math.sqrt(self.arc_size) rel_norm = math.sqrt(self.rel_size) for k in range(self.cfg.GRAPH_LAYERS): print('----layer-----', k) # Graph Weights # ((L, L), B) arc_mat = self.arc_attn_mat[k](head_arc, dept_arc) / arc_norm - masks_2D arc_prob = dy.softmax(arc_mat) # arc_mat dim: ((46, 46), 2) # arc_prob dim: ((46, 46), 2) # Layer-wise Loss if is_train: arc_prob = dy.dropout(arc_prob, self.cfg.ARC_DROP) # ((L,), L*B) arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len) # ((1,), L*B) print('arc_mat val', arc_mat.value()) print('arc_mat dim', arc_mat.dim()) print("truth['head'] value", truth['head']) print("truth['head'] lengt", truth['head'].__len__()) arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head']) print('arc_loss', arc_loss.value()) print('arc_loss', arc_loss.dim()) # (1,) arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token print('arc_loss', arc_loss.value) print('arc_loss', arc_loss.dim()) gnn_losses.append(arc_loss.value()) input("pause") # Aggregation Function # Fusion head and dept representation # ((A_H, L), B) HX = head_arc * arc_prob DX = dept_arc * dy.transpose(arc_prob) FX = HX + DX print('HX dim: ', HX.dim()) print('DX dim: ', DX.dim()) print('FX dim: ', FX.dim()) # HX dim: ((300, 46), 2) # DX dim: ((300, 46), 2) # FX dim: ((300, 46), 2) # Async Update Function # Head-first # ((A_H, L), B) head_arc = self.head_gnn(FX, head_arc) FX_new = head_arc * arc_prob + DX dept_arc = self.dept_gnn(FX_new, dept_arc) print('head_arc dim: ', head_arc.dim()) print('FX_new dim: ', FX_new.dim()) print('dept_arc dim: ', dept_arc.dim()) # head_arc dim: ((300, 46), 2) # FX_new dim: ((300, 46), 2) # dept_arc dim: ((300, 46), 2) # Relation Aggregation Function # Sync update # ((R_H, L), B) HR = head_rel * arc_prob DR = dept_rel * dy.transpose(arc_prob) FX = HR + DR head_rel = self.head_rel_gnn(FX, head_rel) + head_rel dept_rel = self.dept_rel_gnn(FX, dept_rel) + dept_rel print('HR 
dim: ', HR.dim()) print('DR dim: ', DR.dim()) print('FX dim: ', FX.dim()) # HR dim: ((100, 46), 2) # DR dim: ((100, 46), 2) # FX dim: ((100, 46), 2) print('head_rel dim: ', head_rel.dim()) print('dept_rel dim: ', dept_rel.dim()) # head_rel dim: ((100, 46), 2) # dept_rel dim: ((100, 46), 2) # ((L, L), B) arc_mat = self.arc_attn_mat[-1](head_arc, dept_arc) / arc_norm - masks_2D # ((L,), L*B) arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len) # Predict Relation # (R_H, L*B) head_rel = dy.reshape(head_rel, (self.rel_size, flat_len)) # ((R_H,), L*B) dept_rel = dy.reshape(dept_rel, (self.rel_size, ), flat_len) print('arc_mat dim: ', arc_mat.dim()) print('head_rel dim: ', head_rel.dim()) print('dept_rel dim: ', dept_rel.dim()) # arc_mat dim: ((46,), 92) # head_rel dim: ((100, 92), 1) # dept_rel dim: ((100,), 92) if is_train: # ((1,), L*B) arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head']) # (1,) arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token # ((R_H,), L*B) truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1) # R -> Relation Set Size # ((R,), L*B) rel_mask = 1e9 * dy.inputTensor(self.rel_mask) rel_mat = self.rel_attn(dept_rel, truth_rel) / rel_norm - rel_mask # Calculate Relation Classification Loss # ((1,), L*B) rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel']) # (1,) rel_loss = dy.sum_batches(rel_losses * masks_flat) / total_token # Final Total Loss with Layer-wise warm = [int(iters >= x) for x in self.warm_list] losses = rel_loss*self.cfg.LAMBDA2 * \ warm[-1]+arc_loss*self.cfg.LAMBDA2*warm[-1] if gnn_losses: for i in range(self.cfg.GRAPH_LAYERS): gnn_losses[i] *= warm[i] losses += dy.esum(gnn_losses) * self.cfg.LAMBDA1 losses_list = gnn_losses + [arc_loss, rel_loss] return losses, losses_list else: if is_tree: # MST Inference, Achieve Tree Edge. arc_probs = dy.softmax(arc_mat).npvalue() arc_probs = np.reshape(arc_probs, (sent_len, sent_len, batch_size), 'F') arc_probs = np.transpose(arc_probs) # Mask PAD arc_masks = [ np.array(masks['flat'][i:i + sent_len]) for i in range(0, flat_len, sent_len) ] arc_pred = [] # Inference One By One. for msk, arc_prob in zip(arc_masks, arc_probs): msk[0] = 1 seq_len = int(np.sum(msk)) tmp_pred = MST_inference(arc_prob, seq_len, msk) tmp_pred[0] = 0 arc_pred.extend(tmp_pred) else: # Greedy Inference (argmax) arc_pred = np.argmax(arc_mat.npvalue(), 0) # Pick Predicted Edge's <Head, Dept> pair. flat_pred = [ j + (i // sent_len) * sent_len for i, j in enumerate(arc_pred) ] pred_rel = dy.pick_batch(head_rel, flat_pred, 1) # Predict Relation (mask ROOT) rel_mask = 1e9 * dy.inputTensor(self.rel_mask) rel_mat = self.rel_attn(dept_rel, pred_rel) / rel_norm - rel_mask rel_mat = dy.reshape(rel_mat, (self.rel_num, )).npvalue() rel_pred = np.argmax(rel_mat, 0) pred = {} pred['head'], pred['rel'] = arc_pred, rel_pred return pred
def step_batch(self, batch, lang):
    dy.renew_cg()
    W_y = dy.parameter(self.W_y[lang])
    b_y = dy.parameter(self.b_y[lang])
    W1_att_e = dy.parameter(self.W1_att_e)
    W1_att_f = dy.parameter(self.W1_att_f)
    w2_att = dy.parameter(self.w2_att)
    M_s = self.src_lookup
    M_t = self.tgt_lookup[lang]

    src_sent, tgt_sent = zip(*batch)
    src_sent = list(zip(*src_sent))
    tgt_sent = list(zip(*tgt_sent))
    src_sent_rev = list(reversed(src_sent))

    # Bidirectional representations
    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for (cw_l2r, cw_r2l) in zip(src_sent, src_sent_rev):
        l2r_state = l2r_state.add_input(dy.lookup_batch(M_s, cw_l2r))
        r2l_state = r2l_state.add_input(dy.lookup_batch(M_s, cw_r2l))
        l2r_contexts.append(l2r_state.output())  # [<S>, x_1, x_2, ..., </S>]
        r2l_contexts.append(r2l_state.output())  # [</S>, x_n, x_{n-1}, ..., <S>]
    # encoded_h1 = l2r_state.output()
    # tem1 = encoded_h1.npvalue()
    r2l_contexts.reverse()  # [<S>, x_1, x_2, ..., </S>]

    # Combine the left and right representations for every word
    h_fs = []
    for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
        h_fs.append(dy.concatenate([l2r_i, r2l_i]))
    encoded_h = h_fs[-1]
    h_fs_matrix = dy.concatenate_cols(h_fs)
    # h_fs_matrix_t = dy.transpose(h_fs_matrix)

    losses = []
    num_words = 0

    # Decoder
    c_t = dy.vecInput(self.hidden_size * 2)
    c_t.set([0 for i in range(self.contextsize)])
    encoded_h = dy.concatenate([encoded_h])
    dec_state = self.dec_builder[lang].initial_state([encoded_h])
    for (cw, nw) in zip(tgt_sent[0:-1], tgt_sent[1:]):
        embed = dy.lookup_batch(M_t, cw)
        dec_state = dec_state.add_input(dy.concatenate([embed, c_t]))
        h_e = dec_state.output()
        # calculate attention
        # a_t = h_fs_matrix_t * h_e
        # alignment = dy.softmax(a_t)
        # c_t = h_fs_matrix * alignment
        c_t = self.__attention_mlp_batch(h_fs_matrix, h_e, W1_att_e, W1_att_f, w2_att)
        ind_tem = dy.concatenate([h_e, c_t])
        ind_tem1 = W_y * ind_tem
        ind_tem2 = ind_tem1 + b_y
        loss = dy.pickneglogsoftmax_batch(ind_tem2, nw)  # to modify
        losses.append(loss)
        num_words += 1

    return dy.sum_batches(dy.esum(losses)), num_words