def evaluate(self, inputs, train=False): """ Apply all MLP layers to concatenated input :param inputs: (key, vector) per feature type :param train: are we training now? :return: output vector of size self.output_dim """ input_keys, inputs = list(map(list, zip(*list(inputs)))) if self.input_keys: assert input_keys == self.input_keys, "Got: %s\nBut expected input keys: %s" % ( self.input_keys_str(self.input_keys), self.input_keys_str(input_keys)) else: self.input_keys = input_keys if self.gated: gates = self.params.get("gates") if gates is None: # FIXME attention weights should not be just parameters, but based on biaffine product? gates = self.params["gates"] = self.model.add_parameters((len(inputs), self.gated), init=dy.UniformInitializer(1)) input_dims = [i.dim()[0][0] for i in inputs] max_dim = max(input_dims) x = dy.concatenate_cols([dy.concatenate([i, dy.zeroes(max_dim - d)]) # Pad with zeros to get uniform dim if d < max_dim else i for i, d in zip(inputs, input_dims)]) * gates # Possibly multiple "attention heads" -- concatenate outputs to one vector inputs = [dy.reshape(x, (x.dim()[0][0] * x.dim()[0][1],))] x = dy.concatenate(inputs) assert len(x.dim()[0]) == 1, "Input should be a vector, but has dimension " + str(x.dim()[0]) dim = x.dim()[0][0] if self.input_dim: assert dim == self.input_dim, "Input dim mismatch: %d != %d" % (dim, self.input_dim) else: self.init_params(dim) self.config.print(self, level=4) if self.total_layers: if self.weights is None: self.weights = [[self.params[prefix + str(i)] for prefix in ("W", "b")] for i in range(self.total_layers)] if self.weights[0][0].dim()[0][1] < dim: # number of columns in W0 self.weights[0][0] = dy.concatenate_cols([self.weights[0][0], self.params["W0+"]]) for i, (W, b) in enumerate(self.weights): self.config.print(lambda: x.npvalue().tolist(), level=4) try: if train and self.dropout: x = dy.dropout(x, self.dropout) x = self.activation()(W * x + b) except ValueError as e: raise ValueError("Error in evaluating layer %d of %d" % (i + 1, self.total_layers)) from e self.config.print(lambda: x.npvalue().tolist(), level=4) return x
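The gating step above relies on zero-padding every feature vector to a common dimension before dy.concatenate_cols can stack them into one matrix. A minimal, self-contained sketch of just that padding step, using toy vectors and invented sizes rather than the module's real features:

import dynet as dy

dy.renew_cg()
inputs = [dy.inputVector([1.0, 2.0]), dy.inputVector([3.0, 4.0, 5.0])]  # mixed dims
input_dims = [i.dim()[0][0] for i in inputs]
max_dim = max(input_dims)
x = dy.concatenate_cols([dy.concatenate([i, dy.zeroes(max_dim - d)])  # pad short vectors with zeros
                         if d < max_dim else i
                         for i, d in zip(inputs, input_dims)])
print(x.dim())  # ((3, 2), 1): one column per feature type, padded to the max dim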
def generate(sent): dy.renew_cg() # Transduce all batch elements with an LSTM src = sent #get the output of the first LSTM src_outputs = [dy.concatenate([x.output(), y.output()]) for x,y in LSTM_SRC.add_inputs([LOOKUP_SRC[word] for word in src])] src_output = src_outputs[-1] #gets the parameters for the attention src_output_matrix = dy.concatenate_cols(src_outputs) w1_att_src = dy.parameter(w1_att_src_p) fixed_attentional_component = w1_att_src * src_output_matrix #generate until a eos tag or max is reached current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)]) prev_word = sos_trg trg_sent = [] attention_matrix = [] W_sm = dy.parameter(W_sm_p) b_sm = dy.parameter(b_sm_p) W_m = dy.parameter(W_m_p) b_m = dy.parameter(b_m_p) for i in range(MAX_SENT_SIZE): #feed the previous word into the lstm, calculate the most likely word, add it to the sentence current_state = current_state.add_input(LOOKUP_TRG[prev_word]) output_embedding = current_state.output() att_output, alignment = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component) attention_matrix.append(alignment) middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])])) s = dy.affine_transform([b_sm, W_sm, middle_expr]) probs = (-dy.log_softmax(s)).value() next_word = np.argmax(probs) if next_word == eos_trg: break prev_word = next_word trg_sent.append(i2w_trg[next_word]) return trg_sent, dy.concatenate_cols(attention_matrix).value()
def cache_encoder(self, context_vectors): """Cache transformations to the encoder vectors. :param context_vectors: list[dy.Expression] The encoder output vectors we do attention over. The list has length T and each expression is ((H,), B) """ self.context = dy.concatenate_cols(context_vectors) # ((H, T), B)
def decode(dec_lstm, vectors, output): output = [EOS] + list(output) + [EOS] output = [char2int[c] for c in output] w = dy.parameter(decoder_w) b = dy.parameter(decoder_b) w1 = dy.parameter(attention_w1) input_mat = dy.concatenate_cols(vectors) w1dt = None last_output_embeddings = output_lookup[char2int[EOS]] s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE*2), last_output_embeddings])) loss = [] for char in output: # w1dt can be computed and cached once for the entire decoding phase w1dt = w1dt or w1 * input_mat vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings]) s = s.add_input(vector) out_vector = w * s.output() + b probs = dy.softmax(out_vector) last_output_embeddings = output_lookup[char] loss.append(-dy.log(dy.pick(probs, char))) loss = dy.esum(loss) return loss
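In the decoder above, `w1dt = w1dt or w1 * input_mat` computes the projection of the encoder matrix once and reuses it at every output character. A standalone sketch of that caching pattern with made-up sizes (the parameter, the dummy encoder vectors, and the loop are all placeholders):

import dynet as dy

pc = dy.ParameterCollection()
attention_w1 = pc.add_parameters((16, 32))               # hypothetical projection
dy.renew_cg()
vectors = [dy.inputVector([0.1] * 32) for _ in range(6)]  # stand-ins for encoder states
input_mat = dy.concatenate_cols(vectors)                  # ((32, 6), 1)
w1 = dy.parameter(attention_w1)
w1dt = None
for _ in range(4):                                        # stands in for the decoding loop
    w1dt = w1dt or w1 * input_mat                         # multiplied only on the first pass
    # ... attend(input_mat, s, w1dt) would reuse the cached projection here ...
print(w1dt.dim())                                         # ((16, 6), 1)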
def generate(in_seq, enc_fwd_lstm, enc_bwd_lstm, dec_lstm): embedded = embed_sentence(in_seq) encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded) w = dy.parameter(decoder_w) b = dy.parameter(decoder_b) w1 = dy.parameter(attention_w1) input_mat = dy.concatenate_cols(encoded) w1dt = None last_output_embeddings = output_lookup[char2int[EOS]] s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings])) out = '' count_EOS = 0 for i in range(len(in_seq)*2): if count_EOS == 2: break # w1dt can be computed and cached once for the entire decoding phase w1dt = w1dt or w1 * input_mat vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings]) s = s.add_input(vector) out_vector = w * s.output() + b probs = dy.softmax(out_vector).vec_value() next_char = probs.index(max(probs)) last_output_embeddings = output_lookup[next_char] if int2char[next_char] == EOS: count_EOS += 1 continue out += int2char[next_char] return out
def word_repr(self, char_seq): # obtain the word representation when given its character sequence wlen = len(char_seq) if 'rgW%d'%wlen not in self.param_exprs: self.param_exprs['rgW%d'%wlen] = dy.parameter(self.params['reset_gate_W'][wlen-1]) self.param_exprs['rgb%d'%wlen] = dy.parameter(self.params['reset_gate_b'][wlen-1]) self.param_exprs['cW%d'%wlen] = dy.parameter(self.params['com_W'][wlen-1]) self.param_exprs['cb%d'%wlen] = dy.parameter(self.params['com_b'][wlen-1]) self.param_exprs['ugW%d'%wlen] = dy.parameter(self.params['update_gate_W'][wlen-1]) self.param_exprs['ugb%d'%wlen] = dy.parameter(self.params['update_gate_b'][wlen-1]) chars = dy.concatenate(char_seq) reset_gate = dy.logistic(self.param_exprs['rgW%d'%wlen] * chars + self.param_exprs['rgb%d'%wlen]) comb = dy.concatenate([dy.tanh(self.param_exprs['cW%d'%wlen] * dy.cmult(reset_gate,chars) + self.param_exprs['cb%d'%wlen]),chars]) update_logits = self.param_exprs['ugW%d'%wlen] * comb + self.param_exprs['ugb%d'%wlen] update_gate = dy.transpose(dy.concatenate_cols([dy.softmax(dy.pickrange(update_logits,i*(wlen+1),(i+1)*(wlen+1))) for i in xrange(self.options['ndims'])])) # The following implementation of the Softmax function is not safe, but faster... #exp_update_logits = dy.exp(dy.reshape(update_logits,(self.options['ndims'],wlen+1))) #update_gate = dy.cdiv(exp_update_logits, dy.concatenate_cols([dy.sum_cols(exp_update_logits)] *(wlen+1))) #assert (not np.isnan(update_gate.npvalue()).any()) word = dy.sum_cols(dy.cmult(update_gate,dy.reshape(comb,(self.options['ndims'],wlen+1)))) return word
def cache_encoder(self, context_vectors): """Cache transformations to the encoder vectors. This also projects the context vectors into a new space. :param context_vectors: list[dy.Expression] The encoder output vectors we do attention over. The list has length T and each expression is ((H,), B) """ self.context = dy.concatenate_cols(context_vectors) # ((H, T), B) self.context_proj = self.encoder * self.context # ((H, T), B)
def __call__(self, encoder_output, dst, train): embed_out_th_b = self.tgt_embedding.encode(dst) embed_out_ht_b = dy.transpose(embed_out_th_b) embed_out_ht_b = self.proj_to_hsz(embed_out_ht_b) context = dy.concatenate_cols(encoder_output.output) T = embed_out_ht_b.dim()[0][1] dst_mask = subsequent_mask(T) src_mask = encoder_output.src_mask output = self.transformer_decoder(embed_out_ht_b, context, src_mask, dst_mask, train) output = self.proj_to_dsz(output) return self.output(output)
def _step(self, loader, update, log, reporting_fns, verbose=None): steps = len(loader) pg = create_progress_bar(steps) cm = ConfusionMatrix(self.labels) epoch_loss = 0 epoch_div = 0 preds, losses, ys = [], [], [] dy.renew_cg() for i, batch_dict in enumerate(pg(loader), 1): inputs = self.model.make_input(batch_dict) y = inputs.pop('y') pred = self.model.forward(inputs) preds.append(pred) loss = self.model.loss(pred, y) losses.append(loss) ys.append(y) if i % self.autobatchsz == 0: loss = dy.average(losses) preds = dy.concatenate_cols(preds) batchsz = len(losses) lossv = loss.npvalue().item() * batchsz epoch_loss += lossv epoch_div += batchsz _add_to_cm(cm, np.array(ys), preds.npvalue()) update(loss) log(self.optimizer.global_step, lossv, batchsz, reporting_fns) preds, losses, ys = [], [], [] dy.renew_cg() loss = dy.average(losses) preds = dy.concatenate_cols(preds) batchsz = len(losses) epoch_loss += loss.npvalue().item() * batchsz epoch_div += batchsz _add_to_cm(cm, np.array(ys), preds.npvalue()) update(loss) metrics = cm.get_all_metrics() metrics['avg_loss'] = epoch_loss / float(epoch_div) verbose_output(verbose, cm) return metrics
def calc_loss(sents): dy.renew_cg() src_fwd = LSTM_SRC_FWD.initial_state() src_bwd = LSTM_SRC_BWD.initial_state() trg_fwd = LSTM_TRG_FWD.initial_state() trg_bwd = LSTM_TRG_BWD.initial_state() # Encoding src_reps = encode_sents(LOOKUP_SRC, src_fwd, src_bwd, [src for src, trg in sents]) trg_reps = encode_sents(LOOKUP_TRG, trg_fwd, trg_bwd, [trg for src, trg in sents]) # Concatenate the sentence representations to a single matrix mtx_src = dy.concatenate_cols(src_reps) mtx_trg = dy.concatenate_cols(trg_reps) # Do matrix multiplication to get a matrix of dot product similarity scores sim_mtx = dy.transpose(mtx_src) * mtx_trg # Calculate the hinge loss over all dimensions loss = dy.hinge_dim(sim_mtx, list(range(len(sents))), d=1) return dy.sum_elems(loss)
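The snippet above scores every source sentence against every target sentence with a single matrix product. A small, self-contained sketch with toy dimensions and made-up vectors: column i of mtx_src and column i of mtx_trg hold the i-th sentence pair, so the diagonal of sim_mtx carries the scores of the correct pairs that the hinge loss promotes.

import dynet as dy

dy.renew_cg()
src_reps = [dy.inputVector(v) for v in ([1.0, 0.0], [0.0, 1.0], [1.0, 1.0])]
trg_reps = [dy.inputVector(v) for v in ([2.0, 0.0], [0.0, 2.0], [1.0, 1.0])]
mtx_src = dy.concatenate_cols(src_reps)        # ((2, 3), 1)
mtx_trg = dy.concatenate_cols(trg_reps)        # ((2, 3), 1)
sim_mtx = dy.transpose(mtx_src) * mtx_trg      # ((3, 3), 1): all pairwise dot products
print(sim_mtx.npvalue())                        # diagonal = matched-pair scores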
def cal_scores(self, src_encodings): src_len = len(src_encodings) src_encodings = dy.concatenate_cols(src_encodings) # src_ctx_dim, src_len, batch_size W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head) b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head) W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep) b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep) W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head) b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head) W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep) b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep) U_arc_1 = dy.parameter(self.U_arc_1) u_arc_2 = dy.parameter(self.u_arc_2) U_label_1 = [dy.parameter(x) for x in self.U_label_1] u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1] u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2] b_label = [dy.parameter(x) for x in self.b_label] h_arc_head = dy.rectify(dy.affine_transform([b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings])) # n_arc_ml_units, src_len, bs h_arc_dep = dy.rectify(dy.affine_transform([b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings])) h_label_head = dy.rectify(dy.affine_transform([b_label_hidden_to_head, W_label_hidden_to_head, src_encodings])) h_label_dep = dy.rectify(dy.affine_transform([b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings])) h_arc_head_transpose = dy.transpose(h_arc_head) h_label_head_transpose = dy.transpose(h_label_head) s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep, u_arc_2) s_label = [] for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2, b_label): e1 = h_label_head_transpose * U_1 * h_label_dep e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len)) e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep s_label.append(e1 + e2 + e3 + b) return s_arc, s_label
def generate(self, sentence): #embedded = embed_sentence(in_seq) encoded = self.encode_sentence(sentence) w = dy.parameter(self.decoder_w) b = dy.parameter(self.decoder_b) w1 = dy.parameter(self.attention_w1) input_mat = dy.concatenate_cols(encoded) w1dt = None last_output_embeddings = self.output_lookup[2] s = self.dec_lstm.initial_state().add_input( dy.concatenate( [dy.vecInput(self.state_size * 2), last_output_embeddings])) out = '' res = [] count_EOS = 0 for i in range(len(sentence)): if count_EOS == 2: break # w1dt can be computed and cached once for the entire decoding phase w1dt = w1dt or w1 * input_mat vector = dy.concatenate( [self.attend(input_mat, s, w1dt), last_output_embeddings]) s = s.add_input(vector) #k = s #dloss = self.test_duration(k, i, b) out_vector = w * s.output() + b probs = dy.softmax(out_vector).vec_value() next_word = probs.index(max(probs)) last_output_embeddings = self.output_lookup[next_word] if next_word == 2: count_EOS += 1 continue res.append(next_word) #out += int2char[next_word] return res
def prediction(self): """Adds the core transformation for this model which transforms a batch of input data into a batch of predictions. In this case, the transformation is a linear layer plus a softmax transformation: y = softmax(xW + b) Args: input_data: A tensor of shape (batch_size, n_features). Returns: pred: A tensor of shape (batch_size, n_classes) """ W = dy.parameter(self._pW) b = dy.parameter(self._pb) x = dy.inputTensor(self.input) z_m = x * W z_T = dy.concatenate_cols([z_m[i]+b for i in range(self.config.batch_size)]) z = dy.transpose(z_T) # z = x * W + b pred = softmax(z) return pred
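The column-by-column loop above broadcasts the bias b over the batch by hand. As an alternative sketch (not the code above, and with invented toy sizes), the same broadcast can be written with dy.colwise_add: transpose x*W to (n_classes, batch_size), add b to every column, and transpose back, avoiding the Python loop.

import dynet as dy
import numpy as np

pc = dy.ParameterCollection()
batch_size, n_features, n_classes = 4, 3, 2     # toy sizes
pW = pc.add_parameters((n_features, n_classes))
pb = pc.add_parameters((n_classes,))
dy.renew_cg()
x = dy.inputTensor(np.random.rand(batch_size, n_features))
W, b = dy.parameter(pW), dy.parameter(pb)
z = dy.transpose(dy.colwise_add(dy.transpose(x * W), b))  # same as adding b to each row
print(z.dim())                                            # ((4, 2), 1)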
def generate(in_seq, enc_fwd_lstm, enc_bwd_lstm, dec_lstm): embedded = embed_sentence(in_seq) encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded) w = dy.parameter(decoder_w) b = dy.parameter(decoder_b) w1 = dy.parameter(attention_w1) input_mat = dy.concatenate_cols(encoded) w1dt = None last_output_embeddings = output_lookup[char2int[EOS]] s = dec_lstm.initial_state().add_input( dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings])) out = '' count_EOS = 0 # For checking likelihood of entire output string max_probs_list = [] for i in range(len(in_seq) * 2): if count_EOS == 2: break # w1dt can be computed and cached once for the entire decoding phase w1dt = w1dt or w1 * input_mat vector = dy.concatenate( [attend(input_mat, s, w1dt), last_output_embeddings]) s = s.add_input(vector) out_vector = w * s.output() + b probs = dy.softmax(out_vector).vec_value() max_probs_list.append(max(probs)) next_char = probs.index(max(probs)) last_output_embeddings = output_lookup[next_char] if int2char[next_char] == EOS: count_EOS += 1 continue out += int2char[next_char] return out, prod(max_probs_list)
def make_decoder(self, in_seq, **kwargs): """ Creates a decoder generator to be used as co-routine to the decoding procedure. It has two steps (i) it first output a probability dist on the vocabulary and (ii) it accepts a symbol to be fedback into the generation of the next symbol. See self.generate and self. """ embedded = self._embed_seq(in_seq) enc_mat = dy.concatenate_cols(self.encode(embedded)) # variables to compute and cache the encoder projection onto att space enc2att = dy.parameter(self.enc2att) encatt = None # EOS as zero-vector for 1st step last_char_emb = self.lookup[self.char2int[u.EOS]] # init hidden state of decoder should take last encoding hidden state state_vec = dy.vecInput(self.enc_hid_dim) if self.add_pred: init = dy.concatenate([state_vec, last_char_emb]) else: init = state_vec s = self.dec_rnn.initial_state().add_input(init) while True: # (maybe) project encoding hidden seq onto attention space encatt = encatt or enc2att * enc_mat # create a new decoding state (s) combining previous decoding step, # encoded seq (output of encoder) and ev. last input encoding new_state = self.recur(s, enc_mat, last_char_emb, encatt, **kwargs) s = s.add_input(new_state) # TODO: according to Bahdanau 2015, the new state is computed with # deep output + single maxout: # p(y_i | s_i, y_{i-1}, c_i) \prop exp(y_i^T * W_o * t_i) # where $t_i = [max(t^~_{i, 2j-1}, t^~_{i, 2j})]^T_{j=1,...,l}$ # and $t^~_i = U_o * s_{i-1} + V_o * Ey_{i-1} * C_o * c_i$ yield self._output_softmax(s.output()) last_char = yield last_char_emb = self.lookup[last_char]
def beam_generate(in_seq, enc_fwd_lstm, enc_bwd_lstm, dec_lstm): embedded = embed_sentence(in_seq) encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded) w = dy.parameter(decoder_w) b = dy.parameter(decoder_b) w1 = dy.parameter(attention_w1) input_mat = dy.concatenate_cols(encoded) w1dt = None histories = [[0.0,output_lookup[char2int[EOS]],'',None]] s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), histories[0][1]])) histories[0][3] = s count_EOS = 0 for i in range(len(in_seq)*2): if count_EOS == 2: break # w1dt can be computed and cached once for the entire decoding phase presents = [] for ll, embedding, out, s in histories: w1dt = w1dt or w1 * input_mat vector = dy.concatenate([attend(input_mat, s, w1dt), embedding]) new_s = s.add_input(vector) out_vector = w * new_s.output() + b probs = dy.softmax(out_vector).vec_value() probs = sorted([(prob,j) for j, prob in enumerate(probs)], key = lambda x:x[0], reverse=1) for prob, j in probs: next_embedding = output_lookup[j] presents.append([ll + log(prob), next_embedding, out + int2char[j], new_s]) presents.sort(reverse=1,key=lambda x:x[0]) histories = presents[:BEAM] if presents[0][2].endswith(EOS) and presents[0][2] != EOS: return presents[0][2].replace(EOS,'') return histories[0][2].replace(EOS,'')
def attend(self, encoded_inputs, h_t, input_masks=None): # encoded_inputs dimension is: seq len x 2*h x batch size, h_t dimension is h x batch size (for bilstm encoder) if len(encoded_inputs) == 1: # no need to attend if only one input state, compute output directly h_output = dn.tanh(self.w_c * dn.concatenate([h_t, encoded_inputs[0]])) # return trivial alphas (all 1's since one input gets all attention) if input_masks: # if batching alphas = dn.inputTensor([1]*len(input_masks[0]), batched=True) else: alphas = dn.inputTensor([1], batched=True) return h_output, alphas # iterate through input states to compute attention scores # scores = [v_a * dn.tanh(w_a * h_t + u_a * h_input) for h_input in blstm_outputs] w_a_h_t = self.w_a * h_t scores = [self.v_a * dn.tanh(dn.affine_transform([w_a_h_t, self.u_a, h_input])) for h_input in encoded_inputs] concatenated = dn.concatenate(scores) if input_masks: # if batching, multiply attention scores with input masks to zero-out scores for padded inputs dn_masks = dn.inputTensor(input_masks, batched=True) concatenated = dn.cmult(concatenated, dn_masks) # normalize scores alphas = dn.softmax(concatenated) # compute context vector with weighted sum for each seq in batch bo = dn.concatenate_cols(encoded_inputs) c = bo * alphas # c = dn.esum([h_input * dn.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)]) # compute output vector using current decoder state and context vector h_output = dn.tanh(self.w_c * dn.concatenate([h_t, c])) return h_output, alphas
def transduce(self, src: ExpressionSequence) -> ExpressionSequence: src = src.as_tensor() # convolutional layer src = padding(src, src.dim()[0][0], src.dim()[0][1], self.filter_width, self.stride, src.dim()[1]) l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filter_conv), stride = [self.stride, self.stride], is_valid = True)) timestep = l1.dim()[0][1] features = l1.dim()[0][2] batch_size = l1.dim()[1] # transpose l1 to be (timesetp, dim), but keep the batch_size. rhn_in = dy.reshape(l1, (timestep, features), batch_size = batch_size) rhn_in = [dy.pick(rhn_in, i) for i in range(timestep)] for l in range(self.rhn_num_hidden_layers): rhn_out = [] # initialize a random vector for the first state vector, keep the same batch size. prev_state = dy.parameter(self.init[l]) # begin recurrent high way network for t in range(timestep): for m in range(0, self.rhn_microsteps): H = dy.affine_transform([dy.parameter(self.recur[l][m][1]), dy.parameter(self.recur[l][m][0]), prev_state]) T = dy.affine_transform([dy.parameter(self.recur[l][m][3]), dy.parameter(self.recur[l][m][2]), prev_state]) if m == 0: H += dy.parameter(self.linear[l][0]) * rhn_in[t] T += dy.parameter(self.linear[l][1]) * rhn_in[t] H = dy.tanh(H) T = dy.logistic(T) prev_state = dy.cmult(1 - T, prev_state) + dy.cmult(T, H) # ((1024, ), batch_size) rhn_out.append(prev_state) if self.residual and l>0: rhn_out = [sum(x) for x in zip(rhn_out, rhn_in)] rhn_in = rhn_out # Compute the attention-weighted average of the activations rhn_in = dy.concatenate_cols(rhn_in) scores = dy.transpose(dy.parameter(self.attention[0][1]))*dy.tanh(dy.parameter(self.attention[0][0])*rhn_in) # ((1,510), batch_size) scores = dy.reshape(scores, (scores.dim()[0][1],), batch_size = scores.dim()[1]) attn_out = rhn_in*dy.softmax(scores) # # rhn_in.as_tensor() is ((1024,510), batch_size) softmax is ((510,), batch_size) return ExpressionSequence(expr_tensor = attn_out)
def decode(vectors, output): output = [EOS] + list(output) + [EOS] output = [char2id[c] for c in output] w = dy.parameter(decoder_w) b = dy.parameter(decoder_b) w1 = dy.parameter(attention_w1) input_mat = dy.concatenate_cols(vectors) w1dt = None last_output_embeddings = output_lookup[char2id[EOS]] s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE*2), last_output_embeddings])) loss = [] for char in output: # w1dt can be computed and cached once for the entire decoding phase w1dt = w1dt or w1 * input_mat vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings]) s = s.add_input(vector) out_vector = w * s.output() + b probs = dy.softmax(out_vector) last_output_embeddings = output_lookup[char] loss.append(-dy.log(dy.pick(probs, char))) loss = dy.esum(loss) return loss
def generate(i, s, id2char): """ Generate a word form for the lemma at position i in sentence s. """ context = get_context(i,s) embedded = embed(s[i][LEMMA],context) encoded = encode(embedded) in_seq = s[i][LEMMA] w = dy.parameter(decoder_w) b = dy.parameter(decoder_b) w1 = dy.parameter(attention_w1) input_mat = dy.concatenate_cols(encoded) w1dt = None last_output_embeddings = output_lookup[char2id[EOS]] s = dec_lstm.initial_state().add_input(dy.concatenate( [dy.vecInput(STATE_SIZE * 2), last_output_embeddings])) out = '' count_EOS = 0 for i in range(len(in_seq)*2): if count_EOS == 2: break # w1dt can be computed and cached once for the entire decoding phase w1dt = w1dt or w1 * input_mat vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings]) s = s.add_input(vector) out_vector = w * s.output() + b probs = dy.softmax(out_vector).vec_value() next_char = probs.index(max(probs)) last_output_embeddings = output_lookup[next_char] if id2char[next_char] == EOS: count_EOS += 1 continue out += id2char[next_char] return out
def decode(self, encoded, output_words, output_tags, output_index, masks): input_mat = dy.concatenate_cols(encoded) w1dt = None last_output_embeddings = dy.lookup_batch(self.wlookup, output_words[0]) last_tag_embeddings = dy.lookup_batch(self.tlookup, output_tags[0]) empty_tensor = dy.reshape(dy.inputTensor(np.zeros((self.options.hdim * 2, len(output_words[0])), dtype=float)), (self.options.hdim * 2,), len(output_words[0])) s = self.dec_lstm.initial_state().add_input(dy.concatenate([empty_tensor, last_output_embeddings, last_tag_embeddings])) loss = [] for p, word in enumerate(output_words): # w1dt can be computed and cached once for the entire decoding phase mask_tensor = dy.reshape(dy.inputTensor(masks[p]), (1,), len(masks[p])) w1dt = w1dt or self.attention_w1.expr() * input_mat att_weights = self.attend(s, w1dt, True) vector = dy.concatenate([input_mat * att_weights, last_output_embeddings, last_tag_embeddings]) if self.options.dropout > 0: vector = dy.dropout(vector, self.options.dropout) s = s.add_input(vector) last_output_embeddings = dy.lookup_batch(self.wlookup, word) last_tag_embeddings = dy.lookup_batch(self.tlookup, output_tags[p]) loss_p = dy.cmult(dy.pick_batch(-dy.log(att_weights), output_index[p]), mask_tensor) loss.append(dy.sum_batches(loss_p)/loss_p.dim()[1]) return loss
def translate_sentence(self, sent): dy.renew_cg() W_y = dy.parameter(self.W_y) b_y = dy.parameter(self.b_y) W1_att_f = dy.parameter(self.W1_att_f) W1_att_e = dy.parameter(self.W1_att_e) w2_att = dy.parameter(self.w2_att) sent = [startSymbol] + sent + [endSymbol] sent_rev = list(reversed(sent)) # Bidirectional representations l2r_state = self.l2r_builder.initial_state() r2l_state = self.r2l_builder.initial_state() l2r_contexts = [] r2l_contexts = [] for (cw_l2r, cw_r2l) in zip(sent, sent_rev): l2r_state = l2r_state.add_input( dy.lookup(self.src_lookup, self.src_token_to_id[cw_l2r])) r2l_state = r2l_state.add_input( dy.lookup(self.src_lookup, self.src_token_to_id[cw_r2l])) l2r_contexts.append(l2r_state.output()) r2l_contexts.append(r2l_state.output()) r2l_contexts.reverse() h_fs = [] for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts): h_fs.append(dy.concatenate([l2r_i, r2l_i])) h_fs_matrix = dy.concatenate_cols(h_fs) # Decoder trans_sentence = [startSymbol] cw = trans_sentence[-1] #initial context c_t = dy.vecInput(self.hidden_size * 2) start = dy.concatenate( [dy.lookup(self.tgt_lookup, self.tgt_token_to_id[endSymbol]), c_t]) dec_state = self.dec_builder.initial_state().add_input(start) i = 0 while len(trans_sentence) < self.max_len: i += 1 h_e = dec_state.output() getAttention = self.__attention_mlp(h_fs_matrix, h_e) c_t = getAttention[0] embed_t = dy.lookup(self.tgt_lookup, self.tgt_token_to_id[cw]) x_t = dy.concatenate([embed_t, c_t]) dec_state = dec_state.add_input(x_t) y_star = dy.softmax(W_y * dec_state.output() + b_y).vec_value() next_wordID = np.argmax(y_star) if i == 1: #print y_star[next_wordID] pass cw = self.tgt_id_to_token[next_wordID] cpcw = cw if i < 5: #print (i,cw) pass if cw == unkSymbol: #find the source word with highest attention score keyWord = sent[getAttention[1]] if self.src_token_to_id[keyWord] == self.src_token_to_id[ unkSymbol]: cw = keyWord #special word . simply pass it source word out #print (i,cw,1) else: #find the target word with second max prob #prob: y_star next_wordID = np.argpartition(y_star, 1)[1] cw = self.tgt_id_to_token[next_wordID] #print (i,cw,2) if cw == endSymbol: break if cw != startSymbol: trans_sentence.append(cw) cw = cpcw return ' '.join(trans_sentence[1:])
def rnn_mlp(self, sens): ''' Here, I assumed all sens have the same length. ''' words, pwords, pos, chars = sens[0], sens[1], sens[2], sens[5] # words: indices of words in wlookup. # words shape: sent_length x batch_size (length x batch) if self.options.use_char: cembed = [dy.lookup_batch(self.clookup, c) for c in chars] char_fwd, char_bckd = self.char_lstm.builder_layers[0][0].initial_state().transduce(cembed)[-1],\ self.char_lstm.builder_layers[0][1].initial_state().transduce(reversed(cembed))[-1] crnn = dy.reshape(dy.concatenate_cols([char_fwd, char_bckd]), (self.options.we, words.shape[0] * words.shape[1])) cnn_reps = [list() for _ in range(len(words))] for i in range(words.shape[0]): cnn_reps[i] = dy.pick_batch( crnn, [i * words.shape[1] + j for j in range(words.shape[1])], 1) wembed = [ dy.lookup_batch(self.wlookup, words[i]) + dy.lookup_batch(self.elookup, pwords[i]) + cnn_reps[i] for i in range(len(words)) ] else: wembed = [ dy.lookup_batch(self.wlookup, words[i]) + dy.lookup_batch(self.elookup, pwords[i]) for i in range(len(words)) ] posembed = [ dy.lookup_batch(self.plookup, pos[i]) for i in range(len(pos)) ] if self.options.use_pos else None inputs = [dy.concatenate([w, pos]) for w, pos in zip(wembed, posembed) ] if self.options.use_pos else wembed h_out = self.bi_rnn(inputs, words.shape[1], 0, 0) #self.deep_lstms.transduce(inputs) # h_out: python list of concatenated BiLSTM hidden state # BiLSTM hidden tape (python list --> dynet tensor) h = dy.concatenate_cols(h_out) # shape: batch x ( 2*rnn x len ) # arc-head H = self.activation( dy.affine_transform( [self.arc_mlp_head_b.expr(), self.arc_mlp_head.expr(), h])) # arc-modifier M = self.activation( dy.affine_transform( [self.arc_mlp_dep_b.expr(), self.arc_mlp_dep.expr(), h])) # arc-head for label HL = self.activation( dy.affine_transform( [self.label_mlp_head_b.expr(), self.label_mlp_head.expr(), h])) # arc-modifier for label ML = self.activation( dy.affine_transform( [self.label_mlp_dep_b.expr(), self.label_mlp_dep.expr(), h])) return h, H, M, HL, ML
def run(self, triple, isTrain): MLP = dy.parameter(self.MLP) MLP_bias = dy.parameter(self.MLP_bias) MLP_attn = dy.parameter(self.MLP_attn) MLP_attn_bias = dy.parameter(self.MLP_attn_bias) attn_weight = dy.parameter(self.attn_weight) classifier = dy.parameter(self.classifier) classifier_bias = dy.parameter(self.classifier_bias) s, t, f = triple s = [BOW] + s + [EOW] t = [BOW] + t + [EOW] char_embs = [self.lp_c[c] for c in s] top_recur = utils.biLSTM(self.LSTM_builders, char_embs, dropout_h=self._pdrop_lstm if isTrain else 0., dropout_x=self._pdrop_lstm if isTrain else 0.) key = dy.concatenate_cols(top_recur[1:-1]) feat_embs = [] for idx in range(len(self.lp_feats)): if idx < len(f): feat_embs.append(self.lp_feats[idx][f[idx]]) else: feat_embs.append(dy.inputVector(np.zeros(self._feat_dim))) feat_embs = dy.concatenate(feat_embs) prev_char = BOW pred_word = [] losses = [] prev_top_recur = dy.inputVector(np.zeros(self._hidden_dim)) state = self.dec_LSTM.initial_state() idx = 0 while prev_char != EOW: tmp = dy.concatenate( [self.lp_c[prev_char], feat_embs, prev_top_recur]) if isTrain: tmp = dy.dropout(tmp, self._pdrop_embs) h = dy.affine_transform([MLP_attn_bias, MLP_attn, tmp]) if isTrain: h = dy.dropout(h, self._pdrop_mlp) query = dy.cmult(attn_weight, dy.rectify(h)) attn_vec = dy.softmax(dy.transpose(key) * query) value = key * attn_vec inp = dy.concatenate([value, tmp]) inp = dy.affine_transform([MLP_bias, MLP, inp]) h = state.add_input(inp).output() top_recur = dy.rectify(h) if isTrain: top_recur = dy.dropout(top_recur, self._pdrop_mlp) prev_top_recur = h score = dy.affine_transform( [classifier_bias, classifier, top_recur]) if isTrain: losses.append(dy.pickneglogsoftmax(score, t[idx + 1])) prev_char = t[idx + 1] idx += 1 else: pred_char = score.npvalue().argmax() pred_word.append(pred_char) prev_char = pred_char if len(pred_word) > 30: break return pred_word, losses
def beam_search(self, pre_context, pos_context, entity, beam): embedded = self.embed_sentence(pre_context) pre_encoded = self.encode_sentence(self.encpre_fwd_lstm, self.encpre_bwd_lstm, embedded) embedded = self.embed_sentence(pos_context) pos_encoded = self.encode_sentence(self.encpos_fwd_lstm, self.encpos_bwd_lstm, embedded) w = dy.parameter(self.decoder_w) b = dy.parameter(self.decoder_b) w1_pre = dy.parameter(self.attention_w1_pre) h_pre = dy.concatenate_cols(pre_encoded) w1dt_pre = None w1_pos = dy.parameter(self.attention_w1_pos) h_pos = dy.concatenate_cols(pos_encoded) w1dt_pos = None try: entity_embedding = self.input_lookup[self.input2int[entity]] except: entity_embedding = self.input_lookup[self.input2int[self.EOS]] last_output_embeddings = self.output_lookup[self.output2int[self.EOS]] s = self.dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(self.STATE_SIZE*2), last_output_embeddings, entity_embedding])) candidates = [{'sentence':[self.EOS], 'prob':0.0, 'count_EOS':0, 's':s}] outputs = [] i = 0 while i < self.config['GENERATION'] and len(outputs) < beam: new_candidates = [] for candidate in candidates: if candidate['count_EOS'] == 2: outputs.append(candidate) if len(outputs) == beam: break else: # w1dt can be computed and cached once for the entire decoding phase w1dt_pre = w1dt_pre or w1_pre * h_pre w1dt_pos = w1dt_pos or w1_pos * h_pos attention_pre = self.attend(h_pre, candidate['s'], w1dt_pre, self.attention_w2_pre, self.attention_v_pre) attention_pos = self.attend(h_pos, candidate['s'], w1dt_pos, self.attention_w2_pos, self.attention_v_pos) last_output_embeddings = self.output_lookup[self.output2int[candidate['sentence'][-1]]] vector = dy.concatenate([self.hier_attend(attention_pre, attention_pos, candidate['s']), last_output_embeddings, entity_embedding]) s = candidate['s'].add_input(vector) out_vector = w * s.output() + b probs = dy.softmax(out_vector).vec_value() next_words = [{'prob':e, 'index':probs.index(e)} for e in sorted(probs, reverse=True)[:beam]] for next_word in next_words: word = self.int2output[next_word['index']] new_candidate = { 'sentence': candidate['sentence'] + [word], 'prob': candidate['prob'] + np.log(next_word['prob']), 'count_EOS': candidate['count_EOS'], 's':s } if word == self.EOS: new_candidate['count_EOS'] += 1 new_candidates.append(new_candidate) candidates = sorted(new_candidates, key=lambda x: x['prob'], reverse=True)[:beam] i += 1 if len(outputs) == 0: outputs = candidates # Length Normalization alpha = 0.6 for output in outputs: length = len(output['sentence']) lp_y = ((5.0 + length)**alpha) / ((5.0+1.0)**alpha) output['prob'] = output['prob'] / lp_y outputs = sorted(outputs, key=lambda x: x['prob'], reverse=True) return list(map(lambda x: x['sentence'], outputs))
def encode(self, embed_list): embed_list = dy.transpose(dy.concatenate_cols(embed_list)) return [ self.output(out) for out in self.encoder(embed_list, self.train) ]
def encode(self, embed_list): embed_list = dy.transpose(dy.concatenate_cols(embed_list)) return [self.output(out) for out in self.encoder(embed_list, self.train)]
def cache_encoder(self, context_vectors): """Cache the context vectors and project them into a new space.""" self.context = dy.concatenate_cols(context_vectors) # ((H, T), B) self.context_proj = self.A * self.context # ((H, T), B)
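A toy illustration (made-up sizes, random tensors standing in for the learned projection) of the shape bookkeeping in these cache_encoder variants: T encoder vectors of dim ((H,), B) become one matrix of dim ((H, T), B), and a single projection of that matrix replaces T separate projections per decoder step.

import dynet as dy
import numpy as np

H, T, B = 4, 5, 2
dy.renew_cg()
context_vectors = [dy.inputTensor(np.random.rand(H, B), batched=True) for _ in range(T)]
context = dy.concatenate_cols(context_vectors)
print(context.dim())        # ((4, 5), 2)
A = dy.inputTensor(np.random.rand(H, H))   # stands in for the learned projection
print((A * context).dim())  # ((4, 5), 2), computed once and cached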
def step(self, instances): dy.renew_cg() W_y = dy.parameter(self.W_y) b_y = dy.parameter(self.b_y) W1_att_f = dy.parameter(self.W1_att_f) W1_att_e = dy.parameter(self.W1_att_e) w2_att = dy.parameter(self.w2_att) #instances : a list [(src0,tgt0),(src1,tgt1),(src2,tgt2)] maxLen = max(map(lambda x: len(x[1]), instances)) src_sents = [] src_sents_rev = [] tgt_sents = [] srcSenLen = len( instances[0][0]) + 2 #the length of the src sentence, all the same tgtSenLen = maxLen + 1 masks = [ [] for i in range(tgtSenLen) ] #mask for each position. each item in this list is a list with length=batchsize num_words = 0 for item in instances: #item[0]:src ; item[1]:tgt num_words += (len(item[1]) + 1) padNum = maxLen - len(item[1]) for i in range(len(item[1]) + 1): masks[i].append(1) for i in range(len(item[1]) + 1, tgtSenLen): masks[i].append(0) thisSrc = [startSymbol] + item[0] + [endSymbol] src_sents.append(thisSrc) src_sents_rev.append(list(reversed(thisSrc))) thisTgt = item[1] + [endSymbol for i in range(padNum + 1)] tgt_sents.append(thisTgt) # Bidirectional representations l2r_state = self.l2r_builder.initial_state() r2l_state = self.r2l_builder.initial_state() l2r_contexts = [] r2l_contexts = [] for i in range(srcSenLen): batchSrc = dy.lookup_batch( self.src_lookup, [self.src_token_to_id[x[i]] for x in src_sents]) batchSrc_rev = dy.lookup_batch( self.src_lookup, [self.src_token_to_id[x[i]] for x in src_sents_rev]) l2r_state = l2r_state.add_input(batchSrc) r2l_state = r2l_state.add_input(batchSrc_rev) l2r_contexts.append(l2r_state.output()) r2l_contexts.append(r2l_state.output()) r2l_contexts.reverse() # Combine the left and right representations for every word h_fs = [] for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts): h_fs.append(dy.concatenate([l2r_i, r2l_i])) h_fs_matrix = dy.concatenate_cols(h_fs) losses = [] # Decoder c_t = dy.vecInput(self.hidden_size * 2) start = dy.concatenate([ dy.lookup_batch(self.tgt_lookup, [self.tgt_token_to_id['<S>'] for i in tgt_sents]), c_t ]) dec_state = self.dec_builder.initial_state().add_input(start) loss = dy.pickneglogsoftmax_batch( W_y * dec_state.output() + b_y, [self.tgt_token_to_id[tgt_sent[0]] for tgt_sent in tgt_sents]) losses.append(loss) for i in range(tgtSenLen - 1): #cw : item[i] nw:item[i+1] h_e = dec_state.output() c_t = self.__attention_mlp(h_fs_matrix, h_e)[0] # Get the embedding for the current target word embed_t = dy.lookup_batch( self.tgt_lookup, [self.tgt_token_to_id[tgt_sent[i]] for tgt_sent in tgt_sents]) # Create input vector to the decoder x_t = dy.concatenate([embed_t, c_t]) dec_state = dec_state.add_input(x_t) loss = dy.pickneglogsoftmax_batch(W_y * dec_state.output() + b_y, [ self.tgt_token_to_id[tgt_sent[i + 1]] for tgt_sent in tgt_sents ]) thisMask = dy.inputVector(masks[i + 1]) thisMask = dy.reshape(thisMask, (1, ), len(instances)) losses.append(loss * thisMask) return dy.sum_batches(dy.esum(losses)), num_words
def calc_loss(sents): dy.renew_cg() # Transduce all batch elements with an LSTM src_sents = [x[0] for x in sents] tgt_sents = [x[1] for x in sents] src_cws = [] src_len = [len(sent) for sent in src_sents] max_src_len = np.max(src_len) num_words = 0 for i in range(max_src_len): src_cws.append([sent[i] for sent in src_sents]) #get the outputs of the first LSTM src_outputs = [dy.concatenate([x.output(), y.output()]) for x,y in LSTM_SRC.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])] src_output = src_outputs[-1] #gets the parameters for the attention src_output_matrix = dy.concatenate_cols(src_outputs) w1_att_src = dy.parameter(w1_att_src_p) fixed_attentional_component = w1_att_src * src_output_matrix #now decode all_losses = [] # Decoder #need to mask padding at end of sentence tgt_cws = [] tgt_len = [len(sent) for sent in sents] max_tgt_len = np.max(tgt_len) masks = [] for i in range(max_tgt_len): tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents]) mask = [(1 if len(sent) > i else 0) for sent in tgt_sents] masks.append(mask) num_words += sum(mask) current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)]) prev_words = tgt_cws[0] W_sm = dy.parameter(W_sm_p) b_sm = dy.parameter(b_sm_p) W_m = dy.parameter(W_m_p) b_m = dy.parameter(b_m_p) for next_words, mask in zip(tgt_cws[1:], masks): #feed the current state into the current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words)) output_embedding = current_state.output() att_output, _ = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component) middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])])) s = dy.affine_transform([b_sm, W_sm, middle_expr]) loss = (dy.pickneglogsoftmax_batch(s, next_words)) mask_expr = dy.inputVector(mask) mask_expr = dy.reshape(mask_expr, (1,),len(sents)) mask_loss = loss * mask_expr all_losses.append(mask_loss) prev_words = next_words return dy.sum_batches(dy.esum(all_losses)), num_words
def translate_sentence(self, sent, lang): dy.renew_cg() W_y = dy.parameter(self.W_y[lang]) b_y = dy.parameter(self.b_y[lang]) W1_att_e = dy.parameter(self.W1_att_e) W1_att_f = dy.parameter(self.W1_att_f) w2_att = dy.parameter(self.w2_att) M_s = self.src_lookup M_t = self.tgt_lookup[lang] src_sent = sent src_sent_rev = list(reversed(sent)) # Bidirectional representations l2r_state = self.l2r_builder.initial_state() r2l_state = self.r2l_builder.initial_state() l2r_contexts = [] r2l_contexts = [] for (cw_l2r, cw_r2l) in zip(src_sent, src_sent_rev): l2r_state = l2r_state.add_input(M_s[cw_l2r]) r2l_state = r2l_state.add_input(M_s[cw_r2l]) l2r_contexts.append(l2r_state.output()) # [<S>, x_1, x_2, ..., </S>] r2l_contexts.append(r2l_state.output()) # [</S> x_n, x_{n-1}, ... <S>] r2l_contexts.reverse() # [<S>, x_1, x_2, ..., </S>] # Combine the left and right representations for every word h_fs = [] for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts): h_fs.append(dy.concatenate([l2r_i, r2l_i])) encoded_h = h_fs[-1] h_fs_matrix = dy.concatenate_cols(h_fs) # h_fs_matrix_t = dy.transpose(h_fs_matrix) # Decoder trans_sentence = [u'<s>'] cw = self.tgt_vocab[lang][u'<s>'] c_t = dy.vecInput(self.hidden_size * 2) c_t.set([0 for i in xrange(self.contextsize)]) dec_state = self.dec_builder[lang].initial_state([encoded_h]) while len(trans_sentence) < self.max_len: embed = dy.lookup(M_t,cw) dec_state = dec_state.add_input(dy.concatenate([embed, c_t])) h_e = dec_state.output() # c_t = self.__attention_mlp(h_fs_matrix, h_e) c_t = self.__attention_mlp(h_fs_matrix, h_e, W1_att_e, W1_att_f, w2_att) # calculate attention ''' a_t = h_fs_matrix_t * h_e alignment = dy.softmax(a_t) c_t = h_fs_matrix * alignment''' ind_tem = dy.concatenate([h_e, c_t]) ind_tem1 = W_y * ind_tem ind_tem2 = ind_tem1 + b_y score = dy.softmax(ind_tem2) probs1 = score.npvalue() cw = np.argmax(probs1) if cw == self.tgt_vocab[lang][u'</s>']: break trans_sentence.append(self.rtgt_vocab[lang][cw]) return trans_sentence[1:]
def run(self, word_inputs, lemma_inputs, tag_inputs, pred_golds, rel_targets=None, isTrain=True): # inputs, targets: seq_len x batch_size def dynet_flatten_numpy(ndarray): return np.reshape(ndarray, (-1,), 'F') batch_size = word_inputs.shape[1] seq_len = word_inputs.shape[0] marker = self._vocab.PAD if self._unified else self._vocab.DUMMY mask = np.greater(word_inputs, marker).astype(np.float32) num_tokens = int(np.sum(mask)) word_embs = [dy.lookup_batch(self.word_embs, np.where(w < self._vocab.words_in_train, w, self._vocab.UNK) ) for w in word_inputs] pre_embs = [dy.lookup_batch(self.pret_word_embs, w) for w in word_inputs] flag_embs = [dy.lookup_batch(self.flag_embs, np.array(w == i + 1, dtype=np.int) ) for i, w in enumerate(pred_golds)] lemma_embs = [dy.lookup_batch(self.lemma_embs, lemma) for lemma in lemma_inputs] tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs] if isTrain: emb_masks = self.generate_emb_mask(seq_len, batch_size) emb_inputs = [dy.concatenate([dy.cmult(word, wm), dy.cmult(pre, wm), dy.cmult(flag, wm), dy.cmult(lemma, wm), dy.cmult(pos, posm)]) for word, pre, flag, lemma, pos, (wm, posm) in zip(word_embs, pre_embs, flag_embs, lemma_embs, tag_embs, emb_masks)] else: emb_inputs = [dy.concatenate([word, pre, flag, lemma, pos]) for word, pre, flag, lemma, pos in zip(word_embs, pre_embs, flag_embs, lemma_embs, tag_embs)] top_recur = dy.concatenate_cols( biLSTM(self.LSTM_builders, emb_inputs, batch_size, self.dropout_lstm_input if isTrain else 0., self.dropout_lstm_hidden if isTrain else 0.)) if isTrain: top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp) W_arg, b_arg = dy.parameter(self.mlp_arg_W), dy.parameter(self.mlp_arg_b) W_pred, b_pred = dy.parameter(self.mlp_pred_W), dy.parameter(self.mlp_pred_b) arg_hidden = leaky_relu(dy.affine_transform([b_arg, W_arg, top_recur])) # pred_hidden = leaky_relu(dy.affine_transform([b_pred, W_pred, top_recur])) predicates_1D = pred_golds[0] pred_recur = dy.pick_batch(top_recur, predicates_1D, dim=1) pred_hidden = leaky_relu(dy.affine_transform([b_pred, W_pred, pred_recur])) if isTrain: arg_hidden = dy.dropout_dim(arg_hidden, 1, self.dropout_mlp) # pred_hidden = dy.dropout_dim(pred_hidden, 1, self.dropout_mlp) pred_hidden = dy.dropout(pred_hidden, self.dropout_mlp) W_rel = dy.parameter(self.rel_W) # rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, batch_size, # num_outputs = self._vocab.rel_size, bias_x = True, bias_y = True) # # (#pred x rel_size x #arg) x batch_size # flat_rel_logits = dy.reshape(rel_logits, (seq_len, self._vocab.rel_size), seq_len * batch_size) # # (#pred x rel_size) x (#arg x batch_size) # predicates_1D = dynet_flatten_numpy(pred_golds) # partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D) # # (rel_size) x (#arg x batch_size) rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, 1, batch_size, num_outputs=self._vocab.rel_size, bias_x=True, bias_y=True) # (1 x rel_size x #arg) x batch_size flat_rel_logits = dy.reshape(rel_logits, (1, self._vocab.rel_size), seq_len * batch_size) # (1 x rel_size) x (#arg x batch_size) predicates_1D = np.zeros(dynet_flatten_numpy(pred_golds).shape[0]) partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D) # (1 x rel_size) x (#arg x batch_size) if isTrain: mask_1D = dynet_flatten_numpy(mask) mask_1D_tensor = dy.inputTensor(mask_1D, batched=True) rel_preds = partial_rel_logits.npvalue().argmax(0) targets_1D = dynet_flatten_numpy(rel_targets) rel_correct = np.equal(rel_preds, targets_1D).astype(np.float32) * mask_1D rel_accuracy = np.sum(rel_correct) / num_tokens losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D) rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens return rel_accuracy, rel_loss # rel_probs = np.transpose(np.reshape(dy.softmax(dy.transpose(flat_rel_logits)).npvalue(), # (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F')) rel_probs = np.transpose(np.reshape(dy.softmax(dy.transpose(flat_rel_logits)).npvalue(), (self._vocab.rel_size, 1, seq_len, batch_size), 'F')) outputs = [] # for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T, rel_probs): # msk[0] = 1. # sent_len = int(np.sum(msk)) # rel_prob = rel_prob[np.arange(len(pred_gold)), pred_gold] # rel_pred = rel_argmax(rel_prob) # outputs.append(rel_pred[:sent_len]) for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T, rel_probs): msk[0] = 1. sent_len = int(np.sum(msk)) rel_prob = rel_prob[np.arange(len(pred_gold)), 0] rel_pred = rel_argmax(rel_prob) outputs.append(rel_pred[:sent_len]) return outputs
def get_bert_embed(self, passage, lang, train=False): orig_tokens = passage bert_tokens = [] # Token map will be an int -> int mapping between the `orig_tokens` index and # the `bert_tokens` index. orig_to_tok_map = [] # Example: # orig_tokens = ["John", "Johanson", "'s", "house"] # bert_tokens == ["[CLS]", "john", "johan", "##son", "'", "s", "house", "[SEP]"] # orig_to_tok_map == [(1), (2,3), (4,5), (6)] bert_tokens.append("[CLS]") for orig_token in orig_tokens: start_token = len(bert_tokens) bert_token = self.tokenizer.tokenize(orig_token) bert_tokens.extend(bert_token) end_token = start_token + len(bert_token) orig_to_tok_map.append(slice(start_token, end_token)) bert_tokens.append("[SEP]") indexed_tokens = self.tokenizer.convert_tokens_to_ids(bert_tokens) tokens_tensor = self.torch.tensor([indexed_tokens]) if self.config.args.bert_gpu: tokens_tensor = tokens_tensor.to('cuda') with self.torch.no_grad(): encoded_layers, _ = self.bert_model(tokens_tensor) assert len( encoded_layers ) == self.bert_layers_count, "Invalid BERT layer count %s" % len( encoded_layers) aligned_layer = [] for layer in range(self.bert_layers_count): aligned_layer.append([]) for mapping_range in orig_to_tok_map: token_embeddings = encoded_layers[layer][0][mapping_range] if self.config.args.bert_token_align_by == "mean": aligned_layer[layer].append( self.torch.mean(token_embeddings, dim=(0, )).cpu().data.numpy()) elif self.config.args.bert_token_align_by == "sum": aligned_layer[layer].append( self.torch.sum(token_embeddings, dim=(0, )).cpu().data.numpy()) elif self.config.args.bert_token_align_by == "first": aligned_layer[layer].append( token_embeddings[0].cpu().data.numpy()) else: raise ValueError("Invalid BERT token align option '%s'" % self.config.args.bert_token_align_by) layer_list_to_use = self.config.args.bert_layers aligned_layer = [aligned_layer[i] for i in layer_list_to_use] if self.config.args.bert_layers_pooling == "weighted": bert_softmax = dy.softmax(self.params["bert_weights"]) embeds = dy.cmult(dy.inputTensor(np.asarray(aligned_layer)), bert_softmax) embeds = dy.sum_dim(embeds, [0]) elif self.config.args.bert_layers_pooling == "concat": embeds = dy.inputTensor(np.concatenate(aligned_layer, axis=1)) elif self.config.args.bert_layers_pooling == "sum": embeds = dy.inputTensor(np.sum(aligned_layer, axis=0)) else: raise ValueError("Invalid BERT pooling option '%s'" % self.config.args.bert_layers_pooling) if self.config.args.bert_multilingual == 0: assert lang if (lang + "_embed") in self.params: lang_embed = self.params[lang + "_embed"] else: lang_embed = self.model.add_parameters(50, init='glorot') self.params[lang + "_embed"] = lang_embed multilingual_embeds = [] for embed in embeds: multilingual_embeds.append(dy.concatenate([lang_embed, embed])) embeds = dy.transpose(dy.concatenate_cols(multilingual_embeds)) if self.config.args.bert_layers_pooling == "weighted": single_token_embed_len = self.bert_embedding_len elif self.config.args.bert_layers_pooling == "concat": single_token_embed_len = self.bert_embedding_len * len( layer_list_to_use) elif self.config.args.bert_layers_pooling == "sum": single_token_embed_len = self.bert_embedding_len else: raise ValueError("Invalid BERT pooling option '%s'" % self.config.args.bert_layers_pooling) if self.config.args.bert_multilingual == 0: single_token_embed_len += 50 # TODO: try dropout strategies like dropping at the per layer embeddings or dropping entire layers. assert embeds.dim() == ((len(passage), single_token_embed_len), 1), "Invalid BERT dim %s" % embeds.dim() assert 0 <= self.config.args.bert_dropout < 1, "Invalid BERT dropout %s" % self.config.args.bert_dropout if train: embeds = dy.dropout(embeds, self.config.args.bert_dropout) return embeds
e = e1[k:v] # same e = dy.pickneglogsoftmax( e1, k) # k is unsigned integer. equiv to: (pick(-log(dy.softmax(e1)), k)) # Neural net stuff dy.noise( e1, stddev ) # add noise to each element from a gaussian with standard-dev = stddev dy.dropout(e1, p) # apply dropout with probability p # functions over lists of expressions e = dy.esum([e1, e2, ...]) # sum e = dy.average([e1, e2, ...]) # average e = dy.concatenate_cols( [e1, e2, ...] ) # e1, e2,.. are column vectors. return a matrix. (sim to np.hstack([e1,e2,...])) e = dy.concatenate([e1, e2, ...]) # concatenate e = dy.affine_transform([e0, e1, e2, ...]) # e = e0 + ((e1*e2) + (e3*e4) ...) ## Loss functions e = dy.squared_distance(e1, e2) e = dy.l1_distance(e1, e2) e = dy.huber_distance(e1, e2, c=1.345) # e1 must be a scalar that is a value between 0 and 1 # e2 (ty) must be a scalar that is a value between 0 and 1 # e = ty * log(e1) + (1 - ty) * log(1 - e1) e = dy.binary_log_loss(e1, e2)
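A quick, self-contained illustration of the two concatenation ops listed above, with toy vectors; shapes are shown as DyNet reports them, ((rows, cols), batch):

import dynet as dy

dy.renew_cg()
e1 = dy.inputVector([1.0, 2.0])
e2 = dy.inputVector([3.0, 4.0])
print(dy.concatenate([e1, e2]).dim())       # ((4,), 1)  one longer column vector
print(dy.concatenate_cols([e1, e2]).dim())  # ((2, 2), 1) side-by-side columns, roughly np.column_stack for 1-D inputs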
def translate_sentence(self, sent): dy.renew_cg() W_y = dy.parameter(self.W_y) b_y = dy.parameter(self.b_y) W1_att_f = dy.parameter(self.W1_att_f) W1_att_e = dy.parameter(self.W1_att_e) w2_att = dy.parameter(self.w2_att) sent = [startSymbol] + sent + [endSymbol] sent_rev = list(reversed(sent)) # Bidirectional representations l2r_state = self.l2r_builder.initial_state() r2l_state = self.r2l_builder.initial_state() l2r_contexts = [] r2l_contexts = [] for (cw_l2r, cw_r2l) in zip(sent, sent_rev): l2r_state = l2r_state.add_input( dy.lookup(self.src_lookup, self.src_token_to_id[cw_l2r])) r2l_state = r2l_state.add_input( dy.lookup(self.src_lookup, self.src_token_to_id[cw_r2l])) l2r_contexts.append(l2r_state.output()) r2l_contexts.append(r2l_state.output()) r2l_contexts.reverse() h_fs = [] for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts): h_fs.append(dy.concatenate([l2r_i, r2l_i])) h_fs_matrix = dy.concatenate_cols(h_fs) # Decoder trans_sentence1 = [startSymbol] trans_sentence2 = [startSymbol] cw1 = trans_sentence1[-1] cw2 = trans_sentence2[-1] #initial context c_t = dy.vecInput(self.hidden_size * 2) start = dy.concatenate( [dy.lookup(self.tgt_lookup, self.tgt_token_to_id[endSymbol]), c_t]) init_state = self.dec_builder.initial_state().add_input(start) def generate_top_n(logProb, state, words, wordID, n): if words[-1] == endSymbol: yield logProb, words h_e = state.output() c_t, unkIndex = self.__attention_mlp(h_fs_matrix, h_e) embed_t = dy.lookup(self.tgt_lookup, wordID) x_t = dy.concatenate([embed_t, c_t]) next_state = state.add_input(x_t) y_star = np.reshape( dy.softmax(W_y * next_state.output() + b_y).npvalue(), -1) for nextWordID in np.argpartition(-y_star, n)[n]: currentWord = self.tgt_id_to_token[nextWordID] if currentWord == unkSymbol: currentWord = self.src_id_to_token[unkIndex] currentLogProb = logProb + np.log(y_star[nextWordID]) newWords = words + [currentWord] yield currentLogProb, generate_top_n(currentProb, newWords, nextWordID, n), newWords beamSize = 2 trans = [] currentBeam = [(0, generate_top_n(0, init_state, [startSymbol], self.tgt_token_to_id[startSymbol], beamSize), [startSymbol])] remainStep = self.max_len + 2 while not trans and remainStep > 0: nextBeam = [] while currentBeam: _, maxProbStep, _ = heappop(currentBeam) for next in maxProbStep: if isinstance(next[1], GeneratorType): heappush(nextBeam, next) else: trans.append(next) while len(nextBeam) > beamSize: heappop(nextBeam) currentBeam = nextBeam if trans: trans_sentence = max(trans)[-1][1:] else: trans_sentence = max(currentBeam)[-1][1:-1] return ' '.join(trans_sentence)
def attend(self, context, x): context_cols = dy.concatenate_cols(context) context_emb = dy.max_dim(context_cols, 1) return context_emb, None
def run(self, word_inputs, lengths, tag_inputs, arc_targets=None, rel_targets=None, isTrain=True): batch_size = word_inputs.shape[1] seq_len = word_inputs.shape[0] mask = (np.broadcast_to(np.reshape(np.arange(seq_len), (seq_len, 1)), (seq_len, batch_size)) < lengths).astype( np.float32) mask[0] = 0. num_tokens = int(np.sum(mask)) if isTrain or arc_targets is not None: mask_1D = self.dynet_flatten_numpy(mask) # batched here means that the last dim is treated as batch dimension, both in input and output mask_1D_tensor = dy.inputTensor(mask_1D, batched=True) # TODO: pay attention to _words_in_train # sum of the two embeddings, [Expression of dim=((embedding_dim,), batch_size)] * seq_len if self.e_ext is not None: word_embs = [ dy.lookup_batch( self.e_form, np.where(w < self.v_train, w, self.vocab_form.stoi["<unk>"])) + dy.lookup_batch(self.e_ext, w, update=False) for w in word_inputs ] # sum of the two embeddings [Expression] * seq_len else: word_embs = [ dy.lookup_batch( self.e_form, np.where(w < self.v_train, w, self.vocab_form.stoi["<unk>"])) for w in word_inputs ] tag_embs = [dy.lookup_batch(self.e_tag, pos) for pos in tag_inputs] if isTrain: emb_masks = self.generate_emb_msk(seq_len, batch_size) emb_inputs = [ dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)]) for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks) ] else: emb_inputs = [ dy.concatenate([w, pos]) for w, pos in zip(word_embs, tag_embs) ] top_recur = dy.concatenate_cols( biLSTM(self.lstm_builders, emb_inputs, batch_size, self.dropout_lstm_input if isTrain else 0., self.dropout_lstm_hidden if isTrain else 0.)) if isTrain: # drop some dim for lstm_output for all words, all sentences top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp) dep = leaky_relu( dy.affine_transform([self.mlp_dep_b, self.mlp_dep_W, top_recur])) head = leaky_relu( dy.affine_transform([self.mlp_head_b, self.mlp_head_W, top_recur])) if isTrain: dep, head = dy.dropout_dim(dep, 1, self.dropout_mlp), dy.dropout_dim( head, 1, self.dropout_mlp) # drop dim k means, it is possible that the whole dim k is set to zeros # for matrix with batch, ((R, C), B) # drop dim 0 means drop some cols, drop dim 1 means drop some rows # drop 2 means drop some batches, and it only supports for Tensor with rank <=3 dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:] head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:] arc_logits = bilinear(dep_arc, self.arc_W, head_arc, self.mlp_arc_size, seq_len, batch_size, num_outputs=1, bias_x=True, bias_y=False) # (#head x #dep) x batch_size flat_arc_logits = dy.reshape(arc_logits, (seq_len, ), seq_len * batch_size) # flatten it to compute loss # (#head ) x (#dep x batch_size) arc_preds = np.reshape(arc_logits.npvalue().argmax(0), (seq_len, batch_size)) # seq_len x batch_size # here if an Expression's batch size is 1 # npvalue() will drop the batch dimension # so add it back if needed if isTrain or arc_targets is not None: # train it in a neg log likelihood fashion, but enforce tree constraint when testing arc_correct = np.equal(arc_preds, arc_targets).astype( np.float32) * mask # mask is used to filter <pad>'s out in summing loss arc_accuracy = np.sum(arc_correct) / num_tokens targets_1D = self.dynet_flatten_numpy(arc_targets) losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D) arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens if not isTrain: arc_probs = np.transpose( np.reshape( dy.softmax(flat_arc_logits).npvalue(), (seq_len, seq_len, batch_size), 'F')) # #batch_size x #dep x #head; transpose reverses all axes, and since the layout has changed, it's totally fine rel_logits = bilinear(dep_rel, self.rel_W, head_rel, self.mlp_rel_size, seq_len, batch_size, num_outputs=len(self.vocab_deprel), bias_x=True, bias_y=True) # (#head x rel_size x #dep) x batch_size flat_rel_logits = dy.reshape(rel_logits, (seq_len, len(self.vocab_deprel)), seq_len * batch_size) # (#head x rel_size) x (#dep x batch_size) partial_rel_logits = dy.pick_batch( flat_rel_logits, targets_1D if isTrain else self.dynet_flatten_numpy(arc_preds)) # (rel_size) x (#dep x batch_size) if isTrain or arc_targets is not None: rel_preds = partial_rel_logits.npvalue().argmax(0) targets_1D = self.dynet_flatten_numpy(rel_targets) rel_correct = np.equal(rel_preds, targets_1D).astype( np.float32) * mask_1D # given the shape here, mask_1D is needed rel_accuracy = np.sum(rel_correct) / num_tokens losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D) rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens if not isTrain: rel_probs = np.transpose( np.reshape( dy.softmax(dy.transpose(flat_rel_logits)).npvalue(), (len(self.vocab_deprel), seq_len, seq_len, batch_size), 'F')) # batch_size x #dep x #head x #nclasses if isTrain or arc_targets is not None: loss = arc_loss + rel_loss correct = rel_correct * self.dynet_flatten_numpy(arc_correct) overall_accuracy = np.sum(correct) / num_tokens if isTrain: return arc_accuracy, rel_accuracy, overall_accuracy, loss outputs = [] for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs): # parse sentences one by one msk[0] = 1. sent_len = int(np.sum(msk)) arc_pred = arc_argmax(arc_prob, sent_len, msk) rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred] rel_pred = rel_argmax( rel_prob, sent_len, self.vocab_deprel, "root" if "root" in self.vocab_deprel.stoi else "ROOT") outputs.append( (arc_pred[1:sent_len], rel_pred[1:sent_len])) # w_0 is <root> assert (len(outputs) == batch_size) if arc_targets is not None: return arc_accuracy, rel_accuracy, overall_accuracy, outputs return outputs
def attend(self, context, x):
    context_cols = dy.concatenate_cols(context)
    hidden = dy.tanh(dy.colwise_add(self.Wha * context_cols, self.Wia * x))
    weights = dy.softmax(dy.transpose(hidden) * self.Va)
    context_emb = context_cols * weights
    return context_emb, weights
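For reference, one possible parameter setup for this additive (MLP) scorer; pc, ATT_DIM, ENC_DIM and DEC_DIM are illustrative assumptions, and dy.parameter must be re-applied inside each fresh computation graph.

# hypothetical shapes for the additive scorer above
self.Wha_p = pc.add_parameters((ATT_DIM, ENC_DIM))  # applied to every encoder column
self.Wia_p = pc.add_parameters((ATT_DIM, DEC_DIM))  # applied to the decoder query x
self.Va_p = pc.add_parameters((ATT_DIM,))           # transpose(hidden) * Va -> one score per source position
# inside each new computation graph, before calling attend():
self.Wha, self.Wia, self.Va = (dy.parameter(p) for p in (self.Wha_p, self.Wia_p, self.Va_p))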
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    # get the outputs of the first LSTM
    src_outputs = [
        dy.concatenate([x.output(), y.output()])
        for x, y in LSTM_SRC.add_inputs(
            [dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])
    ]
    src_output = src_outputs[-1]

    # get the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    # now decode
    all_losses = []

    # Decoder
    # need to mask padding at the end of each sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the previous words into the decoder LSTM, attend, and score the next words
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        att_output, _ = calc_attention(src_output_matrix, output_embedding,
                                       fixed_attentional_component)
        middle_expr = dy.tanh(
            dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = dy.pickneglogsoftmax_batch(s, next_words)
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1, ), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
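calc_loss depends on calc_attention, which is not defined in this section. A sketch of an MLP-style scorer consistent with how fixed_attentional_component (w1_att_src * src_output_matrix) is precomputed above; the parameter names w1_att_tgt_p and w2_att_p are assumptions.

def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    # hypothetical additive attention: score every source column against the decoder state
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    a_t = dy.transpose(
        dy.tanh(dy.colwise_add(fixed_attentional_component,
                               w1_att_tgt * tgt_output_embedding))) * w2_att
    alignment = dy.softmax(a_t)                 # one weight per source position
    att_output = src_output_matrix * alignment  # weighted sum of encoder states
    return att_output, alignment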
def attend(self, context, x):
    context_cols = dy.concatenate_cols(context)
    weights = dy.softmax(dy.transpose(context_cols) * self.W * x)
    context_emb = context_cols * weights
    return context_emb, weights
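This second attend variant scores with a single bilinear form (general/multiplicative attention) instead of the tanh MLP above. A possible setup, again with pc and illustrative dimensions as assumptions:

# hypothetical bilinear weight: scores = transpose(context_cols) * W * x
self.W_p = pc.add_parameters((ENC_DIM, DEC_DIM))
# refresh inside each new computation graph before calling attend():
self.W = dy.parameter(self.W_p)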
def get_combined_word_representations(self, sentence, training=None):
    """
    :param training:
    :param sentence: whole sentence with input values as ids
    :return: word representations made up according to the user preferences
    """
    if training is None:
        training = self.training
    representations_to_be_zipped = []
    word_embedding_based_representations = \
        [self.word_embeddings[word_id] for word_id in sentence['word_ids']]
    representations_to_be_zipped.append(
        dynet.concatenate([dynet.transpose(x) for x in word_embedding_based_representations]))
    char_representations = self.get_char_representations(sentence)
    representations_to_be_zipped.append(
        dynet.concatenate([dynet.transpose(x) for x in char_representations]))
    if self.parameters['use_golden_morpho_analysis_in_word_representation']:
        morph_tag_based_representations = \
            self.get_morph_analysis_representation_in_old_style(sentence)
        representations_to_be_zipped.append(
            dynet.concatenate([dynet.transpose(x) for x in morph_tag_based_representations]))
    if self.parameters['cap_dim'] > 0:
        cap_embedding_based_representations = \
            [self.cap_embeddings[cap_id] for cap_id in sentence['cap_ids']]
        representations_to_be_zipped.append(
            dynet.concatenate([dynet.transpose(x) for x in cap_embedding_based_representations]))
    # combined_word_representations = [dynet.concatenate([x, y, z, xx]) for x, y, z, xx in
    #                                  zip(*representations_to_be_zipped)]
    # else:
    #     combined_word_representations = [dynet.concatenate([x, y, xx]) for x, y, xx in
    #                                      zip(*representations_to_be_zipped)]
    combined_word_representations = dynet.concatenate_cols(representations_to_be_zipped)
    # print combined_word_representations
    # print self.parameters
    if training:
        combined_word_representations = [
            dynet.dropout(x, p=self.parameters['dropout'])
            for x in combined_word_representations
        ]
    else:
        combined_word_representations = [x for x in combined_word_representations]
    return combined_word_representations
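get_char_representations is referenced above but not shown. A minimal sketch of one common choice, a per-word character BiLSTM; the attribute names (char_embeddings, char_fwd_lstm, char_bwd_lstm) and the 'char_for_ids' field of sentence are hypothetical, not this tagger's actual implementation.

def get_char_representations(self, sentence):
    # one vector per word: final forward and backward char-LSTM states, concatenated
    reps = []
    for char_ids in sentence['char_for_ids']:
        embs = [self.char_embeddings[c] for c in char_ids]
        fwd = self.char_fwd_lstm.initial_state().transduce(embs)[-1]
        bwd = self.char_bwd_lstm.initial_state().transduce(list(reversed(embs)))[-1]
        reps.append(dynet.concatenate([fwd, bwd]))
    return reps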
def calculate_batch_loss(self, batch):
    dy.renew_cg()
    W_y = dy.parameter(self.params["W_y"])
    b_y = dy.parameter(self.params["b_y"])
    s_lookup = self.params["s_lookup"]
    t_lookup = self.params["t_lookup"]

    s_batch = [x[0] for x in batch]
    t_batch = [x[1] for x in batch]

    wids = []
    for i in range(len(s_batch[0])):
        wids.append([sent[i] for sent in s_batch])
    wids_rev = list(reversed(wids))

    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []

    for wid in wids:
        l2r_state = l2r_state.add_input(dy.lookup_batch(s_lookup, wid))
        l2r_contexts.append(l2r_state.output())
    for wid in wids_rev:
        r2l_state = r2l_state.add_input(dy.lookup_batch(s_lookup, wid))
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    losses = []

    H_f = [dy.concatenate(list(p)) for p in zip(l2r_contexts, r2l_contexts)]
    H_f_mat = dy.concatenate_cols(H_f)
    W1_att = dy.parameter(self.params["W1_att"])
    w1dt = W1_att * H_f_mat

    t_wids = []
    masks = []
    num_words = 0
    for i in range(len(t_batch[0])):
        t_wids.append([(sent[i] if len(sent) > i else self.t_vocab[EOS]) for sent in t_batch])
        mask = [(1 if len(sent) > i else 0) for sent in t_batch]
        masks.append(mask)
        num_words += sum(mask)

    c_t = dy.vecInput(2 * self.HIDDEN_DIM)
    words = [self.t_vocab[EOS]] * len(t_batch)
    embedding = dy.lookup_batch(t_lookup, words)

    dec_state = self.dec_builder.initial_state()
    for t_wid, mask in zip(t_wids, masks):
        x_t = dy.concatenate([c_t, embedding])
        dec_state = dec_state.add_input(x_t)
        c_t = self.attend(H_f_mat, dec_state, w1dt, len(s_batch[0]), len(wids[0]))
        probs = dy.affine_transform([b_y, W_y, dy.concatenate([c_t, dec_state.output()])])
        loss = dy.pickneglogsoftmax_batch(probs, t_wid)
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(t_batch))
            loss = loss * mask_expr
        losses.append(loss)
        embedding = dy.lookup_batch(t_lookup, t_wid)

    loss = dy.sum_batches(dy.esum(losses))  # / len(wids[0])
    return loss, num_words
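The decoder loop above calls self.attend(H_f_mat, dec_state, w1dt, ...), a different signature from the two attend variants earlier. A sketch of such a method, reusing the precomputed w1dt = W1_att * H_f_mat; the extra parameters W2_att and v_att are assumptions, and the last two arguments are unused in this simplified version.

def attend(self, H_f_mat, dec_state, w1dt, src_len, batch_size):
    # hypothetical additive attention consistent with the call sites above
    W2_att = dy.parameter(self.params["W2_att"])
    v_att = dy.parameter(self.params["v_att"])
    w2dt = W2_att * dec_state.output()                                   # (ATT_DIM,)
    scores = dy.transpose(dy.tanh(dy.colwise_add(w1dt, w2dt))) * v_att   # one score per source position
    weights = dy.softmax(scores)
    return H_f_mat * weights                                             # context vector c_t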
def step_batch(self, batch, lang):
    dy.renew_cg()
    W_y = dy.parameter(self.W_y[lang])
    b_y = dy.parameter(self.b_y[lang])
    W1_att_e = dy.parameter(self.W1_att_e)
    W1_att_f = dy.parameter(self.W1_att_f)
    w2_att = dy.parameter(self.w2_att)
    M_s = self.src_lookup
    M_t = self.tgt_lookup[lang]

    src_sent, tgt_sent = zip(*batch)
    src_sent = list(zip(*src_sent))
    tgt_sent = list(zip(*tgt_sent))
    src_sent_rev = list(reversed(src_sent))

    # Bidirectional representations
    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for (cw_l2r, cw_r2l) in zip(src_sent, src_sent_rev):
        l2r_state = l2r_state.add_input(dy.lookup_batch(M_s, cw_l2r))
        r2l_state = r2l_state.add_input(dy.lookup_batch(M_s, cw_r2l))
        l2r_contexts.append(l2r_state.output())  # [<S>, x_1, x_2, ..., </S>]
        r2l_contexts.append(r2l_state.output())  # [</S>, x_n, x_{n-1}, ..., <S>]
    # encoded_h1 = l2r_state.output()
    # tem1 = encoded_h1.npvalue()
    r2l_contexts.reverse()  # [<S>, x_1, x_2, ..., </S>]

    # Combine the left and right representations for every word
    h_fs = []
    for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
        h_fs.append(dy.concatenate([l2r_i, r2l_i]))
    encoded_h = h_fs[-1]
    h_fs_matrix = dy.concatenate_cols(h_fs)
    # h_fs_matrix_t = dy.transpose(h_fs_matrix)

    losses = []
    num_words = 0

    # Decoder
    c_t = dy.vecInput(self.hidden_size * 2)
    c_t.set([0 for i in range(self.contextsize)])
    encoded_h = dy.concatenate([encoded_h])
    dec_state = self.dec_builder[lang].initial_state([encoded_h])
    for (cw, nw) in zip(tgt_sent[0:-1], tgt_sent[1:]):
        embed = dy.lookup_batch(M_t, cw)
        dec_state = dec_state.add_input(dy.concatenate([embed, c_t]))
        h_e = dec_state.output()
        # calculate attention
        # a_t = h_fs_matrix_t * h_e
        # alignment = dy.softmax(a_t)
        # c_t = h_fs_matrix * alignment
        c_t = self.__attention_mlp_batch(h_fs_matrix, h_e, W1_att_e, W1_att_f, w2_att)
        ind_tem = dy.concatenate([h_e, c_t])
        ind_tem1 = W_y * ind_tem
        ind_tem2 = ind_tem1 + b_y
        loss = dy.pickneglogsoftmax_batch(ind_tem2, nw)  # to modify
        losses.append(loss)
        num_words += 1
    return dy.sum_batches(dy.esum(losses)), num_words
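step_batch delegates scoring to self.__attention_mlp_batch, which is not shown here. A sketch that mirrors the commented-out dot-product variant above but adds the MLP transform implied by the parameter names W1_att_e, W1_att_f and w2_att; treat it as an assumption about the method's body, not the original code.

def __attention_mlp_batch(self, h_fs_matrix, h_e, W1_att_e, W1_att_f, w2_att):
    # hypothetical Bahdanau-style scorer over the encoder matrix h_fs_matrix
    a_t = dy.transpose(
        dy.tanh(dy.colwise_add(W1_att_f * h_fs_matrix, W1_att_e * h_e))) * w2_att
    alignment = dy.softmax(a_t)     # one weight per source position
    return h_fs_matrix * alignment  # context vector c_t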
def generate(self, s_sentence, max_len=150):
    dy.renew_cg()
    W_y = dy.parameter(self.params["W_y"])
    b_y = dy.parameter(self.params["b_y"])
    s_lookup = self.params["s_lookup"]
    t_lookup = self.params["t_lookup"]

    s_sentence = [self.s_vocab[EOS]] + s_sentence + [self.s_vocab[EOS]]
    s_sentence_rev = list(reversed(s_sentence))

    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []

    for cw_l2r in s_sentence:
        l2r_state = l2r_state.add_input(s_lookup[cw_l2r])
        l2r_contexts.append(l2r_state.output())
    for cw_r2l in s_sentence_rev:
        r2l_state = r2l_state.add_input(s_lookup[cw_r2l])
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    H_f = [dy.concatenate(list(p)) for p in zip(l2r_contexts, r2l_contexts)]
    H_f_mat = dy.concatenate_cols(H_f)
    W1_att = dy.parameter(self.params["W1_att"])
    w1dt = W1_att * H_f_mat

    c_t = dy.vecInput(2 * self.HIDDEN_DIM)
    embedding = t_lookup[self.t_vocab["<EOS>"]]
    dec_state = self.dec_builder.initial_state()
    t_sentence = []
    count_eos = 0

    for i in range(len(s_sentence) * 2):
        if count_eos == 2:
            break
        x_t = dy.concatenate([c_t, embedding])
        dec_state = dec_state.add_input(x_t)
        c_t = self.attend(H_f_mat, dec_state, w1dt, len(s_sentence), 1)
        probs = dy.softmax(W_y * dy.concatenate([c_t, dec_state.output()]) + b_y).vec_value()
        word = probs.index(max(probs))
        embedding = t_lookup[word]
        if self.t_id_lookup[word] == "<EOS>":
            count_eos += 1
            continue
        t_sentence.append(self.t_id_lookup[word])

    return " ".join(t_sentence)
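A minimal usage sketch for the greedy decoder above; the model instance name and the fact that source tokens are mapped through s_vocab before the call are assumptions.

# hypothetical usage: `model` is an instance of the encoder-decoder class above
src_ids = [model.s_vocab[w] for w in "the cat sat on the mat".split()]
print(model.generate(src_ids))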