def _forward(self, emissions):
    """Forward algorithm to calculate the score over all paths.

    :param emissions: List[dy.Expression]

    Returns:
        dy.Expression ((1,), B)
    """
    init_alphas = [-1e4] * self.n_tags
    init_alphas[self.start_idx] = 0

    alphas = dy.inputVector(init_alphas)
    transitions = self.transitions
    # len(emissions) == T
    for emission in emissions:
        add_emission = dy.colwise_add(transitions, emission)
        scores = dy.colwise_add(dy.transpose(add_emission), alphas)
        # dy.logsumexp takes a list of dy.Expression and computes logsumexp
        # elementwise across the list (e.g. the logsumexp of element [0] of
        # each entry). This means we want all of the scores that transition
        # into a given tag to sit in the same column.
        alphas = dy.logsumexp([x for x in scores])
    last_alpha = alphas + dy.pick(transitions, self.end_idx)
    alpha = dy.logsumexp([x for x in last_alpha])
    return alpha
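# --- Added example (not from the original code): a minimal sketch checking the
# reduction direction used in _forward above. The tag count and all scores are
# invented; it only illustrates how dy.colwise_add broadcasts a vector over the
# columns and how dy.logsumexp over the rows reduces each column to one score
# per next tag.
import dynet as dy
import numpy as np

n_tags = 4
dy.renew_cg()
transitions = dy.inputTensor(np.random.rand(n_tags, n_tags))
emission = dy.inputTensor(np.random.rand(n_tags))
alphas = dy.inputTensor(np.random.rand(n_tags))

add_emission = dy.colwise_add(transitions, emission)         # emission added to every column
scores = dy.colwise_add(dy.transpose(add_emission), alphas)  # previous alphas added to every column
new_alphas = dy.logsumexp([x for x in scores])               # logsumexp over rows -> one value per tag

# NumPy reference for the same recursion step.
T = transitions.npvalue()
S = (T + emission.npvalue()[:, None]).T + alphas.npvalue()[:, None]
print(np.allclose(new_alphas.npvalue(), np.logaddexp.reduce(S, axis=0)))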
def calc_attention(self, state):
    V = dy.parameter(self.pV)
    U = dy.parameter(self.pU)
    WI = self.WI
    curr_sent_mask = self.curr_sent.mask
    if self.attention_vecs:
        conv_feats = dy.conv2d(self.attention_vecs[-1], self.pL,
                               stride=[1, 1], is_valid=False)
        conv_feats = dy.transpose(
            dy.reshape(conv_feats, (conv_feats.dim()[0][0], self.hidden_dim),
                       batch_size=conv_feats.dim()[1]))
        h = dy.tanh(dy.colwise_add(WI + conv_feats, V * state))
    else:
        h = dy.tanh(dy.colwise_add(WI, V * state))
    scores = dy.transpose(U * h)
    if curr_sent_mask is not None:
        scores = curr_sent_mask.add_to_tensor_expr(scores, multiplicator=-100.0)
    normalized = dy.softmax(scores)
    self.attention_vecs.append(normalized)
    return normalized
def softmax(x):
    """
    Compute the softmax function in tensorflow.

    You might find the tensorflow functions tf.exp, tf.reduce_max,
    tf.reduce_sum, tf.expand_dims useful. (Many solutions are possible, so you
    may not need to use all of these functions). Recall also that many common
    tensorflow operations are sugared (e.g. x * y does a tensor multiplication
    if x and y are both tensors). Make sure to implement the numerical
    stability fixes as in the previous homework!

    Args:
      x: tf.Tensor with shape (n_samples, n_features). Note feature vectors are
         represented by row-vectors. (For simplicity, no need to handle 1-d
         input as in the previous homework)
    Returns:
      out: tf.Tensor with shape (n_samples, n_features). You need to construct
           this tensor in this problem.
    """
    ### YOUR CODE HERE
    x_max = dy.max_dim(x, 1)
    x_sub = dy.colwise_add(x, -x_max)
    x_exp = dy.exp(x_sub)
    sum_exp = dy.colwise_add(dy.zeroes(x.dim()[0]), dy.sum_cols(x_exp))
    out = dy.cdiv(x_exp, sum_exp)
    ### END YOUR CODE
    return out
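# --- Added usage sketch (not part of the original assignment): the softmax
# above is row-wise, which we can sanity-check against NumPy. Assumes a DyNet
# build that still provides dy.zeroes and dy.sum_cols as used in the function;
# the input values are made up.
import dynet as dy
import numpy as np

dy.renew_cg()
data = np.random.rand(3, 5)      # 3 samples (rows), 5 features (columns)
out = softmax(dy.inputTensor(data)).npvalue()

ref = np.exp(data - data.max(axis=1, keepdims=True))
ref /= ref.sum(axis=1, keepdims=True)
print(np.allclose(out, ref))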
def hier_attend(self, context_pre, context_pos, state):
    w2 = dy.parameter(self.hier_w2)
    v = dy.parameter(self.hier_v)
    w2dt = w2 * dy.concatenate(list(state.s()))

    # context_pre
    w1_pre = dy.parameter(self.hier_w1_pre)
    w1dt_pre = w1_pre * context_pre
    energy_pre = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt_pre, w2dt)))
    w_pre = dy.parameter(self.hier_w_pre)
    wdt_pre = w_pre * context_pre

    # context_pos
    w1_pos = dy.parameter(self.hier_w1_pos)
    w1dt_pos = w1_pos * context_pos
    energy_pos = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt_pos, w2dt)))
    w_pos = dy.parameter(self.hier_w_pos)
    wdt_pos = w_pos * context_pos

    beta = dy.softmax(dy.concatenate([energy_pre, energy_pos]))
    wdt = dy.concatenate_cols([wdt_pre, wdt_pos])
    context = wdt * beta
    return context
def attend_with_prev(self, state, w1dt, prev_att):
    w2dt = self.attention_w2 * state
    w3dt = self.attention_w3 * prev_att
    unnormalized = dy.transpose(
        self.attention_v * dy.tanh(dy.colwise_add(dy.colwise_add(w1dt, w2dt), w3dt)))
    att_weights = dy.softmax(unnormalized)
    return att_weights
def softmax(x):
    ### YOUR CODE HERE
    x_max = dy.max_dim(x, 1)
    x_sub = dy.colwise_add(x, -x_max)
    x_exp = dy.exp(x_sub)
    x_sum = dy.sum_cols(x_exp)
    x_tmp = dy.zeroes(x.dim()[0])
    x_tmp = dy.colwise_add(x_tmp, x_sum)
    out = dy.cdiv(x_exp, x_tmp)
    ### END YOUR CODE
    return out
def get_word_att(self, ut, l, s):
    input_mat = dy.concatenate_cols(ut.words_enc)
    unnormalized = dy.transpose(self.attention_word_v * dy.tanh(
        dy.colwise_add(
            dy.colwise_add(self.attention_word_w1 * input_mat,
                           self.attention_word_w2 * l),
            self.attention_word_w3 * s)))
    att_weights = dy.softmax(unnormalized)
    ut.context = input_mat * att_weights
def transform(sentence):
    w1 = dy.parameter(transform_w1)
    b1 = dy.parameter(transform_b1)
    w2 = dy.parameter(transform_w2)
    b2 = dy.parameter(transform_b2)

    sentence_transformed = dy.colwise_add(w1 * sentence, b1)
    sentence_transformed = dy.rectify(sentence_transformed)
    sentence_transformed = dy.colwise_add(w2 * sentence_transformed, b2)
    sentence_transformed = dy.rectify(sentence_transformed)
    return sentence_transformed
def __attention_mlp(self, H_f, h_e, W1_att_e, W1_att_f, w2_att, W1_att_lang, langeb):
    # Calculate the alignment score vector
    a_t = dy.tanh(
        dy.colwise_add(dy.colwise_add(W1_att_f * H_f, W1_att_e * h_e),
                       W1_att_lang * langeb))
    a_t = w2_att * a_t
    a_t = a_t[0]
    alignment = dy.softmax(a_t)
    c_t = H_f * alignment
    return c_t
def __attention_mlp_batch(self, H_f_batch, h_e_batch, W1_att_e, W1_att_f,
                          w2_att, W1_att_lang, langeb):
    # H_f_batch: (2 * hidden_size, num_step, batch_size)
    # h_e_batch: (hidden_size, batch_size)
    a_t_batch = dy.tanh(
        dy.colwise_add(
            dy.colwise_add(W1_att_f * H_f_batch, W1_att_e * h_e_batch),
            W1_att_lang * langeb))  # (attention_size, num_step, batch_size)
    a_t_batch = w2_att * a_t_batch  # (1, num_step, batch_size)
    a_t_batch = a_t_batch[0]  # (num_step, batch_size)
    alignment_batch = dy.softmax(a_t_batch)  # (num_step, batch_size)
    c_t_batch = H_f_batch * alignment_batch  # (2 * hidden_size, batch_size)
    return c_t_batch
def get_v1_v2(alpha, beta, sen1, sen2, model_params):
    G_w1 = model_params['G_w1']
    G_b1 = model_params['G_b1']
    G_w2 = model_params['G_w2']
    G_b2 = model_params['G_b2']

    con = dy.concatenate([sen1, beta], d=0)
    #con = dy.dropout(con, DROPOUT_RATE)
    v1 = dy.rectify(G_w2 * (dy.rectify(dy.colwise_add(G_w1 * con, G_b1))) + G_b2)

    con = dy.concatenate([sen2, alpha], d=0)
    #con = dy.dropout(con, DROPOUT_RATE)
    v2 = dy.rectify(G_w2 * (dy.rectify(dy.colwise_add(G_w1 * con, G_b1))) + G_b2)

    return v1, v2
def set_E_matrix(sen1, sen2, model_params):
    F_w1 = model_params['F_w1']
    F_b1 = model_params['F_b1']
    F_w2 = model_params['F_w2']
    F_b2 = model_params['F_b2']

    #sen1 = dy.dropout(sen1, DROPOUT_RATE)
    #sen2 = dy.dropout(sen2, DROPOUT_RATE)
    F_sen1 = dy.rectify(F_w2 * (dy.rectify(dy.colwise_add(F_w1 * sen1, F_b1))) + F_b2)
    F_sen2 = dy.rectify(F_w2 * (dy.rectify(dy.colwise_add(F_w1 * sen2, F_b1))) + F_b2)

    E_matrix = (dy.transpose(F_sen1)) * F_sen2
    return E_matrix, F_sen1, F_sen2
def calc_attention(self, src_trans_att, h_t, training=True):
    with parameters(self.W_h, self.U, trainable=training) as (W_h, U):
        att_hidden = dy.tanh(dy.colwise_add(src_trans_att, W_h * h_t))
        att_weights = dy.transpose(U * att_hidden)
        att_weights = dy.softmax(att_weights)
    return att_weights
def attend(self, encodings, h):
    """Compute attention score

    Given :math:`z_i` the encoder's output at time :math:`i`, :math:`h_{j-1}`
    the decoder's output at time :math:`j-1`, the attention score is computed as:

    .. math::

        \begin{split}
            s_{ij}&=V_a^T\tanh(W_az_i + W_{ha}h_j + b_a)\\
            \alpha_{ij}&=\frac{s_{ij}}{\sum_{i'}s_{i'j}}\\
        \end{split}

    Arguments:
        encodings (dynet.Expression): Source sentence encodings obtained with self.encode
        h (dynet.Expression): Decoder output at the previous timestep

    Returns:
        tuple: Two dynet Expressions, the context and the attention weights
    """
    Va, Wa, Wha = self.Va_p.expr(), self.Wa_p.expr(), self.Wha_p.expr()
    d = dy.tanh(dy.colwise_add(Wa * encodings, Wha * h))
    scores = dy.transpose(d) * Va
    weights = dy.softmax(scores)
    context = encodings * weights
    return context, weights
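# --- Added sketch (not from the original class): a self-contained version of
# the scoring formula in the docstring above. All dimensions, parameter names,
# and inputs here are hypothetical, chosen only to show the shapes involved.
import dynet as dy
import numpy as np

enc_dim, dec_dim, att_dim, seq_len = 8, 6, 5, 7
pc = dy.ParameterCollection()
Va_p = pc.add_parameters((att_dim,))
Wa_p = pc.add_parameters((att_dim, enc_dim))
Wha_p = pc.add_parameters((att_dim, dec_dim))

dy.renew_cg()
encodings = dy.inputTensor(np.random.rand(enc_dim, seq_len))  # columns are the z_i
h = dy.inputTensor(np.random.rand(dec_dim))                   # previous decoder output

Va, Wa, Wha = Va_p.expr(), Wa_p.expr(), Wha_p.expr()
d = dy.tanh(dy.colwise_add(Wa * encodings, Wha * h))  # (att_dim, seq_len)
scores = dy.transpose(d) * Va                         # (seq_len,)
weights = dy.softmax(scores)
context = encodings * weights                         # (enc_dim,)
print(weights.dim(), context.dim())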
def decode(self, emissions):
    """Viterbi decode to find the best sequence.

    :param emissions: List[dy.Expression]

    Returns:
        List[int], dy.Expression ((1,), B)
    """
    if self.add_ends:
        emissions = CRF._prep_input(emissions)
    backpointers = []
    transitions = self.transitions

    inits = [-1e4] * self.n_tags
    inits[self.start_idx] = 0
    alphas = dy.inputVector(inits)

    for emission in emissions:
        next_vars = dy.colwise_add(dy.transpose(transitions), alphas)
        best_tags = np.argmax(next_vars.npvalue(), 0)
        v_t = dy.max_dim(next_vars, 0)
        alphas = v_t + emission
        backpointers.append(best_tags)

    terminal_expr = alphas + dy.pick(transitions, self.end_idx)
    best_tag = np.argmax(terminal_expr.npvalue())
    path_score = dy.pick(terminal_expr, best_tag)

    best_path = [best_tag]
    for bp_t in reversed(backpointers):
        best_tag = bp_t[best_tag]
        best_path.append(best_tag)
    _ = best_path.pop()
    best_path.reverse()
    return best_path, path_score
def attend_tags(self, state, w1dt):
    w2dt = self.tag_attention_w2 * state
    unnormalized = dy.transpose(self.tag_attention_v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    att_weights = dy.softmax(unnormalized)
    return att_weights
def attend(self, w1dt, vectors, state):
    import time
    start = time.time()
    if debug:
        print("In attention")
    w2 = dy.parameter(self.attention_w2)
    v = dy.parameter(self.attention_v)
    if debug:
        print("Shape of w2: ", np.asarray(w2.value()).shape)
        print("Shape of state : ", np.asarray(dy.concatenate(list(state.s())).value()).shape)
    end = time.time()
    start = end
    w2dt = w2 * dy.concatenate(list(state.s()))
    end = time.time()
    if debug:
        print(" Shape of W2dt: ", np.asarray(w2dt.value()).shape)
    unnormalized = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    if debug:
        print("Shape of unnormalized: ", np.asarray(unnormalized.value()).shape)
    end = time.time()
    att_weights = dy.softmax(unnormalized)
    if debug:
        print("Shape of Attention weights: ", np.asarray(att_weights.value()).shape)
    end = time.time()
    context = vectors * att_weights
    if debug:
        print("Shape of context: ", np.asarray(context.value()).shape)
        #print("Context: ", np.asarray(context.value()))
    return context
def generate(self, h_a, trg, maxlen=100):  # decode(self, h_a, trg, decorate=False):
    h_a += [dy.zeros(self.hdim)] * (self.max_len - len(h_a))  # padding to make equal to maxlength
    h_ak = dy.concatenate(h_a, 1)
    #pdb.set_trace()
    pre_attend = dy.parameter(self.pre_attend)
    context = h_ak * pre_attend
    prev_out = dy.zeros((self.hdim))
    outputs = []
    s = self.decoder_rnn.initial_state()
    for i in range(maxlen):
        attender = dy.parameter(self.attender)
        #pdb.set_trace()
        V = dy.parameter(self.v)
        tmp = dy.tanh(dy.colwise_add(context, V * prev_out))
        U = dy.parameter(self.u)
        attention_weights = dy.softmax(dy.transpose(U * tmp))
        #pdb.set_trace()
        emb = dy.concatenate([h_ak * attention_weights, prev_out])
        s = s.add_input(emb)
        prev_out = s.output()
        pre2 = dy.parameter(self.pred)
        outputs.append(pre2 * prev_out)
        act_value = pre2 * prev_out
        act_value = np.argmax(act_value.value())
        outputs.append(act_value)
        if act_value == 1:
            return outputs
    return outputs
def attend(input_mat, state, w1dt_array):
    # Takes in [l * 2hE * n], [l * 2hD * n], [l * a * n] as input and returns [l * a * n] as output
    global attention_w2
    global attention_v
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)

    if debug_dimensions:
        print("In attention ")
        print(" Dimensions of input mat are : ", get_tensor_size(input_mat))
        print(" Dimensions of w1dt array: ", get_tensor_size(w1dt_array))
        print(" Dimensions of state ", len(state))

    # Get w2dt = weight matrix * decoder state output
    w2dt_array = []
    for s in state:
        w2dt = w2 * dy.concatenate(list(s.s()))
        w2dt_array.append(w2dt)

    if debug_dimensions:
        print(" Dimensions of w2dt array: ", get_tensor_size(w2dt_array))

    unnormalized_array = []
    att_weights_array = []
    for (a, b) in zip(w1dt_array, w2dt_array):
        unnormalized = dy.transpose(v * dy.tanh(dy.colwise_add(a, b)))
        att_weights = dy.softmax(unnormalized)
        att_weights_array.append(att_weights)

    if debug_dimensions:
        print(" Dimensions of attention weights array: ", get_tensor_size(att_weights_array))

    context_array = []
    for (im, at) in zip(input_mat, att_weights_array):
        context = im * at
        context_array.append(context)

    if debug_dimensions:
        print(" Dimensions of contexts array: ", get_tensor_size(context_array))

    return context_array
def cal_scores(self, s):
    if len(self.ps) == 0:
        return None
    # W and b are not defined in this snippet; presumably attention parameters
    # from the enclosing scope.
    hs_matrix = dy.tanh(dy.colwise_add(dy.concatenate_cols(self.ps), W * s))
    return dy.softmax(dy.transpose(b * hs_matrix))
def _vaswani_model_scores(m):
    out_c2 = dy.rectify(
        dy.colwise_add(c2_Wlm * m["beam_lm_hs"], dy.pick(m["aux_c2"], m["idx"], 1)))
    # if cfg["use_beam_bilstm"]:
    #     _, beam_size_prev = out_c2.dim()[0]
    #     beam_hs = [dy.pick(out_c2, i, 1) for i in xrange(beam_size_prev)]
    #     bf_init = b_fwd.initial_state()
    #     bb_init = b_bwd.initial_state()
    #     bf_hs = dy.concatenate_cols(bf_init.transduce(beam_hs))
    #     bb_hs = dy.concatenate_cols(bb_init.transduce(reversed(beam_hs))[::-1])
    #     out_c2 = dy.concatenate([bf_hs, bb_hs])
    # if cfg["use_beam_mlp"]:
    #     out_b = dy.max_dim(b_W1 * out_c2 + b_b1, 1)
    #     out_c2 = dy.colwise_add(out_c2, dy.rectify(b_W2 * out_b + b_b2))
    scores = o_W * out_c2 + o_b
    scores = dy.transpose(scores)
    if cfg["accumulate_scores"]:
        scores = m["acc_scores"] + scores
    m["scores"] = scores
    return scores
def viterbi(emissions, transition, start_idx, end_idx, norm=False):
    n_tags = emissions[0].dim()[0][0]
    backpointers = []

    inits = [-1e4] * n_tags
    inits[start_idx] = 0
    alphas = dy.inputVector(inits)
    alphas = dy.log_softmax(alphas) if norm else alphas

    for emission in emissions:
        next_vars = dy.colwise_add(dy.transpose(transition), alphas)
        best_tags = np.argmax(next_vars.npvalue(), 0)
        v_t = dy.max_dim(next_vars, 0)
        alphas = v_t + emission
        backpointers.append(best_tags)

    terminal_expr = alphas + dy.pick(transition, end_idx)
    best_tag = np.argmax(terminal_expr.npvalue())
    path_score = dy.pick(terminal_expr, best_tag)

    best_path = [best_tag]
    for bp_t in reversed(backpointers):
        best_tag = bp_t[best_tag]
        best_path.append(best_tag)
    _ = best_path.pop()
    best_path.reverse()
    return best_path, path_score
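# --- Added usage sketch (not from the original repo) for the standalone
# viterbi() above. The tag set, sequence length, and all scores are invented;
# start_idx and end_idx are just two reserved tag indices.
import dynet as dy
import numpy as np

n_tags, seq_len = 5, 4
start_idx, end_idx = 3, 4

dy.renew_cg()
transition = dy.inputTensor(np.random.rand(n_tags, n_tags))
emissions = [dy.inputTensor(np.random.rand(n_tags)) for _ in range(seq_len)]

best_path, path_score = viterbi(emissions, transition, start_idx, end_idx)
print(best_path, path_score.value())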
def attend(self, enc_h_ts_mat, dec_h_t, encatt, store_weights=False):
    """
    Parameters:
    -----------
    enc_h_ts_mat: dynet.Expression, (seq_len x enc_hid_dim)
        matrix of encoding hidden state column vectors
    dec_h_t: dynet.RNNState, (dec_hid_dim)
        current decoder hidden state
    encatt: dynet.Expression, (seq_len x att_dim)
        projection of the encoder hidden states into the attention space
    store_weights: bool, whether to store attention weights
    """
    dec2att = dy.parameter(self.dec2att)
    att_v = dy.parameter(self.att_v)
    # project output of last hidden layer (state.h()[-1] == state.output())
    # to the dimensionality of the attention space
    decatt = dec2att * dec_h_t.output()
    # projection vector att_v
    # unnormalized var-len alignment vector (with len == source seq len)
    # (seq_len)
    unnormalized_weights = att_v * dy.tanh(dy.colwise_add(encatt, decatt))
    weights = dy.softmax(dy.transpose(unnormalized_weights))
    if store_weights:
        self.current_weights.append(weights.value())
    context = enc_h_ts_mat * weights
    return context
def combine(sentence, sentence_other_attended):
    w1 = dy.parameter(combine_w1)
    b1 = dy.parameter(combine_b1)
    w2 = dy.parameter(combine_w2)
    b2 = dy.parameter(combine_b2)

    sentence_combine = dy.concatenate([sentence, sentence_other_attended], d=0)
    logging.debug("Sentence combined with Attended shape: " + str(sentence_combine.dim()))

    combine_transformed = dy.colwise_add(w1 * sentence_combine, b1)
    combine_transformed = dy.rectify(combine_transformed)
    combine_transformed = dy.colwise_add(w2 * combine_transformed, b2)
    combine_transformed = dy.rectify(combine_transformed)
    return combine_transformed
def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    w1_att_src = dy.parameter(w1_att_src_p)
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    a_t = dy.transpose(dy.tanh(dy.colwise_add(fixed_attentional_component,
                                              w1_att_tgt * tgt_output_embedding))) * w2_att
    alignment = dy.softmax(a_t)
    att_output = src_output_matrix * alignment
    return att_output, alignment
def calc_attention(self, state):
    V = dy.parameter(self.pV)
    U = dy.parameter(self.pU)
    h = dy.tanh(dy.colwise_add(self.WI, V * state))
    scores = dy.transpose(U * h)
    return dy.softmax(scores)
def attend(self, input_mat, state, w1dt):
    w2 = dy.parameter(self.attention_w2)
    v = dy.parameter(self.attention_v)
    w2dt = w2 * dy.concatenate(list(state.s()))
    att_weights = dy.softmax(
        dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt))))
    context = input_mat * att_weights
    return context
def get_utt_att(self, uts, s):
    input_mat = dy.concatenate_cols([u.utt_enc for u in uts])
    unnormalized = dy.transpose(self.attention_word_v * dy.tanh(
        dy.colwise_add(self.attention_utt_w1 * input_mat,
                       self.attention_utt_w2 * s)))
    att_weights = dy.softmax(unnormalized)
    return input_mat * att_weights
def _attend(self, query, mask=None):
    # query ((H), B)
    # mask ((T, 1), B)
    projected_state = self.decoder * query  # ((H,), B)
    non_lin = dy.tanh(dy.colwise_add(self.context_proj, projected_state))  # ((H, T), B)
    attn_scores = dy.transpose(self.v * non_lin)  # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
    if mask is not None:
        attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
    return dy.softmax(attn_scores)  # ((T, 1), B)
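# --- Added illustration (invented sizes and values) of the masking trick used
# in _attend above: scores at padded positions are multiplied by 0 and pushed
# down to -1e9, so softmax assigns them (near) zero weight.
import dynet as dy
import numpy as np

dy.renew_cg()
scores = dy.inputTensor(np.random.rand(4))
keep = dy.inputTensor(np.array([1.0, 1.0, 0.0, 0.0]))  # mask[0]: 1 at real tokens
pad = dy.inputTensor(np.array([0.0, 0.0, 1.0, 1.0]))   # mask[1]: 1 at padding
masked = dy.cmult(scores, keep) + pad * dy.scalarInput(-1e9)
print(dy.softmax(masked).npvalue())  # weights at padded positions are ~0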
def calc_attention(self, state):
    V = dy.parameter(self.pV)
    U = dy.parameter(self.pU)
    h = dy.tanh(dy.colwise_add(self.WI, V * state))
    scores = dy.transpose(U * h)
    normalized = dy.softmax(scores)
    self.attention_vecs.append(normalized)
    return normalized
def __attention_mlp(self, h_fs_matrix, h_e, fixed_attentional_component):
    W1_att_e = dy.parameter(self.W1_att_e)
    w2_att = dy.parameter(self.w2_att)
    a_t = dy.transpose(
        dy.tanh(dy.colwise_add(fixed_attentional_component, W1_att_e * h_e))) * w2_att
    alignment = dy.softmax(a_t)
    c_t = h_fs_matrix * alignment
    return c_t
def attend(input_mat, state, w1dt):
    global attention_w2
    global attention_v
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)

    # input_mat: (encoder_state x seqlen) => input vecs concatenated as cols
    # w1dt: (attdim x seqlen)
    # w2dt: (attdim,)
    w2dt = w2 * dy.concatenate(list(state.s()))

    # att_weights: (seqlen,) row vector
    unnormalized = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    att_weights = dy.softmax(unnormalized)

    # context: (encoder_state)
    context = input_mat * att_weights
    return context
def cal_scores(self, src_encodings):
    src_len = len(src_encodings)

    src_encodings = dy.concatenate_cols(src_encodings)  # src_ctx_dim, src_len, batch_size

    W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head)
    b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head)
    W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep)
    b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep)

    W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head)
    b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head)
    W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep)
    b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep)

    U_arc_1 = dy.parameter(self.U_arc_1)
    u_arc_2 = dy.parameter(self.u_arc_2)

    U_label_1 = [dy.parameter(x) for x in self.U_label_1]
    u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1]
    u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2]
    b_label = [dy.parameter(x) for x in self.b_label]

    h_arc_head = dy.rectify(dy.affine_transform(
        [b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings]))  # n_arc_ml_units, src_len, bs
    h_arc_dep = dy.rectify(dy.affine_transform(
        [b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings]))
    h_label_head = dy.rectify(dy.affine_transform(
        [b_label_hidden_to_head, W_label_hidden_to_head, src_encodings]))
    h_label_dep = dy.rectify(dy.affine_transform(
        [b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings]))

    h_arc_head_transpose = dy.transpose(h_arc_head)
    h_label_head_transpose = dy.transpose(h_label_head)

    s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep, u_arc_2)

    s_label = []
    for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2, b_label):
        e1 = h_label_head_transpose * U_1 * h_label_dep
        e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len))
        e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
        s_label.append(e1 + e2 + e3 + b)
    return s_arc, s_label
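# --- Added sketch (invented sizes, random inputs) of the arc-score step in
# cal_scores above: dy.colwise_add adds the bias vector u_arc_2 to every
# dependent column before the head/dependent product, giving one score per
# (head, dependent) pair.
import dynet as dy
import numpy as np

n_units, src_len = 6, 4
dy.renew_cg()
h_arc_head = dy.inputTensor(np.random.rand(n_units, src_len))
h_arc_dep = dy.inputTensor(np.random.rand(n_units, src_len))
U_arc_1 = dy.inputTensor(np.random.rand(n_units, n_units))
u_arc_2 = dy.inputTensor(np.random.rand(n_units))

s_arc = dy.transpose(h_arc_head) * dy.colwise_add(U_arc_1 * h_arc_dep, u_arc_2)
print(s_arc.dim())  # ((src_len, src_len), 1)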