Exemplo n.º 1
0
def flatten_triple(action_scores, location_scores, argument_scores):
    """Combine three score vectors into one flat column of summed scores.

    Every (argument, location, action) triple gets one entry equal to the
    sum of its three component scores; the result has shape
    (num_actions * num_locations * num_arguments, 1).
    """
    num_actions = action_scores.dim()[0][0]
    num_locations = location_scores.dim()[0][0]
    num_arguments = argument_scores.dim()[0][0]

    # Broadcast arguments down the rows and locations across the columns,
    # then add to get the pairwise (argument, location) score grid.
    args_as_column = dy.reshape(argument_scores, (num_arguments, 1))
    locs_as_row = dy.reshape(location_scores, (1, num_locations))
    expanded_arguments = args_as_column * dy.ones((1, num_locations))
    expanded_locations = dy.ones((num_arguments, 1)) * locs_as_row
    location_and_argument_scores = expanded_locations + expanded_arguments

    # Flatten the pairwise grid into a column and broadcast it against the
    # action scores to cover every triple.
    pair_column = dy.reshape(location_and_argument_scores,
                             (num_locations * num_arguments, 1))
    pairs_expanded = pair_column * dy.ones((1, num_actions))
    actions_expanded = dy.ones((num_arguments * num_locations, 1)) \
        * dy.reshape(action_scores, (1, num_actions))

    final_scores = pairs_expanded + actions_expanded

    # Collapse the full grid into a single column vector.
    return dy.reshape(final_scores,
                      (num_actions * num_locations * num_arguments, 1))
Exemplo n.º 2
0
 def init_sequence(self, test=False):
     """Start a new sequence; resample dropout masks when training.

     In test mode no masks are sampled (dropout is effectively disabled).
     """
     self.test = test
     if test:
         return
     # Variational dropout: one mask per sequence, reused at every step.
     ones_x = dy.ones((self.n_in, ))
     ones_h = dy.ones((self.n_hidden, ))
     self.dropout_mask_x = dy.dropout(ones_x, self.dropout_x)
     self.dropout_mask_h = dy.dropout(ones_h, self.dropout_h)
Exemplo n.º 3
0
    def set_dropouts(self, input_drop=0, recur_drop=0):
        """Record dropout rates and resample the matching dropout masks."""
        self.input_drop = input_drop
        self.recur_drop = recur_drop
        # One mask each for the input and recurrent vectors, reused across
        # the whole sequence.
        self.input_drop_mask = dy.dropout(dy.ones(self.input_size), input_drop)
        self.recur_drop_mask = dy.dropout(dy.ones(self.recur_size), recur_drop)
Exemplo n.º 4
0
    def cal_scores(self, src_encodings):
        """Compute biaffine arc scores and per-label scores for a sentence.

        Args:
            src_encodings: list of per-token encoder outputs (DyNet
                expressions), one per source position.

        Returns:
            Tuple ``(s_arc, s_label)``: the head/dependent arc score matrix
            and a list of score matrices, one per dependency label.
        """
        src_len = len(src_encodings)

        src_encodings = dy.concatenate_cols(
            src_encodings)  # src_ctx_dim, src_len, batch_size

        # MLP projections into the arc head/dependent representation spaces.
        W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head)
        b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head)
        W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep)
        b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep)

        # MLP projections into the label head/dependent representation spaces.
        W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head)
        b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head)
        W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep)
        b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep)

        # Biaffine arc scoring parameters.
        U_arc_1 = dy.parameter(self.U_arc_1)
        u_arc_2 = dy.parameter(self.u_arc_2)

        # Per-label biaffine scoring parameters (one entry per label).
        U_label_1 = [dy.parameter(x) for x in self.U_label_1]
        u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1]
        u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2]
        b_label = [dy.parameter(x) for x in self.b_label]

        h_arc_head = dy.rectify(
            dy.affine_transform(
                [b_arc_hidden_to_head, W_arc_hidden_to_head,
                 src_encodings]))  # n_arc_ml_units, src_len, bs
        h_arc_dep = dy.rectify(
            dy.affine_transform(
                [b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings]))
        h_label_head = dy.rectify(
            dy.affine_transform([
                b_label_hidden_to_head, W_label_hidden_to_head, src_encodings
            ]))
        h_label_dep = dy.rectify(
            dy.affine_transform(
                [b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings]))

        h_arc_head_transpose = dy.transpose(h_arc_head)
        h_label_head_transpose = dy.transpose(h_label_head)

        # Arc scores: head' * (U * dep + u), a src_len x src_len matrix.
        s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep,
                                                      u_arc_2)

        # Label scores: bilinear term plus separate head and dependent
        # linear terms plus a bias, computed independently per label.
        s_label = []
        for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2,
                                        b_label):
            e1 = h_label_head_transpose * U_1 * h_label_dep
            e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len))
            e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
            s_label.append(e1 + e2 + e3 + b)
        return s_arc, s_label
Exemplo n.º 5
0
 def l2_normalize(vector):
     """Return *vector* divided by its L2 norm, clamped below by eps."""
     # Clamp the squared-sum with machine epsilon so the sqrt/divide never
     # sees an exact zero.
     eps = np.finfo(float).eps * dy.ones((1))[0]
     norm = dy.sqrt(dy.bmax(dy.sum_elems(dy.square(vector)), eps))
     return dy.cdiv(vector, norm)
Exemplo n.º 6
0
	def beam_decode(self, encodings, input_len=10, beam_size=1):
		"""Beam-search decode a single sequence over *encodings*.

		Args:
			encodings: encoder outputs to attend over.
			input_len: source length, used to bound the decoded length.
			beam_size: number of hypotheses kept per step.

		Returns:
			The word-index sequence of the highest-scoring beam.
		"""
		batch_size = 1
		self.__dec.init_params(encodings, batch_size, self.__train_flag)
		context = dy.zeros((self.__enc.output_dim, ))
		beams = [Beam(self.__dec.dec_state, context, [self.__trg_sos], 0.0)]

		# BUG FIX: `xrange` is Python 2 only; use `range` for Python 3.
		for i in range(int(min(self.__max_len, input_len * 1.5))):
			new_beams = []
			p_list = []
			for b in beams:
				if b.words[-1] == self.__trg_eos:
					# Finished beam: append a placeholder distribution so the
					# batched softmax below stays aligned with `beams`.
					p_list.append(dy.ones((self.__trg_vsize, )))
					continue
				hidden, embs, b.state = self.__dec.next([b.words[-1]], b.context, self.__train_flag, b.state)
				b.context, _ = self.attend(encodings, hidden)
				score = self.__dec.score(hidden, b.context, embs, self.__train_flag)
				p_list.append(dy.softmax(score))
			p_list = dy.concatenate_to_batch(p_list).npvalue().T.reshape(-1, self.__trg_vsize)
			for p, b in zip(p_list, beams):
				p = p.flatten() / p.sum()
				kbest = np.argsort(p)
				if b.words[-1] == self.__trg_eos:
					# Carry finished hypotheses through unchanged.
					new_beams.append(Beam(b.state, b.context, b.words, b.log_prob))
				else:
					for next_word in kbest[-beam_size:]:
						new_beams.append(Beam(b.state, b.context, b.words + [next_word], b.log_prob + np.log(p[next_word])))
			# Keep the top beam_size hypotheses by log probability.
			beams = sorted(new_beams, key=lambda b: b.log_prob)[-beam_size:]
			if beams[-1].words[-1] == self.__trg_eos:
				break
		return beams[-1].words
Exemplo n.º 7
0
    def __init__(self, model, input_size, recur_size, forget_bias=0.0):
        """LSTM cell with orthogonally initialised, fused gate weights."""
        self.input_size = input_size
        self.recur_size = recur_size
        self.forget_bias = forget_bias

        # Dropout starts disabled; masks are all-ones until reconfigured.
        self.input_drop = 0
        self.recur_drop = 0
        self.input_drop_mask = dy.ones(self.input_size)
        self.recur_drop_mask = dy.ones(self.recur_size)

        # No recurrent state until the first timestep is processed.
        self.cell_previous = None
        self.hidden_previous = None
        self.init = False

        # One orthogonal block, stacked four times for the fused gates.
        ortho = Saxe.Orthogonal()(
            (self.recur_size, self.input_size + self.recur_size))
        stacked = np.concatenate([ortho] * 4)
        self.WXH = model.add_parameters(
            (self.recur_size * 4, self.input_size + self.recur_size),
            init=dy.NumpyInitializer(stacked))
        self.b = model.add_parameters((self.recur_size * 4),
                                      init=dy.ConstInitializer(0))
Exemplo n.º 8
0
    def cal_scores(self, src_encodings):
        """Compute biaffine arc scores and per-label scores for a sentence.

        Args:
            src_encodings: list of per-token encoder outputs (DyNet
                expressions), one per source position.

        Returns:
            Tuple ``(s_arc, s_label)``: the head/dependent arc score matrix
            and a list of score matrices, one per dependency label.
        """
        src_len = len(src_encodings)

        src_encodings = dy.concatenate_cols(src_encodings)  # src_ctx_dim, src_len, batch_size

        # MLP projections into the arc head/dependent representation spaces.
        W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head)
        b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head)
        W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep)
        b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep)

        # MLP projections into the label head/dependent representation spaces.
        W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head)
        b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head)
        W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep)
        b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep)

        # Biaffine arc scoring parameters.
        U_arc_1 = dy.parameter(self.U_arc_1)
        u_arc_2 = dy.parameter(self.u_arc_2)

        # Per-label biaffine scoring parameters (one entry per label).
        U_label_1 = [dy.parameter(x) for x in self.U_label_1]
        u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1]
        u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2]
        b_label = [dy.parameter(x) for x in self.b_label]

        h_arc_head = dy.rectify(dy.affine_transform([b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings]))  # n_arc_ml_units, src_len, bs
        h_arc_dep = dy.rectify(dy.affine_transform([b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings]))
        h_label_head = dy.rectify(dy.affine_transform([b_label_hidden_to_head, W_label_hidden_to_head, src_encodings]))
        h_label_dep = dy.rectify(dy.affine_transform([b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings]))

        h_arc_head_transpose = dy.transpose(h_arc_head)
        h_label_head_transpose = dy.transpose(h_label_head)

        # Arc scores: head' * (U * dep + u), a src_len x src_len matrix.
        s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep, u_arc_2)

        # Label scores: bilinear term plus separate head and dependent
        # linear terms plus a bias, computed independently per label.
        s_label = []
        for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2, b_label):
            e1 = h_label_head_transpose * U_1 * h_label_dep
            e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len))
            e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
            s_label.append(e1 + e2 + e3 + b)
        return s_arc, s_label
Exemplo n.º 9
0
 def __call__(self, X):
     """Apply summed multi-head attention and a feed-forward sublayer,
     each followed by a residual add and layer normalisation."""
     rows = X.dim()[0][0]
     cols = X.dim()[0][1]
     # Layer-norm gain (ones) and bias (zeros) matching X's shape.
     gain = dy.ones((rows, cols))
     shift = dy.zeros((rows, cols))
     # Sum the outputs of every attention head.
     attended = dy.esum([attn(X) for attn in self.attention])
     normed = dy.layer_norm(X + attended, gain, shift)
     ff_out = dy.transpose(self.feedforward(dy.transpose(normed)))
     return dy.layer_norm(normed + ff_out, gain, shift)
Exemplo n.º 10
0
def _lm_model_scores(m):
    """Score next-word candidates for all beam entries in one batched matmul.

    Relies on module-level globals ``W``, ``b``, ``input_dim`` and ``cfg``.
    ``m`` is a mutable search-state dict; ``m["scores"]`` may be updated as
    a side effect (see note below).
    """
    # assert not (cfg["use_beam_bilstm"] or cfg["use_beam_mlp"])

    idx = m["idx"]
    cur_beam_size = len(m["beam_lm_states"])
    # Tile the current input embedding across all beam entries so it can be
    # concatenated with each beam's LM hidden state.
    q = dy.reshape(m["i_embs"][idx], (input_dim, 1)) * dy.ones(
        (1, cur_beam_size))
    x = dy.concatenate([m["beam_lm_hs"], q])
    scores = W * x + b
    scores = dy.transpose(scores)

    if cfg["accumulate_scores"]:
        scores = m["acc_scores"] + scores
        # NOTE(review): m["scores"] is only written when accumulation is
        # enabled — confirm this asymmetry is intentional.
        m["scores"] = scores

    return scores
Exemplo n.º 11
0
    def train(self, train_path):
        """Train on shuffled (sentence, logical form) pairs from *train_path*."""
        with open(train_path, "r") as train:
            shuffledData = list(read_data(train))
            random.shuffle(shuffledData)

            for iPair, (sentence, lf) in enumerate(shuffledData):
                print(iPair, sentence)
                # I-Context Encoding
                lstm_forward = self.context_encoder[0].initial_state()

                # Feed each token's embedding, falling back to _UNK for OOVs.
                for entry in sentence:
                    lstm_forward = lstm_forward.add_input(
                        self.wlookup[self.w2i[entry] if entry in
                                     self.w2i else self.w2i["_UNK"]])
                hidden_context = lstm_forward.h()
                init_h = [dy.ones(self.ldims)]
                state = self.logical_form_decoder.initial_state()
                state.set_h(init_h)

                # NOTE(review): renew_cg() here discards the expressions just
                # built for this pair — confirm the placement is intended.
                dy.renew_cg()
Exemplo n.º 12
0
  def GetQDScore(self, qwds, qw2v, qvecs, dwds, dw2v, dvecs, extra,
                 train=False):
    """Score one query/document pair.

    Builds idf-informed soft gates over query terms, cosine-similarity grids
    between query and document vectors (contextual and word2vec), and an
    exact-match indicator grid, then combines per-term scores through the
    gates and a final linear layer.
    """
    # Soft term gates: one idf-weighted gate value per query term.
    qgl = [self.W_gate.expr() *
           dy.concatenate([qv, dy.constant(1, self.idf_val(qw))])
           for qv, qw in zip(qvecs, qwds)]
    qgates = dy.softmax(dy.concatenate(qgl))

    # Cosine-similarity grids: contextual vectors and raw word2vec vectors.
    sims = [[self.Cosine(qv, dv) for dv in dvecs] for qv in qvecs]
    w2v_sims = [[self.Cosine(qv, dv) for dv in dw2v] for qv in qw2v]

    # Exact lexical match indicators (1 where query word == doc word).
    matches = [[dy.ones(1) if qw == dw else dy.zeros(1) for dw in dwds]
               for qw in qwds]

    qscores = self.GetPOSIT(qvecs, sims, w2v_sims, matches)

    # Gate-weighted sum of per-term scores, then the ultimate classifier.
    qterm_score = dy.dot_product(dy.concatenate(qscores), qgates)
    fin_score = (self.W_final.expr() * dy.concatenate([qterm_score,
                                                       extra]))
    return fin_score
Exemplo n.º 13
0
 def test_set_s(self):
     """set_s on a fresh RNN state must accept four ones-vectors."""
     dy.renew_cg()
     initial = [dy.ones(10) for _ in range(4)]
     state = self.rnn.initial_state()
     state.set_s(initial)
Exemplo n.º 14
0
 def test_initial_state_vec(self):
     """initial_state must accept an explicit four-vector starting state."""
     dy.renew_cg()
     self.rnn.initial_state([dy.ones(10) for _ in range(4)])
Exemplo n.º 15
0
 def test_initial_state_vec(self):
     """Passing a caller-built initial state of four vectors must work."""
     dy.renew_cg()
     vecs = []
     for _ in range(4):
         vecs.append(dy.ones(10))
     self.rnn.initial_state(vecs)
Exemplo n.º 16
0
    def set_dropout_masks(self, batch_size):
        """Resample batched dropout masks for the input and recurrent parts."""
        ones_in = dy.ones((self.input_size), batch_size)
        ones_rec = dy.ones((self.recur_size), batch_size)
        self.input_drop_mask = dy.dropout(ones_in, self.input_drop)
        self.recur_drop_mask = dy.dropout(ones_rec, self.recur_drop)
Exemplo n.º 17
0
def calc_loss(sent, epsilon=0.0):
    """Compute VAE-style losses for one (tweet, tags) training example.

    Args:
        sent: pair ``(src, tags)`` of source token ids and tag ids.
        epsilon: schedule parameter — probability of masking one of the two
            encodings and of sampling (rather than teacher-forcing) the
            next decoder input.

    Returns:
        ``(kl_loss, softmax_loss, crossentropy_loss)`` DyNet expressions.

    Relies on module-level parameters and globals (lstm_encode/decode,
    the *_p parameter objects, embed, DROPOUT, NUM_TAGS, HIDDEN_DIM,
    VOCAB_SIZE, mlp, reparameterize).
    """
    #dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    tags = sent[1]

    # initialize the LSTM
    init_state_src = lstm_encode.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([embed[x]
                                            for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    W_mu_tweet = dy.parameter(W_mu_tweet_p)
    V_mu_tweet = dy.parameter(V_mu_tweet_p)
    b_mu_tweet = dy.parameter(b_mu_tweet_p)

    W_sig_tweet = dy.parameter(W_sig_tweet_p)
    V_sig_tweet = dy.parameter(V_sig_tweet_p)
    b_sig_tweet = dy.parameter(b_sig_tweet_p)

    # Compute tweet encoding
    mu_tweet = dy.dropout(mlp(src_output, W_mu_tweet, V_mu_tweet, b_mu_tweet),
                          DROPOUT)
    log_var_tweet = dy.dropout(
        mlp(src_output, W_sig_tweet, V_sig_tweet, b_sig_tweet), DROPOUT)

    W_mu_tag = dy.parameter(W_mu_tag_p)
    V_mu_tag = dy.parameter(V_mu_tag_p)
    b_mu_tag = dy.parameter(b_mu_tag_p)

    W_sig_tag = dy.parameter(W_sig_tag_p)
    V_sig_tag = dy.parameter(V_sig_tag_p)
    b_sig_tag = dy.parameter(b_sig_tag_p)

    # Compute tag encoding
    # Multi-hot tag vector built as a sparse tensor over NUM_TAGS slots.
    tags_tensor = dy.sparse_inputTensor([tags], np.ones((len(tags), )),
                                        (NUM_TAGS, ))

    mu_tag = dy.dropout(mlp(tags_tensor, W_mu_tag, V_mu_tag, b_mu_tag),
                        DROPOUT)
    log_var_tag = dy.dropout(mlp(tags_tensor, W_sig_tag, V_sig_tag, b_sig_tag),
                             DROPOUT)

    # Combine encodings for mean and diagonal covariance
    W_mu = dy.parameter(W_mu_p)
    b_mu = dy.parameter(b_mu_p)

    W_sig = dy.parameter(W_sig_p)
    b_sig = dy.parameter(b_sig_p)

    # Slowly phase out getting both inputs
    if random.random() < epsilon:
        mask = dy.zeros(HIDDEN_DIM)
    else:
        mask = dy.ones(HIDDEN_DIM)

    # Randomly choose which of the two encodings the mask is applied to.
    if random.random() < 0.5:
        mu_tweet = dy.cmult(mu_tweet, mask)
        log_var_tweet = dy.cmult(log_var_tweet, mask)
    else:
        mu_tag = dy.cmult(mu_tag, mask)
        log_var_tag = dy.cmult(log_var_tag, mask)

    mu = dy.affine_transform([b_mu, W_mu, dy.concatenate([mu_tweet, mu_tag])])
    log_var = dy.affine_transform(
        [b_sig, W_sig,
         dy.concatenate([log_var_tweet, log_var_tag])])

    # KL-Divergence loss computation
    kl_loss = -0.5 * dy.sum_elems(1 + log_var -
                                  dy.pow(mu, dy.inputVector([2])) -
                                  dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = lstm_decode.initial_state().set_s([z, dy.tanh(z)])
    prev_word = src[0]
    W_sm = dy.parameter(W_tweet_softmax_p)
    b_sm = dy.parameter(b_tweet_softmax_p)

    for next_word in src[1:]:
        # feed the current state into the

        current_state = current_state.add_input(embed[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])

        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        # Slowly phase out teacher forcing (this may be slow??)
        if random.random() < epsilon:
            p = dy.softmax(s).npvalue()
            prev_word = np.random.choice(VOCAB_SIZE, p=p / p.sum())
        else:
            prev_word = next_word

    softmax_loss = dy.esum(all_losses)

    W_hidden = dy.parameter(W_hidden_p)
    b_hidden = dy.parameter(b_hidden_p)

    W_out = dy.parameter(W_tag_output_p)
    b_out = dy.parameter(b_tag_output_p)

    # Predict the multi-hot tag vector back from the latent code z.
    h = dy.dropout(dy.tanh(b_hidden + W_hidden * z), DROPOUT)
    o = dy.logistic(b_out + W_out * h)

    crossentropy_loss = dy.binary_log_loss(o, tags_tensor)

    return kl_loss, softmax_loss, crossentropy_loss
Exemplo n.º 18
0
    def transduce(self, h_below: 'expression_seqs.ExpressionSequence', h_above,
                  z_below) -> 'expression_seqs.ExpressionSequence':
        """Run one HM-LSTM cell step.

        Args:
            h_below: hidden state from the layer below at this timestep.
            h_above: hidden state from the layer above at the previous
                timestep (ignored when this is the last layer).
            z_below: boundary indicator from the layer below.

        Returns:
            ``(h_new, z_new)``: updated hidden state and boundary indicator;
            both are also stored on ``self`` for the next timestep.

        NOTE(review): this code carries its own open questions in the inline
        ``??`` comments (shapes, batching, the ``stale expression`` error);
        they are preserved verbatim below.
        """
        # Lazy state init on the first timestep of a sequence.
        if self.c == None:
            self.c = dy.zeroes(
                dim=(self.hidden_dim,
                     ))  #?? does (hidden,) take care of batch_size?
        if self.h == None:
            self.h = dy.zeroes(dim=(self.hidden_dim, ))
        if self.z == None:
            self.z = dy.ones(dim=(1, ))

        W_1l_r = dy.parameter(self.p_W_1l_r)
        bias = dy.parameter(self.p_bias)
        h = dy.parameter(self.h)

        s_recur = W_1l_r * h  #matrix multiply is *, element-wise is dy.cmult. CURRERROR: stale expression
        if not self.last_layer:
            W_2l_td = dy.parameter(self.p_W_2l_td)
            W_0l_bu = dy.parameter(self.p_W_0l_bu)
            s_bottomup = W_0l_bu * h_below  #?? this is becoming (2049,). does it need to be (2049,1) to do scalar * matrix?
            s_topdown = W_2l_td * h_above
        else:
            s_topdown = dy.zeroes(
                s_recur.dim()[0][0],
            )  #?? this gets the shape e.g. ((5, 1), 1). do i actually want batch_size as well?
            s_bottomup = W_1l_r * h
        s_bottomup = dy.cmult(
            z_below, s_bottomup
        )  #to handle batched scalar * matrix -> e.g. (1x10, 2049x10)
        s_topdown = dy.cmult(
            self.z, s_topdown
        )  #will be zeros if last_layer. is this right, or should z=1 in this case ??

        fslice = s_recur + s_topdown + s_bottomup + bias  #?? checkme. bias has same shape as s_recur et al? [4*hidden+1, batch_size]?

        # Slice the fused pre-activation into the four LSTM gates.
        i_ft = dy.pick_range(fslice, 0, self.hidden_dim)
        i_it = dy.pick_range(fslice, self.hidden_dim, self.hidden_dim * 2)
        i_ot = dy.pick_range(fslice, self.hidden_dim * 2, self.hidden_dim * 3)
        i_gt = dy.pick_range(fslice, self.hidden_dim * 3, self.hidden_dim * 4)
        f_t = dy.logistic(
            i_ft + 1.0
        )  #+1.0 bc a paper said it was better to init that way (matthias)
        i_t = dy.logistic(i_it)
        o_t = dy.logistic(i_ot)
        g_t = dy.tanh(i_gt)

        #z * normal_update + (1-z)*copy: ie, when z_below is 0, z_new = z (copied prev timestamp). when z_below is 1, z_new = dy.round etc

        #hier = True
        #        z_tmp = dy.pick_range(fslice, self.hidden_dim*4,self.hidden_dim*4+1)
        #        z_tilde = dy.logistic(z_tmp)  #original: hard sigmoid + slope annealing (a)
        #        z_new = dy.cmult(1-z_below, self.z) + dy.cmult(z_below, dy.round(z_tilde, gradient_mode="straight_through_gradient"))

        #hier = False
        z_tmp = dy.pick_range(fslice, self.hidden_dim * 4,
                              self.hidden_dim * 4 + 1)
        z_tilde = dy.logistic(
            z_tmp)  #original: hard sigmoid + slope annealing (a)
        z_new = dy.round(
            z_tilde, gradient_mode="straight_through_gradient"
        )  #use straight-through estimator for gradient: step fn forward, hard sigmoid backward

        #z = z_l,t-1
        #z_below = z_l-1,t

        #        if self.z.value() == 1: #FLUSH
        #            c_new = dy.cmult(i_t, g_t)
        #            h_new = dy.cmult(o_t, dy.tanh(c_new))
        #        elif z_below.value() == 0: #COPY

        # if flush removed, only copy or normal update
        # when z_below is 0, c_new and h_new are self.c and self.h. when z_below is 1, c_new, h_new = normal update
        c_new = dy.cmult((1 - z_below), self.c) + dy.cmult(
            z_below, (dy.cmult(f_t, self.c) + dy.cmult(i_t, g_t)))
        h_new = dy.cmult((1 - z_below), self.h) + dy.cmult(
            z_below, dy.cmult(o_t, dy.tanh(c_new)))

        #        if z_below.value() == 0: #COPY
        #            c_new = self.c
        #            h_new = self.h
        #        else: #UPDATE
        #            c_new = dy.cmult(f_t, self.c) + dy.cmult(i_t, g_t)
        #            h_new = dy.cmult(o_t, dy.tanh(c_new))

        # Persist state for the next timestep.
        self.c = c_new
        self.h = h_new
        self.z = z_new

        return h_new, z_new
Exemplo n.º 19
0
    def transduce(
        self, xs: 'expression_seqs.ExpressionSequence'
    ) -> 'expression_seqs.ExpressionSequence':
        """Run a 3-layer HM-LSTM over the sequence *xs*.

        Each timestep flows bottom -> mid -> top; each layer also sees the
        previous timestep's hidden state of the layer above. Returns the
        top layer's hidden states as an ExpressionSequence (initial zero
        state stripped) and records the final top state.
        """
        batch_size = xs[0][0].dim()[1]
        h_bot = []
        h_mid = []
        h_top = []
        z_bot = []
        z_mid = []
        z_top = []

        # Reset all per-sequence cell state before transducing.
        self.top_layer.h = None
        self.top_layer.c = None
        self.top_layer.z = None
        self.mid_layer.h = None
        self.mid_layer.c = None
        self.mid_layer.z = None
        self.bottom_layer.h = None
        self.bottom_layer.c = None
        self.bottom_layer.z = None

        #?? checkme. want to init z to ones? (cherry paper)
        z_one = dy.ones(1, batch_size=batch_size)
        h_bot.append(
            dy.zeroes(dim=(self.hidden_dim, ),
                      batch_size=batch_size))  #indices for timesteps are +1
        h_mid.append(dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size))
        h_top.append(dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size))

        for i, x_t in enumerate(xs):
            h_t_bot, z_t_bot = self.bottom_layer.transduce(
                h_below=x_t, h_above=h_mid[i], z_below=z_one
            )  #uses h_t_top from layer above@previous time step, h_t_bot and z_t_bot from previous time step (saved in hmlstmcell)
            h_t_mid, z_t_mid = self.mid_layer.transduce(
                h_below=h_t_bot, h_above=h_top[i], z_below=z_t_bot
            )  #uses h_t_top from layer above@previous time step, h_t_bot and z_t_bot from previous time step (saved in hmlstmcell)
            h_t_top, z_t_top = self.top_layer.transduce(
                h_below=h_t_mid, h_above=None, z_below=z_t_mid
            )  #uses z_t_bot and h_t_bot from previous layer call, h_t_top and z_t_top from previous time step (saved in hmlstmcell)

            h_bot.append(h_t_bot)
            z_bot.append(z_t_bot)
            h_mid.append(h_t_mid)
            z_mid.append(z_t_mid)
            h_top.append(h_t_top)
            z_top.append(z_t_top)

#        #gated output module
#
#        #sigmoid
#        W_layer = dy.parameters(dim=(len(self.modules), hidden_dim)) #needs to be moved to init? num layers by hidden_dim
#        h_cat   = dy.transpose(dy.concatenate([h_bot, h_mid, h_top]))
#        dotted  = dy.dot_product(e1, e2)
#        gates   = dy.logistic(dotted)
#        #relu
#
#        om = dy.relu()

#final state is last hidden state from top layer
        self._final_states = [transducers.FinalTransducerState(h_top[-1])]
        fin_xs = expression_seqs.ExpressionSequence(expr_list=h_top[1:])
        return fin_xs  #removes the init zeros to make it same length as seq
Exemplo n.º 20
0
 def test_set_s(self):
     """A freshly created initial state must accept four ones-vectors."""
     dy.renew_cg()
     fresh = self.rnn.initial_state()
     fresh.set_s([dy.ones(10) for _ in range(4)])
Exemplo n.º 21
0
    def decode(self,
               vectors,
               tag_vectors,
               output,
               lang_id,
               weight,
               teacher_prob=1.0):
        """Compute the decoding loss for one (input, tags, output) example.

        Args:
            vectors: encoder states for the input characters.
            tag_vectors: encoder states for the morphological tags.
            output: target character sequence (EOS markers added here).
            lang_id: gold language id for the adversarial language loss.
            weight: scalar multiplier applied to the main decoding loss.
            teacher_prob: probability of teacher forcing at each step.

        Returns:
            A DyNet scalar expression holding the total loss.
        """
        # Wrap the target in EOS symbols and map characters to indices.
        output = [self.EOS] + list(output) + [self.EOS]
        output = [self.char2int[c] for c in output]

        N = len(vectors)

        input_mat = dy.concatenate_cols(vectors)
        w1dt = None
        input_mat = dy.dropout(input_mat, self.DROPOUT_PROB)

        tag_input_mat = dy.concatenate_cols(tag_vectors)
        tag_w1dt = None

        last_output_embeddings = self.output_lookup[self.char2int[self.EOS]]
        s = self.dec_lstm.initial_state().add_input(
            dy.concatenate(
                [vectors[-1], tag_vectors[-1], last_output_embeddings]))
        loss = []
        # 5-wide attention window around the previously attended position.
        prev_att = dy.zeros(5)

        if self.USE_ATT_REG:
            total_att = dy.zeros(N)
        if self.USE_TAG_ATT_REG:
            total_tag_att = dy.zeros(len(tag_vectors))

        for char in output:
            # w1dt can be computed and cached once for the entire decoding phase
            w1dt = w1dt or self.attention_w1 * input_mat
            tag_w1dt = tag_w1dt or self.tag_attention_w1 * tag_input_mat

            state = dy.concatenate(list(s.s()))

            tag_att_weights = self.attend_tags(state, tag_w1dt)
            tag_context = tag_input_mat * tag_att_weights

            tag_context2 = dy.concatenate([tag_context, tag_context])

            new_state = state + tag_context2

            att_weights = self.attend_with_prev(new_state, w1dt, prev_att)
            # BUG FIX: `context` was computed twice from the same operands;
            # the redundant duplicate statement has been removed.
            context = input_mat * att_weights
            best_ic = np.argmax(att_weights.vec_value())
            # Clamp the 5-wide window to the sequence boundaries.
            startt = min(best_ic - 2, N - 6)
            if startt < 0:
                startt = 0
            endd = startt + 5

            if N < 5:
                # Pad short sequences so prev_att always has length 5.
                prev_att = dy.concatenate([att_weights] + [dy.zeros(1)] *
                                          (5 - N))
            else:
                prev_att = att_weights[startt:endd]

            if self.USE_ATT_REG:
                total_att = total_att + att_weights
            if self.USE_TAG_ATT_REG:
                total_tag_att = total_tag_att + tag_att_weights

            vector = dy.concatenate(
                [context, tag_context, last_output_embeddings])
            s = s.add_input(vector)

            s_out = dy.dropout(s.output(), self.DROPOUT_PROB)

            out_vector = self.decoder_w * s_out + self.decoder_b
            probs = dy.softmax(out_vector)
            if teacher_prob == 1:
                last_output_embeddings = self.output_lookup[char]
            else:
                # Scheduled sampling: occasionally feed the model's argmax.
                if random() > teacher_prob:
                    out_char = np.argmax(probs.npvalue())
                    last_output_embeddings = self.output_lookup[out_char]
                else:
                    last_output_embeddings = self.output_lookup[char]

            loss.append(-dy.log(dy.pick(probs, char)))
        loss = dy.esum(loss) * weight

        if self.PREDICT_LANG:
            # Adversarial language prediction from first/last encoder states;
            # dy.flip_gradient reverses gradients so the encoder unlearns
            # language identity.
            last_enc_state = vectors[-1]
            adv_state = dy.flip_gradient(last_enc_state)
            pred_lang = dy.transpose(
                dy.transpose(adv_state) * self.lang_class_w)
            lang_probs = dy.softmax(pred_lang)
            lang_loss_1 = -dy.log(dy.pick(lang_probs, lang_id))

            first_enc_state = vectors[0]
            adv_state2 = dy.flip_gradient(first_enc_state)
            pred_lang2 = dy.transpose(
                dy.transpose(adv_state2) * self.lang_class_w)
            lang_probs2 = dy.softmax(pred_lang2)
            lang_loss_2 = -dy.log(dy.pick(lang_probs2, lang_id))
            loss += lang_loss_1 + lang_loss_2

        if self.USE_ATT_REG:
            # Encourage attention mass to distribute evenly over positions.
            loss += dy.huber_distance(dy.ones(N), total_att)
        if self.USE_TAG_ATT_REG:
            loss += dy.huber_distance(dy.ones(len(tag_vectors)), total_tag_att)

        return loss
Exemplo n.º 22
0
 def l2_normalize(x):
     """Safely L2-normalise *x*, guarding against division by zero.

     Replaces x with sign(x) * max(|x|, eps) so that the elementwise
     division is well-defined even for zero entries, and clamps the norm
     by eps before dividing.
     """
     # BUG FIX: epsilon was sized from the undefined name `pred`; it must
     # match the dimensions of the argument `x`.
     epsilon = np.finfo(float).eps * dy.ones(x.dim()[0])
     norm = dy.sqrt(dy.sum_elems(dy.square(x)))
     sign = dy.cdiv(x, dy.bmax(dy.abs(x), epsilon))
     return dy.cdiv(dy.cmult(sign, dy.bmax(dy.abs(x), epsilon)), dy.bmax(norm, epsilon[0]))
Exemplo n.º 23
0
    def __call__(self, x: dy.Expression, att_mask: np.ndarray,
                 batch_mask: np.ndarray, p: numbers.Real):
        """
    x: expression of dimensions (input_dim, time) x batch
    att_mask: numpy array of dimensions (time, time); pre-transposed
    batch_mask: numpy array of dimensions (batch, time)
    p: dropout prob
    """
        sent_len = x.dim()[0][1]
        batch_size = x[0].dim()[1]

        if self.downsample_factor > 1:
            if sent_len % self.downsample_factor != 0:
                raise ValueError(
                    "For 'reshape' downsampling, sequence lengths must be multiples of the downsampling factor. "
                    "Configure batcher accordingly.")
            if batch_mask is not None:
                batch_mask = batch_mask[:, ::self.downsample_factor]
            sent_len_out = sent_len // self.downsample_factor
            sent_len = sent_len_out
            out_mask = x.mask
            if self.downsample_factor > 1 and out_mask is not None:
                out_mask = out_mask.lin_subsampled(
                    reduce_factor=self.downsample_factor)

            x = ExpressionSequence(expr_tensor=dy.reshape(
                x.as_tensor(), (x.dim()[0][0] * self.downsample_factor,
                                x.dim()[0][1] / self.downsample_factor),
                batch_size=batch_size),
                                   mask=out_mask)
            residual = SAAMTimeDistributed()(x)
        else:
            residual = SAAMTimeDistributed()(x)
            sent_len_out = sent_len
        if self.model_dim != self.input_dim * self.downsample_factor:
            residual = self.res_shortcut.transform(residual)

        # Concatenate all the words together for doing vectorized affine transform
        if self.kq_pos_encoding_type is None:
            kvq_lin = self.linear_kvq.transform(SAAMTimeDistributed()(x))
            key_up = self.shape_projection(
                dy.pick_range(kvq_lin, 0, self.head_count * self.dim_per_head),
                batch_size)
            value_up = self.shape_projection(
                dy.pick_range(kvq_lin, self.head_count * self.dim_per_head,
                              2 * self.head_count * self.dim_per_head),
                batch_size)
            query_up = self.shape_projection(
                dy.pick_range(kvq_lin, 2 * self.head_count * self.dim_per_head,
                              3 * self.head_count * self.dim_per_head),
                batch_size)
        else:
            assert self.kq_pos_encoding_type == "embedding"
            encoding = self.kq_positional_embedder.embed_sent(
                sent_len).as_tensor()
            kq_lin = self.linear_kq.transform(SAAMTimeDistributed()(
                ExpressionSequence(
                    expr_tensor=dy.concatenate([x.as_tensor(), encoding]))))
            key_up = self.shape_projection(
                dy.pick_range(kq_lin, 0, self.head_count * self.dim_per_head),
                batch_size)
            query_up = self.shape_projection(
                dy.pick_range(kq_lin, self.head_count * self.dim_per_head,
                              2 * self.head_count * self.dim_per_head),
                batch_size)
            v_lin = self.linear_v.transform(SAAMTimeDistributed()(x))
            value_up = self.shape_projection(v_lin, batch_size)

        if self.cross_pos_encoding_type:
            assert self.cross_pos_encoding_type == "embedding"
            emb1 = dy.pick_range(dy.parameter(self.cross_pos_emb_p1), 0,
                                 sent_len)
            emb2 = dy.pick_range(dy.parameter(self.cross_pos_emb_p2), 0,
                                 sent_len)
            key_up = dy.reshape(key_up,
                                (sent_len, self.dim_per_head, self.head_count),
                                batch_size=batch_size)
            key_up = dy.concatenate_cols(
                [dy.cmult(key_up, emb1),
                 dy.cmult(key_up, emb2)])
            key_up = dy.reshape(key_up, (sent_len, self.dim_per_head * 2),
                                batch_size=self.head_count * batch_size)
            query_up = dy.reshape(
                query_up, (sent_len, self.dim_per_head, self.head_count),
                batch_size=batch_size)
            query_up = dy.concatenate_cols(
                [dy.cmult(query_up, emb2),
                 dy.cmult(query_up, -emb1)])
            query_up = dy.reshape(query_up, (sent_len, self.dim_per_head * 2),
                                  batch_size=self.head_count * batch_size)

        scaled = query_up * dy.transpose(
            key_up / math.sqrt(self.dim_per_head)
        )  # scale before the matrix multiplication to save memory

        # Apply Mask here
        if not self.ignore_masks:
            if att_mask is not None:
                att_mask_inp = att_mask * -100.0
                if self.downsample_factor > 1:
                    att_mask_inp = att_mask_inp[::self.downsample_factor, ::
                                                self.downsample_factor]
                scaled += dy.inputTensor(att_mask_inp)
            if batch_mask is not None:
                # reshape (batch, time) -> (time, head_count*batch), then *-100
                inp = np.resize(np.broadcast_to(batch_mask.T[:, np.newaxis, :],
                                                (sent_len, self.head_count, batch_size)),
                                (1, sent_len, self.head_count * batch_size)) \
                      * -100
                mask_expr = dy.inputTensor(inp, batched=True)
                scaled += mask_expr
            if self.diag_gauss_mask:
                diag_growing = np.zeros((sent_len, sent_len, self.head_count))
                for i in range(sent_len):
                    for j in range(sent_len):
                        diag_growing[i, j, :] = -(i - j)**2 / 2.0
                e_diag_gauss_mask = dy.inputTensor(diag_growing)
                e_sigma = dy.parameter(self.diag_gauss_mask_sigma)
                if self.square_mask_std:
                    e_sigma = dy.square(e_sigma)
                e_sigma_sq_inv = dy.cdiv(
                    dy.ones(e_sigma.dim()[0], batch_size=batch_size),
                    dy.square(e_sigma))
                e_diag_gauss_mask_final = dy.cmult(e_diag_gauss_mask,
                                                   e_sigma_sq_inv)
                scaled += dy.reshape(e_diag_gauss_mask_final,
                                     (sent_len, sent_len),
                                     batch_size=batch_size * self.head_count)

        # Computing Softmax here.
        attn = dy.softmax(scaled, d=1)
        if LOG_ATTENTION:
            yaml_logger.info({
                "key": "selfatt_mat_ax0",
                "value": np.average(attn.value(), axis=0).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax1",
                "value": np.average(attn.value(), axis=1).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax0_ent",
                "value": entropy(attn.value()).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax1_ent",
                "value": entropy(attn.value().transpose()).dumps(),
                "desc": self.desc
            })

        self.select_att_head = 0
        if self.select_att_head is not None:
            attn = dy.reshape(attn, (sent_len, sent_len, self.head_count),
                              batch_size=batch_size)
            sel_mask = np.zeros((1, 1, self.head_count))
            sel_mask[0, 0, self.select_att_head] = 1.0
            attn = dy.cmult(attn, dy.inputTensor(sel_mask))
            attn = dy.reshape(attn, (sent_len, sent_len),
                              batch_size=self.head_count * batch_size)

        # Applying dropout to attention
        if p > 0.0:
            drop_attn = dy.dropout(attn, p)
        else:
            drop_attn = attn

        # Computing weighted attention score
        attn_prod = drop_attn * value_up

        # Reshaping the attn_prod to input query dimensions
        out = dy.reshape(attn_prod,
                         (sent_len_out, self.dim_per_head * self.head_count),
                         batch_size=batch_size)
        out = dy.transpose(out)
        out = dy.reshape(out, (self.model_dim, ),
                         batch_size=batch_size * sent_len_out)
        #     out = dy.reshape_transpose_reshape(attn_prod, (sent_len_out, self.dim_per_head * self.head_count), (self.model_dim,), pre_batch_size=batch_size, post_batch_size=batch_size*sent_len_out)

        if self.plot_attention:
            from sklearn.metrics.pairwise import cosine_similarity
            assert batch_size == 1
            mats = []
            for i in range(attn.dim()[1]):
                mats.append(dy.pick_batch_elem(attn, i).npvalue())
                self.plot_att_mat(
                    mats[-1], "{}.sent_{}.head_{}.png".format(
                        self.plot_attention, self.plot_attention_counter, i),
                    300)
            avg_mat = np.average(mats, axis=0)
            self.plot_att_mat(
                avg_mat,
                "{}.sent_{}.head_avg.png".format(self.plot_attention,
                                                 self.plot_attention_counter),
                300)
            cosim_before = cosine_similarity(x.as_tensor().npvalue().T)
            self.plot_att_mat(
                cosim_before, "{}.sent_{}.cosim_before.png".format(
                    self.plot_attention, self.plot_attention_counter), 600)
            cosim_after = cosine_similarity(out.npvalue().T)
            self.plot_att_mat(
                cosim_after, "{}.sent_{}.cosim_after.png".format(
                    self.plot_attention, self.plot_attention_counter), 600)
            self.plot_attention_counter += 1

        # Adding dropout and layer normalization
        if p > 0.0:
            res = dy.dropout(out, p) + residual
        else:
            res = out + residual
        ret = self.layer_norm.transform(res)
        return ret
Exemplo n.º 24
0
    def cal_scores(self, src_encodings, predict=False):
        """Compute unlabeled arc scores and per-label scores via biaffine attention.

        Args:
            src_encodings: list of per-token encoder output expressions;
                concatenated into a (src_ctx_dim, src_len) matrix per batch
                element.
            predict: if True, run the MLPs without any dropout (inference
                mode); if False, apply time-dimension dropout to the encoder
                input and each MLP output.

        Returns:
            s_arc: batched (src_len, src_len) arc score matrix
                (head position x dependent position).
            s_label: list of batched (src_len, src_len) score matrices, one
                per dependency label.
        """

        src_len = len(src_encodings)
        src_encodings = dy.concatenate_cols(
            src_encodings)  # src_ctx_dim, src_len, batch_size
        batch_size = src_encodings.dim()[1]

        # Load MLP parameters into the computation graph (classic DyNet API).
        W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head)
        b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head)
        W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep)
        b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep)

        W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head)
        b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head)
        W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep)
        b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep)

        # Biaffine parameters: U_arc_1/u_arc_2 score arcs; the per-label
        # lists hold one (U, u_head, u_dep, b) tuple per dependency label.
        U_arc_1 = dy.parameter(self.U_arc_1)
        u_arc_2 = dy.parameter(self.u_arc_2)

        U_label_1 = [dy.parameter(x) for x in self.U_label_1]
        u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1]
        u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2]
        b_label = [dy.parameter(x) for x in self.b_label]

        if predict:
            # Inference path: plain MLP projections, no dropout anywhere.
            h_arc_head = self.leaky_ReLu(
                dy.affine_transform([
                    b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings
                ]))  # n_arc_ml_units, src_len, bs
            h_arc_dep = self.leaky_ReLu(
                dy.affine_transform(
                    [b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings]))
            h_label_head = self.leaky_ReLu(
                dy.affine_transform([
                    b_label_hidden_to_head, W_label_hidden_to_head,
                    src_encodings
                ]))
            h_label_dep = self.leaky_ReLu(
                dy.affine_transform([
                    b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings
                ]))
        else:
            # Training path: dropout is applied along dimension 1 (the time
            # axis), dropping whole timesteps rather than single units.

            src_encodings = dy.dropout_dim(src_encodings, 1,
                                           self.arc_mlp_dropout)

            h_arc_head = dy.dropout_dim(
                self.leaky_ReLu(
                    dy.affine_transform([
                        b_arc_hidden_to_head, W_arc_hidden_to_head,
                        src_encodings
                    ])), 1,
                self.arc_mlp_dropout)  # n_arc_ml_units, src_len, bs
            h_arc_dep = dy.dropout_dim(
                self.leaky_ReLu(
                    dy.affine_transform([
                        b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings
                    ])), 1, self.arc_mlp_dropout)
            h_label_head = dy.dropout_dim(
                self.leaky_ReLu(
                    dy.affine_transform([
                        b_label_hidden_to_head, W_label_hidden_to_head,
                        src_encodings
                    ])), 1, self.label_mlp_dropout)
            h_label_dep = dy.dropout_dim(
                self.leaky_ReLu(
                    dy.affine_transform([
                        b_label_hidden_to_dep, W_label_hidden_to_dep,
                        src_encodings
                    ])), 1, self.label_mlp_dropout)

        h_arc_head_transpose = dy.transpose(h_arc_head)
        h_label_head_transpose = dy.transpose(h_label_head)

        # Biaffine arc score: head^T * (U * dep + u), with the head-bias
        # vector u_arc_2 broadcast across columns via colwise_add.
        s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep,
                                                      u_arc_2)

        s_label = []
        for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2,
                                        b_label):
            # e1: bilinear head-dep interaction term.
            e1 = h_label_head_transpose * U_1 * h_label_dep
            # e2: head-only bias, replicated over all dependent positions.
            e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len))
            # e3: dependent-only bias, replicated over all head positions.
            e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
            s_label.append(e1 + e2 + e3 + b)
        return s_arc, s_label
Exemplo n.º 25
0
 def attend(self, context, x):
     """Pool the context by summation and return uniform attention weights.

     The weights are the softmax of an all-ones vector, i.e. 1/len(context)
     for every position; `x` is accepted but not used.
     """
     n_ctx = len(context)
     pooled = dy.esum(context)
     uniform_weights = dy.softmax(dy.ones((n_ctx, )))
     return pooled, uniform_weights
Exemplo n.º 26
0
 def l2_normalize(x):
     """Divide `x` by its L2 norm, with the squared norm clamped below by
     machine epsilon to avoid division by zero."""
     eps_floor = np.finfo(float).eps * dynet.ones((1))[0]
     squared_norm = dynet.sum_elems(dynet.square(x))
     norm = dynet.sqrt(dynet.bmax(squared_norm, eps_floor))
     return dynet.cdiv(x, norm)
Exemplo n.º 27
0
def layer_norm(xs):
    """Layer-normalize each expression in `xs` with unit gain and zero bias.

    Gain/bias shapes are taken from the first expression, so all inputs are
    expected to share the same head shape.
    """
    shape, _batch = xs[0].dim()
    gain = dy.ones(shape)
    bias = dy.zeros(shape)
    return [dy.layer_norm(expr, gain, bias) for expr in xs]
Exemplo n.º 28
0
    def get_rlstm_output(self, hypothesis, word2int, P_mat_in, prem_seq_len,
                         improvement):
        """Run attention-augmented forward/backward LSTMs over the hypothesis.

        Each timestep's hidden state is augmented with a premise attention
        vector (mLSTM-style), gated by the LSTM's own output gate; the
        attention distribution is updated after every step.

        Args:
            hypothesis: iterable of hypothesis tokens.
            word2int: mapping from token to embedding-lookup index.
            P_mat_in: premise matrix (params_size x max_seq_len); for
                improvement "2"/"3" a (forward, backward) pair of matrices.
            prem_seq_len: true premise length; positions beyond it are masked
                out of the attention.
            improvement: model-variant flag ("1", "2", "3" or other).

        Returns:
            (fw_output_vec, bw_output_vec): per-word attention-augmented
            hidden states from the forward and backward passes.
        """

        lookup = self.params["lookup"]  # get lookup parameters
        hypo_seq = [lookup[word2int.get(i)]
                    for i in hypothesis]  # get embeddings of each word

        # get initial state
        fw_s0 = self.fw_hypo_builder.initial_state()
        bw_s0 = self.bw_hypo_builder.initial_state()

        # will get the last state each time
        fw_s = fw_s0
        bw_s = bw_s0

        # get fw (and bw) parameter expressions for the attention machinery
        fw_At_prev = dy.parameter(self.params["fw_A_t0"])
        fw_Wp = dy.parameter(self.params["fw_Wp"])
        fw_Wm = dy.parameter(self.params["fw_Wm"])
        fw_Wc = dy.parameter(self.params["fw_Wc"])
        fw_Walpha = dy.parameter(self.params["fw_Walpha"])
        bw_At_prev = dy.parameter(self.params["bw_A_t0"])
        bw_Wp = dy.parameter(self.params["bw_Wp"])
        bw_Wm = dy.parameter(self.params["bw_Wm"])
        bw_Wc = dy.parameter(self.params["bw_Wc"])
        bw_Walpha = dy.parameter(self.params["bw_Walpha"])

        # create mask for the attend vector to take into account only the length of the current sequence
        if prem_seq_len < self.max_seq_len:
            mask = dy.concatenate([
                dy.ones(prem_seq_len),
                dy.zeros(self.max_seq_len - prem_seq_len)
            ])
            # bw_mask = dy.concatenate([dy.zeros(self.max_seq_len-prem_seq_len), dy.ones(prem_seq_len)])
        else:
            mask = dy.ones(prem_seq_len)
            # bw_mask = dy.ones(prem_seq_len)

        # calculate forward & backward mask
        At_mask_fw = dy.cmult(fw_At_prev, mask)
        At_mask_bw = dy.cmult(bw_At_prev, mask)

        # Improvements "2"/"3": the backward pass gets its own mask, padded
        # at the front instead of the back.
        if improvement == "2" or improvement == "3":
            if prem_seq_len < self.max_seq_len:
                bw_mask = dy.concatenate([
                    dy.zeros(self.max_seq_len - prem_seq_len),
                    dy.ones(prem_seq_len)
                ])
            else:
                bw_mask = dy.ones(prem_seq_len)
            At_mask_bw = dy.cmult(bw_At_prev, bw_mask)

        # Improvements "2"/"3" supply separate premise matrices per direction.
        if improvement == "2" or improvement == "3":
            P_mat = P_mat_in[0]
        else:
            P_mat = P_mat_in

        idx = 0
        fw_output_vec = []
        # calculate the new output with the attention of the fw lstm
        for word in hypo_seq:
            fw_s = fw_s.add_input(word)  # add input to the network
            h_t = fw_s.h()[0]  # get the output vector of the current timestep

            # get the output gate value:
            # NOTE(review): assumes the builder stacks gate rows so that rows
            # [2h, 3h) of the weight matrices belong to the output gate --
            # confirm against the LSTM builder's parameter layout.
            Weights = self.fw_hypo_builder.get_parameter_expressions()
            Wox = dy.select_rows(
                Weights[0][0], range(self.params_size * 2,
                                     self.params_size * 3))
            Woh = dy.select_rows(
                Weights[0][1], range(self.params_size * 2,
                                     self.params_size * 3))
            bo = dy.select_rows(
                Weights[0][2], range(self.params_size * 2,
                                     self.params_size * 3))
            if idx == 0:
                # First step: no previous hidden state contributes.
                out_gate = dy.logistic(Wox * word + bo)
            else:
                h_t_prev = fw_s.prev().h()[0]
                out_gate = dy.logistic(Wox * word + Woh * h_t_prev + bo)

            # matrix multiplication - [params_size x max_len_seq] x [max_len_seq x 1]
            # m dim: params_size x 1
            mt = P_mat * At_mask_fw

            # get the new out vector (attention vector gated by the output gate)
            m_gated = dy.cmult(dy.tanh(mt), out_gate)
            h_t_new = h_t + m_gated
            fw_output_vec.append(h_t_new)

            # calculate alpha (pre-softmax attention energies)
            alpha = dy.colwise_add(fw_Wp * P_mat, fw_Wm * mt)
            if idx > 0:
                s_t_prev = fw_s.prev().s()[0]
                alpha = dy.colwise_add(alpha, fw_Wc * s_t_prev)

            if improvement == "1" or improvement == "3":
                alpha = dy.tanh(alpha)

            # compute the next At (masked softmax over premise positions)
            At_fw = dy.transpose(dy.transpose(fw_Walpha) * alpha)
            At_fw_exp = dy.exp(At_fw)
            At_fw_exp_mask = dy.cmult(At_fw_exp, mask)
            At_mask_fw = dy.cdiv(At_fw_exp_mask, dy.sum_elems(At_fw_exp_mask))
            idx += 1

        if improvement == "2" or improvement == "3":
            P_mat = P_mat_in[1]
        else:
            P_mat = P_mat_in

        idx = 0
        bw_output_vec = []
        # calculate the new output with the attention of the bw lstm
        # (same structure as the forward loop, over the reversed sequence)
        for word in reversed(hypo_seq):
            bw_s = bw_s.add_input(word)  # add input to the network
            h_t = bw_s.h()[0]  # get the output vector of the current timestep

            # get the output gate value:
            Weights = self.bw_hypo_builder.get_parameter_expressions()
            Wox = dy.select_rows(
                Weights[0][0], range(self.params_size * 2,
                                     self.params_size * 3))
            Woh = dy.select_rows(
                Weights[0][1], range(self.params_size * 2,
                                     self.params_size * 3))
            bo = dy.select_rows(
                Weights[0][2], range(self.params_size * 2,
                                     self.params_size * 3))
            if idx == 0:
                out_gate = dy.logistic(Wox * word + bo)
            else:
                h_t_prev = bw_s.prev().h()[0]
                out_gate = dy.logistic(Wox * word + Woh * h_t_prev + bo)

            # matrix multiplication - [params_size x max_len_seq] x [max_len_seq x 1]
            # m dim: params_size x 1
            mt = P_mat * At_mask_bw

            # get the new out vector
            m_gated = dy.cmult(dy.tanh(mt), out_gate)
            h_t_new = h_t + m_gated
            bw_output_vec.append(h_t_new)

            # calculate alpha
            alpha = dy.colwise_add(bw_Wp * P_mat, bw_Wm * mt)
            if idx > 0:
                s_t_prev = bw_s.prev().s()[0]
                alpha = dy.colwise_add(alpha, bw_Wc * s_t_prev)

            if improvement == "1" or improvement == "3":
                alpha = dy.tanh(alpha)

            # compute the next At
            # NOTE(review): renormalization below uses the forward-padded
            # `mask`, not bw_mask, even for improvements "2"/"3" -- confirm
            # this is intended.
            At_bw = dy.transpose(dy.transpose(bw_Walpha) * alpha)
            At_bw_exp = dy.exp(At_bw)
            At_bw_exp_mask = dy.cmult(At_bw_exp, mask)
            At_mask_bw = dy.cdiv(At_bw_exp_mask, dy.sum_elems(At_bw_exp_mask))
            idx += 1

        return fw_output_vec, bw_output_vec