Example #1
def copy_src_probs_pick(token_type, token_literal):
    # Sum the copy-attention probabilities at every source position recorded
    # for this (token_type, token_literal) pair; zero if nothing was recorded.
    if token_type not in copy_atts:
        return dy.scalarInput(0.0)
    selected_indexes = copy_history[token_type][token_literal]
    if len(selected_indexes) == 0:
        return dy.scalarInput(0.0)
    probs = copy_src_probs(token_type)
    return dy.sum_elems(dy.select_rows(probs, selected_indexes))
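The select_rows/sum_elems pair above just collects the probability mass at a set of row indices. A minimal sketch of that step alone, using a hand-made toy vector in place of the copy distribution:

import numpy as np
import dynet as dy

dy.renew_cg()
probs = dy.inputTensor(np.array([0.1, 0.2, 0.3, 0.4]))   # toy distribution
mass = dy.sum_elems(dy.select_rows(probs, [1, 3]))        # 0.2 + 0.4
print(mass.value())                                       # ~0.6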
Example #2
def embed_word(self, word):
    if self.tied:
        # Tied embeddings: take the word's row from the output layer's
        # weight matrix and transpose it into a column vector.
        word_embs = self.final_mlp.layers[-1].w
        word_emb = dy.select_rows(word_embs, [word])
        word_emb = dy.transpose(word_emb)
    else:
        word_emb = dy.lookup(self.word_embs, word)
    return word_emb
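The tied branch is extracting one row of a (vocab x hidden) output weight matrix and turning it into a column vector. A small sketch with a hypothetical 5 x 3 matrix W standing in for those weights:

import numpy as np
import dynet as dy

dy.renew_cg()
W = dy.inputTensor(np.arange(15.0).reshape(5, 3))     # stand-in for tied output weights
word = 2
word_emb = dy.transpose(dy.select_rows(W, [word]))    # word 2's row as a column vector
print(word_emb.npvalue().ravel())                     # [6. 7. 8.]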
Example #3
def copy_src_probs_map(token_type, lazy=False):
    # Map each literal with recorded positions to its summed probability
    # mass; with lazy=True, return DyNet expressions instead of floats.
    if token_type not in copy_atts:
        return {}
    literal_history = copy_history[token_type]
    if all(len(history) == 0 for history in literal_history.values()):
        return {}
    probs = copy_src_probs(token_type)
    if lazy:
        return {
            literal: dy.sum_elems(dy.select_rows(probs, history))
            for literal, history in literal_history.items()
            if len(history) > 0
        }
    return {
        literal: dy.sum_elems(dy.select_rows(probs, history)).value()
        for literal, history in literal_history.items()
        if len(history) > 0
    }
Example #4
def cal_context(self, s, selected=None):
    # With `selected`, keep only those rows of the attention weights,
    # renormalize them to sum to one, and mix the corresponding vectors.
    ws = self.cal_scores(s)
    if selected is None:
        return self.es_matrix * ws, ws
    selected_ws = dy.select_rows(ws, selected)
    selected_ws = dy.cdiv(selected_ws, dy.sum_elems(selected_ws))
    return dy.concatenate_cols(
        [es[index] for index in selected]) * selected_ws, ws
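The cdiv-by-sum step renormalizes the kept weights so they again form a distribution over the selected rows. A minimal sketch of just that renormalization, with a toy weight vector:

import numpy as np
import dynet as dy

dy.renew_cg()
ws = dy.inputTensor(np.array([0.1, 0.2, 0.3, 0.4]))       # toy attention weights
selected_ws = dy.select_rows(ws, [0, 2])                  # keep rows 0 and 2
selected_ws = dy.cdiv(selected_ws, dy.sum_elems(selected_ws))
print(selected_ws.npvalue().ravel())                      # [0.25 0.75]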
Example #5
def split(x, dim=1):
    # Split a DyNet matrix into a list of single-row (dim=0) or
    # single-column (dim=1) expressions.
    head_shape, batch_size = x.dim()
    res = []
    if dim == 0:
        for i in range(head_shape[0]):
            res.append(dy.select_rows(x, [i]))
    elif dim == 1:
        for i in range(head_shape[1]):
            res.append(dy.select_cols(x, [i]))
    return res
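A quick usage sketch of split on a hand-made 2 x 3 matrix (the values are only there to make the slices easy to check):

import numpy as np
import dynet as dy

dy.renew_cg()
m = dy.inputTensor(np.arange(6.0).reshape(2, 3))
cols = split(m, dim=1)
rows = split(m, dim=0)
print(len(cols), len(rows))          # 3 columns, 2 rows
print(cols[0].npvalue().ravel())     # first column: [0. 3.]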
Example #6
    def embed_word(self, word):
        if self.tied:
            word_embs = self.output_mlp.layers[-1].w
            word_emb = dy.select_rows(word_embs, [word])
            word_emb = dy.transpose(word_emb)
        else:
            word_emb = dy.lookup(self.word_embs, word)

        # Normalize word vectors to have length one
        #word_emb_norm = dy.pow(dy.dot_product(word_emb, word_emb), self.exp)
        #word_emb = word_emb * word_emb_norm
        return word_emb
Example #7
def _lm_model_step(m, beam_indices, tag_indices):
    m["beam_lm_states"] = [
        m["beam_lm_states"][b_idx].add_input(t2e[t_idx])
        for (b_idx, t_idx) in izip(beam_indices, tag_indices)
    ]
    m["beam_lm_hs"] = dy.concatenate_cols(
        [x.output() for x in m["beam_lm_states"]])
    m["idx"] = m["idx"] + 1

    if cfg["accumulate_scores"]:
        beam_size_prev, num_tags = m["scores"].dim()[0]
        scores_flat = dy.reshape(m["scores"], (beam_size_prev * num_tags, 1))
        m["acc_scores"] = dy.select_rows(
            scores_flat, beam_indices + tag_indices * beam_size_prev)
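The final select_rows relies on DyNet storing tensors column-major, so after the reshape, entry (b, t) of the (beam_size_prev x num_tags) score matrix lands at flat row b + t * beam_size_prev. The index arithmetic also assumes beam_indices and tag_indices are NumPy arrays; with plain Python lists, + and * would concatenate and repeat them instead. A NumPy-only sketch of the index mapping:

import numpy as np

beam_size_prev, num_tags = 3, 4
scores = np.arange(12.0).reshape(beam_size_prev, num_tags)
beam_indices = np.array([0, 2])
tag_indices = np.array([1, 3])

flat = scores.reshape(-1, order="F")                       # column-major, like DyNet
picked = flat[beam_indices + tag_indices * beam_size_prev]
print(picked)                                              # [ 1. 11.]
print(scores[beam_indices, tag_indices])                   # the same entries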
Example #8
def from_input_prob(self, selected_indexes, neg=False):
    assert type(selected_indexes) == set
    selected_indexes = [
        index for index, old_index in enumerate(input_num_indexes)
        if old_index in selected_indexes
    ]
    if len(selected_indexes) == 0:
        return dy.scalarInput(0.0), dy.zeros(decoder.arg_dim)
    signed_h = self._signed_h(neg=neg)
    input_ref, probs = input_atts.cal_context(
        signed_h, selected_indexes)
    with parameters(decoder.neg_input_embed,
                    decoder.pos_input_embed) as (neg_input_embed,
                                                 pos_input_embed):
        input_ref = dy.concatenate([
            input_ref, neg_input_embed if neg else pos_input_embed
        ])
    return dy.sum_elems(dy.select_rows(
        probs, selected_indexes)), input_ref
Example #9
def from_exprs_prob(self, selected_indexes, neg=False):
    assert type(selected_indexes) == set
    selected_indexes = [
        index
        for index, old_index in enumerate(self.exprs_num_indexs)
        if old_index in selected_indexes
    ]
    if len(selected_indexes) == 0:
        return dy.scalarInput(0.0), dy.zeros(decoder.arg_dim)
    ht = dy.tanh(decoder.h2ht(self.s.output()))
    signed_h = self._signed_h(ht, neg)
    exprs_ref, probs = self.expr_atts.cal_context(
        signed_h, selected_indexes)
    with parameters(decoder.neg_exprs_embed,
                    decoder.pos_exprs_embed) as (neg_exprs_embed,
                                                 pos_exprs_embed):
        exprs_ref = dy.concatenate([
            exprs_ref, neg_exprs_embed if neg else pos_exprs_embed
        ])
    return dy.sum_elems(dy.select_rows(
        probs, selected_indexes)), exprs_ref
Example #10
File: util.py Project: lil-lab/scone
def attend(input_vectors, state, params, dropout_amount=0.):
    """Attends on some input vectors given a state and attention parameters.

    Inputs:
        input_vectors (list of dy.Expression): Vectors to attend on.
        state (dy.Expression): A state (query).
        params (dy.Expression): Attentional weights to transform the state before
            computing attention.
        dropout_amount (float, optional): The amount of dropout to apply after
            transforming the state.

    Returns:
        dy.Expression representing the weighted sum of input vectors given the
            computed attentional weights.
    """
    projected_state = dy.transpose(
        dy.reshape(state, (1, state.dim()[0][0])) * params)
    projected_state = dy.dropout(projected_state, dropout_amount)
    attention_weights = dy.select_rows(
        dy.transpose(projected_state) * dy.concatenate_cols(input_vectors),
        [0])[0]
    attention_probs = dy.softmax(attention_weights)
    context = dy.concatenate_cols(input_vectors) * attention_probs
    return context, attention_probs
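A self-contained sketch of calling attend; the dimensions, parameter collection, and random inputs below are invented purely to show the expected shapes:

import numpy as np
import dynet as dy

dy.renew_cg()
pc = dy.ParameterCollection()
state_dim, input_dim, seq_len = 4, 5, 3
params = dy.parameter(pc.add_parameters((state_dim, input_dim)))
state = dy.inputTensor(np.random.rand(state_dim))
input_vectors = [dy.inputTensor(np.random.rand(input_dim)) for _ in range(seq_len)]
context, weights = attend(input_vectors, state, params)
print(context.dim())                 # a 5-dimensional (input_dim) context vector
print(weights.npvalue().sum())       # attention weights sum to ~1.0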
Example #11
    def get_rlstm_output(self, hypothesis, word2int, P_mat_in, prem_seq_len,
                         improvement):

        lookup = self.params["lookup"]  # get lookup parameters
        hypo_seq = [lookup[word2int.get(i)]
                    for i in hypothesis]  # get embeddings of each word

        # get initial state
        fw_s0 = self.fw_hypo_builder.initial_state()
        bw_s0 = self.bw_hypo_builder.initial_state()

        # will get the last state each time
        fw_s = fw_s0
        bw_s = bw_s0

        # get fw parameter expressions
        fw_At_prev = dy.parameter(self.params["fw_A_t0"])
        fw_Wp = dy.parameter(self.params["fw_Wp"])
        fw_Wm = dy.parameter(self.params["fw_Wm"])
        fw_Wc = dy.parameter(self.params["fw_Wc"])
        fw_Walpha = dy.parameter(self.params["fw_Walpha"])
        bw_At_prev = dy.parameter(self.params["bw_A_t0"])
        bw_Wp = dy.parameter(self.params["bw_Wp"])
        bw_Wm = dy.parameter(self.params["bw_Wm"])
        bw_Wc = dy.parameter(self.params["bw_Wc"])
        bw_Walpha = dy.parameter(self.params["bw_Walpha"])

        # create mask for the attend vector to take into account only the length of the current sequence
        if prem_seq_len < self.max_seq_len:
            mask = dy.concatenate([
                dy.ones(prem_seq_len),
                dy.zeros(self.max_seq_len - prem_seq_len)
            ])
            # bw_mask = dy.concatenate([dy.zeros(self.max_seq_len-prem_seq_len), dy.ones(prem_seq_len)])
        else:
            mask = dy.ones(prem_seq_len)
            # bw_mask = dy.ones(prem_seq_len)

        # calculate forward & backward mask
        At_mask_fw = dy.cmult(fw_At_prev, mask)
        At_mask_bw = dy.cmult(bw_At_prev, mask)

        if improvement == "2" or improvement == "3":
            if prem_seq_len < self.max_seq_len:
                bw_mask = dy.concatenate([
                    dy.zeros(self.max_seq_len - prem_seq_len),
                    dy.ones(prem_seq_len)
                ])
            else:
                bw_mask = dy.ones(prem_seq_len)
            At_mask_bw = dy.cmult(bw_At_prev, bw_mask)

        if improvement == "2" or improvement == "3":
            P_mat = P_mat_in[0]
        else:
            P_mat = P_mat_in

        idx = 0
        fw_output_vec = []
        # calculate the new output with the attention of the fw lstm
        for word in hypo_seq:
            fw_s = fw_s.add_input(word)  # add input to the network
            h_t = fw_s.h()[0]  # get the output vector of the current timestep

            # get the output gate value:
            Weights = self.fw_hypo_builder.get_parameter_expressions()
            Wox = dy.select_rows(
                Weights[0][0], range(self.params_size * 2,
                                     self.params_size * 3))
            Woh = dy.select_rows(
                Weights[0][1], range(self.params_size * 2,
                                     self.params_size * 3))
            bo = dy.select_rows(
                Weights[0][2], range(self.params_size * 2,
                                     self.params_size * 3))
            if idx == 0:
                out_gate = dy.logistic(Wox * word + bo)
            else:
                h_t_prev = fw_s.prev().h()[0]
                out_gate = dy.logistic(Wox * word + Woh * h_t_prev + bo)

            # matrix multiplication - [params_size x max_len_seq] x [max_len_seq x 1]
            # m dim: params_size x 1
            mt = P_mat * At_mask_fw

            # get the new out vector
            m_gated = dy.cmult(dy.tanh(mt), out_gate)
            h_t_new = h_t + m_gated
            fw_output_vec.append(h_t_new)

            # calculate alpha
            alpha = dy.colwise_add(fw_Wp * P_mat, fw_Wm * mt)
            if idx > 0:
                s_t_prev = fw_s.prev().s()[0]
                alpha = dy.colwise_add(alpha, fw_Wc * s_t_prev)

            if improvement == "1" or improvement == "3":
                alpha = dy.tanh(alpha)

            # compute the next At
            At_fw = dy.transpose(dy.transpose(fw_Walpha) * alpha)
            At_fw_exp = dy.exp(At_fw)
            At_fw_exp_mask = dy.cmult(At_fw_exp, mask)
            At_mask_fw = dy.cdiv(At_fw_exp_mask, dy.sum_elems(At_fw_exp_mask))
            idx += 1

        if improvement == "2" or improvement == "3":
            P_mat = P_mat_in[1]
        else:
            P_mat = P_mat_in

        idx = 0
        bw_output_vec = []
        # calculate the new output with the attention of the bw lstm
        for word in reversed(hypo_seq):
            bw_s = bw_s.add_input(word)  # add input to the network
            h_t = bw_s.h()[0]  # get the output vector of the current timestep

            # get the output gate value:
            Weights = self.bw_hypo_builder.get_parameter_expressions()
            Wox = dy.select_rows(
                Weights[0][0], range(self.params_size * 2,
                                     self.params_size * 3))
            Woh = dy.select_rows(
                Weights[0][1], range(self.params_size * 2,
                                     self.params_size * 3))
            bo = dy.select_rows(
                Weights[0][2], range(self.params_size * 2,
                                     self.params_size * 3))
            if idx == 0:
                out_gate = dy.logistic(Wox * word + bo)
            else:
                h_t_prev = bw_s.prev().h()[0]
                out_gate = dy.logistic(Wox * word + Woh * h_t_prev + bo)

            # matrix multiplication - [params_size x max_len_seq] x [max_len_seq x 1]
            # m dim: params_size x 1
            mt = P_mat * At_mask_bw

            # get the new out vector
            m_gated = dy.cmult(dy.tanh(mt), out_gate)
            h_t_new = h_t + m_gated
            bw_output_vec.append(h_t_new)

            # calculate alpha
            alpha = dy.colwise_add(bw_Wp * P_mat, bw_Wm * mt)
            if idx > 0:
                s_t_prev = bw_s.prev().s()[0]
                alpha = dy.colwise_add(alpha, bw_Wc * s_t_prev)

            if improvement == "1" or improvement == "3":
                alpha = dy.tanh(alpha)

            # compute the next At
            At_bw = dy.transpose(dy.transpose(bw_Walpha) * alpha)
            At_bw_exp = dy.exp(At_bw)
            At_bw_exp_mask = dy.cmult(At_bw_exp, mask)
            At_mask_bw = dy.cdiv(At_bw_exp_mask, dy.sum_elems(At_bw_exp_mask))
            idx += 1

        return fw_output_vec, bw_output_vec