Example #1
File: test.py Project: jayantk/cnn
    def test_update(self):
        ones = np.ones((10, 10))
        updated = np.ones((10, 10)) * 0.99
        gradient = np.ones((10, 10)) * 0.01

        dy.renew_cg()
        pp1 = dy.parameter(self.p1)
        pp2 = dy.parameter(self.p2)

        a = pp1 * self.lp1[1]
        b = pp2 * self.lp2[1]
        l = dy.dot_product(a, b) / 100
        self.assertEqual(l.scalar_value(), 10, msg=str(l.scalar_value()))
        l.backward()

        self.assertTrue(np.allclose(self.p1.grad_as_array(), 0.1 * ones), msg=np.array_str(self.p1.grad_as_array()))
        self.assertTrue(np.allclose(self.p2.grad_as_array(), 0.1 * ones), msg=np.array_str(self.p2.grad_as_array()))
        self.assertTrue(np.allclose(self.lp1.grad_as_array()[1], ones[0]), msg=np.array_str(self.lp1.grad_as_array()))
        self.assertTrue(np.allclose(self.lp2.grad_as_array()[1], ones[0]), msg=np.array_str(self.lp2.grad_as_array()))

        self.trainer.update()

        self.assertTrue(np.allclose(self.p1.as_array(), ones * 0.99), msg=np.array_str(self.p1.as_array()))
        self.assertTrue(np.allclose(self.p2.as_array(), ones * 0.99), msg=np.array_str(self.p2.as_array()))
        self.assertTrue(np.allclose(self.lp1.as_array()[1], ones[0] * 0.9), msg=np.array_str(self.lp1.as_array()[1]))
        self.assertTrue(np.allclose(self.lp2.as_array()[1], ones[0] * 0.9), msg=np.array_str(self.lp2.as_array()))
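This test depends on a fixture that is not shown. A minimal setUp under which every assertion above holds; the shapes, the constant-1 initialisation, and the learning rate 0.1 are assumptions reconstructed from the expected values:

import unittest
import numpy as np
import dynet as dy

class TestUpdate(unittest.TestCase):
    def setUp(self):
        m = dy.ParameterCollection()
        self.p1 = m.add_parameters((10, 10), init=dy.ConstInitializer(1))
        self.p2 = m.add_parameters((10, 10), init=dy.ConstInitializer(1))
        self.lp1 = m.add_lookup_parameters((10, 10), init=dy.ConstInitializer(1))
        self.lp2 = m.add_lookup_parameters((10, 10), init=dy.ConstInitializer(1))
        self.trainer = dy.SimpleSGDTrainer(m, learning_rate=0.1)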
Example #2
def generate(input, enc_fwd_lstm, enc_bwd_lstm, dec_lstm):
    def sample(probs):
        rnd = random.random()
        for i, p in enumerate(probs):
            rnd -= p
            if rnd <= 0: break
        return i

    embedded = embed_sentence(input)
    encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded)

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    out = ''
    count_EOS = 0
    for i in range(len(input)*2):
        if count_EOS == 2: break
        vector = dy.concatenate([attend(encoded, s), last_output_embeddings])

        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        probs = probs.vec_value()
        next_char = sample(probs)
        last_output_embeddings = output_lookup[next_char]
        if int2char[next_char] == EOS:
            count_EOS += 1
            continue

        out += int2char[next_char]
    return out
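The inner sample() draws an index from the softmax distribution by inverse-CDF sampling. An equivalent NumPy version (a sketch; the renormalisation guards against vec_value() not summing to exactly 1):

import numpy as np

def sample(probs):
    probs = np.asarray(probs)
    return int(np.random.choice(len(probs), p=probs / probs.sum()))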
Example #3
 def expr_for_tree(self, tree):
     if tree.isleaf():
         return self.E[self.w2i.get(tree.label,0)]
     if len(tree.children) == 1:
         assert(tree.children[0].isleaf())
         emb = self.expr_for_tree(tree.children[0])
         Wi,Wo,Wu   = [dy.parameter(w) for w in self.WS]
         bi,bo,bu,_ = [dy.parameter(b) for b in self.BS]
         i = dy.logistic(Wi*emb + bi)
         o = dy.logistic(Wo*emb + bo)
         u = dy.tanh(    Wu*emb + bu)
         c = dy.cmult(i,u)
         expr = dy.cmult(o,dy.tanh(c))
         return expr
     assert(len(tree.children) == 2),tree.children[0]
     e1 = self.expr_for_tree(tree.children[0])
     e2 = self.expr_for_tree(tree.children[1])
     Ui,Uo,Uu = [dy.parameter(u) for u in self.US]
     Uf1,Uf2 = [dy.parameter(u) for u in self.UFS]
     bi,bo,bu,bf = [dy.parameter(b) for b in self.BS]
     e = dy.concatenate([e1,e2])
     i = dy.logistic(Ui*e + bi)
     o = dy.logistic(Uo*e + bo)
     f1 = dy.logistic(Uf1*e1 + bf)
     f2 = dy.logistic(Uf2*e2 + bf)
     u = dy.tanh(    Uu*e + bu)
     c = dy.cmult(i,u) + dy.cmult(f1,e1) + dy.cmult(f2,e2)
     h = dy.cmult(o,dy.tanh(c))
     expr = h
     return expr
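For reference, one plausible way the parameter lists used above could be set up in __init__ (an assumption; the embedding size and hidden size must match, here both called dim, so leaf and internal representations compose):

def __init__(self, model, vocab_size, dim, w2i):
    self.w2i = w2i
    self.E = model.add_lookup_parameters((vocab_size, dim))
    self.WS = [model.add_parameters((dim, dim)) for _ in "iou"]      # unary gates
    self.US = [model.add_parameters((dim, 2 * dim)) for _ in "iou"]  # binary gates
    self.UFS = [model.add_parameters((dim, dim)) for _ in "ff"]      # one forget gate per child
    self.BS = [model.add_parameters(dim) for _ in "iouf"]            # biases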
Example #4
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(vectors)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE*2), last_output_embeddings]))
    loss = []

    for char in output:
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
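A typical training step around decode() might look like this (a sketch; embed_sentence, encode_sentence, the LSTMs, the trainer, and the in_seq/out_seq names come from the companion examples and are assumptions here):

dy.renew_cg()
encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embed_sentence(in_seq))
loss = decode(dec_lstm, encoded, out_seq)
loss.value()      # forward pass
loss.backward()   # backward pass
trainer.update()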
Example #5
def generate(in_seq, enc_fwd_lstm, enc_bwd_lstm, dec_lstm):
    embedded = embed_sentence(in_seq)
    encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded)

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(encoded)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))

    out = ''
    count_EOS = 0
    for i in range(len(in_seq)*2):
        if count_EOS == 2: break
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector).vec_value()
        next_char = probs.index(max(probs))
        last_output_embeddings = output_lookup[next_char]
        if int2char[next_char] == EOS:
            count_EOS += 1
            continue

        out += int2char[next_char]
    return out
Example #6
def generate(sent):
    dy.renew_cg()

    src = sent


    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()

    #generate until an eos tag or the max sentence size is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])

    prev_word = sos_trg
    trg_sent = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for i in range(MAX_SENT_SIZE):
        #feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        probs = (-dy.log_softmax(s)).value()
        next_word = np.argmin(probs)  # smallest negative log-probability == most likely word

        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent
Example #7
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]


    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()
    #now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        #feed the previous word into the LSTM
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        prev_word = next_word
    return dy.esum(all_losses)
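A training loop driving calc_loss might look like this (the trainer choice and data iterator are assumptions):

import dynet as dy

trainer = dy.SimpleSGDTrainer(model)
for sent in train_data:            # sent = (src_word_ids, trg_word_ids)
    loss = calc_loss(sent)
    loss.value()                   # forward
    loss.backward()                # backward
    trainer.update()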
Example #8
def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    w1_att_src = dy.parameter(w1_att_src_p)
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    a_t = dy.transpose(dy.tanh(dy.colwise_add(fixed_attentional_component, w1_att_tgt * tgt_output_embedding))) * w2_att
    alignment = dy.softmax(a_t)
    att_output = src_output_matrix * alignment
    return att_output, alignment
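Parameter shapes that make the algebra above work out (an assumption; A is the attention dimension and H the per-direction LSTM hidden size, so bidirectional source states have size 2*H):

w1_att_src_p = model.add_parameters((A, 2 * H))  # projects the (2H, len) source matrix
w1_att_tgt_p = model.add_parameters((A, H))      # projects the target output embedding
w2_att_p = model.add_parameters(A)               # scoring vector: one weight per source position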
Example #9
 def __call__(self, obs, batched=False):
     out = self.network(obs, batched)
     W, b = dy.parameter(self.W), dy.parameter(self.b)
     As = dy.affine_transform([b, W, out])
     if self.dueling:
         W_extra, b_extra = dy.parameter(self.W_extra), dy.parameter(self.b_extra)
         V = dy.affine_transform([b_extra, W_extra, out])
         return As, V
     return As
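When dueling is enabled, the caller combines the advantage head As and the value head V into Q-values; the standard dueling-DQN aggregation looks like this (a sketch, not part of the original class):

As, V = q_network(obs)           # q_network: an instance of the class above
Q = V + As - dy.mean_elems(As)   # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)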
Example #10
    def __call__(self, x):

        assert(isinstance(x, dy.Expression))

        self.W = dy.parameter(self.pW)  # add parameters to graph as expressions # m2.add_parameters((8, len(inputs)))
        self.b = dy.parameter(self.pb)
        self.x = x

        return self.W * self.x + self.b
Example #11
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]        
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])


    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()
    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)



    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        #feed the previous word into the LSTM
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,),len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
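The masking idiom above in isolation: a length-B vector of 0/1 flags is reshaped into a batch of B scalars so it can multiply a batched loss elementwise (B = 3 in this toy sketch):

import dynet as dy

mask = [1, 1, 0]  # third sentence in the batch is already past its end
mask_expr = dy.reshape(dy.inputVector(mask), (1,), batch_size=3)
# multiplying a batched scalar loss by mask_expr zeroes the padded position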
Example #12
File: dy_model.py Project: jcyk/CWS
    def renew_cg(self):
        # renew the compute graph for every single instance
        dy.renew_cg()

        param_exprs = dict()
        param_exprs['U'] = dy.parameter(self.params['word_score_U'])
        param_exprs['pW'] = dy.parameter(self.params['predict_W'])
        param_exprs['pb'] = dy.parameter(self.params['predict_b'])
        param_exprs['<bos>'] = dy.parameter(self.params['<BoS>'])
        self.param_exprs = param_exprs
Example #13
def calc_scores(words):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])

    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]

    return scores
Example #14
    def __call__(self, obs, batched=False):
        out = obs if isinstance(obs, dy.Expression) else dy.inputTensor(obs, batched=batched)

        for i in range(self.n_layers):
            b, W = dy.parameter(self.bs[i]), dy.parameter(self.Ws[i])
            out = dy.affine_transform([b, W, out])
            if self.layer_norm and i != self.n_layers - 1:
                out = dy.layer_norm(out, self.ln_gs[i], self.ln_bs[i])
            if self.specified_activation:
                if self.activation[i] is not None:
                    out = self.activation[i](out)
            else:
                out = self.activation(out)
        return out
Example #15
def calc_scores_with_previous_tag(words, referent_tags=None):
    """
    Calculate scores using previous tag as input. If the referent tags are provided, then we will sample from previous
    referent tag or previous system prediction.
    :param words:
    :param referent_tags:
    :return:
    """
    dy.renew_cg()

    word_embs = [LOOKUP[x] for x in words]

    # Transduce all batch elements for the backward LSTM, using the original word embeddings.
    bwd_init = bwdLSTM.initial_state()
    bwd_word_reps = bwd_init.transduce(reversed(word_embs))

    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)

    scores = []
    # Transduce one by one for the forward LSTM
    fwd_init = fwdLSTM.initial_state()
    s_fwd = fwd_init

    prev_tag = start_tag

    index = 0
    for word, bwd_word_rep in zip(word_embs, reversed(bwd_word_reps)):
        # Concatenate word and tag representation just as training.
        fwd_input = dy.concatenate([word, TAG_LOOKUP[prev_tag]])
        s_fwd = s_fwd.add_input(fwd_input)
        combined_rep = dy.concatenate([s_fwd.output(), bwd_word_rep])
        score = dy.affine_transform([b, W, combined_rep])
        prediction = np.argmax(score.npvalue())

        if referent_tags:
            if sampler.sample_true():
                prev_tag = referent_tags[index]
            else:
                prev_tag = prediction
            index += 1
        else:
            prev_tag = prediction

        scores.append(score)

    return scores
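A minimal sampler compatible with sampler.sample_true() above (hypothetical; p_true is the probability of feeding the gold tag rather than the model's own prediction):

import random

class Sampler:
    def __init__(self, p_true):
        self.p_true = p_true
    def sample_true(self):
        return random.random() < self.p_true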
Example #16
def generate(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent

    #get the output of the first LSTM
    src_outputs =  [dy.concatenate([x.output(), y.output()]) for x,y in LSTM_SRC.add_inputs([LOOKUP_SRC[word] for word in src])]

    src_output = src_outputs[-1]

    #gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix



    #generate until an eos tag or the max sentence size is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])

    prev_word = sos_trg
    trg_sent = []
    attention_matrix = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)



    for i in range(MAX_SENT_SIZE):
        #feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        att_output, alignment = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        attention_matrix.append(alignment)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        probs = (-dy.log_softmax(s)).value()
        next_word = np.argmin(probs)  # smallest negative log-probability == most likely word

        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent, dy.concatenate_cols(attention_matrix).value()
Example #17
 def word_assoc_score(self, source_idx, target_idx, relation):
     """
     NOTE THAT DROPOUT IS BEING APPLIED HERE
     :param source_idx: embedding index of source atom
     :param target_idx: embedding index of target atom
     :param relation: relation type
     :return: score
     """
     # prepare
     s = self.embeddings[source_idx]
     if self.no_assoc:
         A = dy.const_parameter(self.word_assoc_weights[relation])
     else:
         A = dy.parameter(self.word_assoc_weights[relation])
     A = dy.dropout(A, self.dropout)  # reassign: dy.dropout returns a new expression
     t = self.embeddings[target_idx]
     
     # compute
     if self.mode == BILINEAR_MODE:
         return dy.transpose(s) * A * t
     elif self.mode == DIAG_RANK1_MODE:
         diag_A = dyagonalize(A[0])
         rank1_BC = A[1] * dy.transpose(A[2])
         ABC = diag_A + rank1_BC
         return dy.transpose(s) * ABC * t
     elif self.mode == TRANSLATIONAL_EMBED_MODE:
         return -dy.l2_norm(s - t + A)
     elif self.mode == DISTMULT:
         return dy.sum_elems(dy.cmult(dy.cmult(s, A), t))
Example #18
def attend(input_mat, state, w1dt):
    global attention_w2
    global attention_v
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)

    # input_mat: (encoder_state x seqlen) => input vecs concatenated as cols
    # w1dt: (attdim x seqlen)
    # w2dt: (attdim,) vector, w2 applied to the concatenated decoder state
    w2dt = w2*dy.concatenate(list(state.s()))
    # att_weights: (seqlen,) row vector
    unnormalized = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    att_weights = dy.softmax(unnormalized)
    # context: (encoder_state)
    context = input_mat * att_weights
    return context
Example #19
def attend(input_vectors, state):
    global attention_w1
    global attention_w2
    global attention_v
    w1 = dy.parameter(attention_w1)
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    attention_weights = []

    w2dt = w2*dy.concatenate(list(state.s()))
    for input_vector in input_vectors:
        attention_weight = v*dy.tanh(w1*input_vector + w2dt)
        attention_weights.append(attention_weight)
    attention_weights = dy.softmax(dy.concatenate(attention_weights))
    output_vectors = dy.esum([vector*attention_weight for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
Example #20
 def ergm_score(self):
     """
     :return: ERGM score (dynet Expression) computed based on ERGM weights and features only
     Does not populate any field
     """
     W = dy.parameter(self.ergm_weights)
     f = dy.transpose(dy.inputVector([self.feature_vals[k] for k in self.feature_set]))
     return f * W
Example #21
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    W_mean = dy.parameter(W_mean_p)
    V_mean = dy.parameter(V_mean_p)
    b_mean = dy.parameter(b_mean_p)

    W_var = dy.parameter(W_var_p)
    V_var = dy.parameter(V_var_p)
    b_var = dy.parameter(b_var_p)

    # The mean vector from the encoder.
    mu = mlp(src_output, W_mean, V_mean, b_mean)
    # This is the diagonal of the log-covariance matrix from the encoder
    # (treating this as the log variance is easier for future implementation)
    log_var = mlp(src_output, W_var, V_var, b_var)

    # Compute KL[N(mu(x), sigma(x)) || N(0, I)]
    # = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        # feed the previous word into the LSTM
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        prev_word = next_word

    softmax_loss = dy.esum(all_losses)

    return kl_loss, softmax_loss
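calc_loss relies on two helpers that are not shown in this example. One reading consistent with how they are called (an assumption, not the original code):

import dynet as dy

def mlp(x, W, V, b):
    # one hidden layer: V * tanh(W * x + b)
    return V * dy.tanh(W * x + b)

def reparameterize(mu, log_var):
    # z = mu + sigma * eps with eps ~ N(0, I); log_var holds log(sigma^2)
    eps = dy.random_normal(mu.dim()[0])
    return mu + dy.cmult(dy.exp(log_var * 0.5), eps)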
Example #22
def calc_reinforce_loss(words, tags, delta):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])

    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)

    #calculate the probability distribution 
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]
    losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)]
    probs = [dy.softmax(score).npvalue() for score in scores]

    #then take samples from the probability distribution
    samples = [np.random.choice(range(len(x)), p=x) for x in probs]

    #calculate accuracy=reward
    correct = [sample == tag for sample, tag in zip(samples, tags)]
    r_i = float(sum(correct))/len(correct)
    r = dy.constant((1), r_i)
    # Reward baseline for each word
    W_bl = dy.parameter(W_bl_p)
    b_bl = dy.parameter(b_bl_p)
    r_b = [dy.affine_transform([b_bl, W_bl, dy.nobackprop(x)]) for x in word_reps]

    #we need to take the value in order to break the computation graph
    #as the reward portion is trained separately and not backpropagated through during the overall score
    rewards_over_baseline = [(r - dy.nobackprop(x)) for x in r_b]
    #the scores for training the baseline
    baseline_scores = [dy.square(r - x) for x in r_b]

    #then calculate the reinforce scores using reinforce
    reinforce_scores = [r_s * loss for r_s, loss in zip(rewards_over_baseline, losses)]

    #we want the first len(sent)-delta scores from xent then delta scores from reinforce
    #for mixer
    if len(losses) > delta:
        mixer_scores = losses[:len(losses) - delta] + reinforce_scores[-delta:]
    else:
        mixer_scores = reinforce_scores
    return dy.esum(mixer_scores), dy.esum(baseline_scores)
Example #23
    def BuildLMGraph(self, sents):
        dy.renew_cg()
        # initialize the RNN
        init_state = self.builder.initial_state()
        # parameters -> expressions
        R = dy.parameter(self.R)
        bias = dy.parameter(self.bias)

        S = vocab.w2i["<s>"]
        # get the cids and masks for each step
        tot_chars = 0
        cids = []
        masks = []

        for i in range(len(sents[0])):
            cids.append([(vocab.w2i[sent[i]] if len(sent) > i else S) for sent in sents])
            mask = [(1 if len(sent)>i else 0) for sent in sents]
            masks.append(mask)
            tot_chars += sum(mask)

        # start the rnn with "<s>"
        init_ids = cids[0]
        s = init_state.add_input(dy.lookup_batch(self.lookup, init_ids))

        losses = []

        # feed char vectors into the RNN and predict the next char
        for cid, mask in zip(cids[1:], masks[1:]):
            score = dy.affine_transform([bias, R, s.output()])
            loss = dy.pickneglogsoftmax_batch(score, cid)
            # mask the loss if at least one sentence is shorter
            if mask[-1] != 1:
                mask_expr = dy.inputVector(mask)
                mask_expr = dy.reshape(mask_expr, (1,), len(sents))
                loss = loss * mask_expr

            losses.append(loss)
            # update the state of the RNN
            cemb = dy.lookup_batch(self.lookup, cid)
            s = s.add_input(cemb)

        return dy.sum_batches(dy.esum(losses)), tot_chars
Example #24
def build_tagging_graph(words):
    dy.renew_cg()
    # parameters -> expressions
    H = dy.parameter(pH)
    O = dy.parameter(pO)

    # initialize the RNNs
    f_init = fwdRNN.initial_state()
    b_init = bwdRNN.initial_state()

    cf_init = cFwdRNN.initial_state()
    cb_init = cBwdRNN.initial_state()

    # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word.
    wembs = [word_rep(w, cf_init, cb_init) for w in words]
    wembs = [dy.noise(we,0.2) for we in wembs] # optional

    # feed word vectors into biLSTM
    fw_exps = f_init.transduce(wembs)
    bw_exps = b_init.transduce(reversed(wembs))
# OR
#    fw_exps = []
#    s = f_init
#    for we in wembs:
#        s = s.add_input(we)
#        fw_exps.append(s.output())
#    bw_exps = []
#    s = b_init
#    for we in reversed(wembs):
#        s = s.add_input(we)
#        bw_exps.append(s.output())

    # biLSTM states
    bi_exps = [dy.concatenate([f,b]) for f,b in zip(fw_exps, reversed(bw_exps))]

    # feed each biLSTM state to an MLP
    exps = []
    for x in bi_exps:
        r_t = O*(dy.tanh(H * x))
        exps.append(r_t)

    return exps
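Turning the returned expressions into predicted tags (a sketch; the i2t index-to-tag map is an assumption):

import numpy as np

exps = build_tagging_graph(words)
tags = [i2t[np.argmax(e.npvalue())] for e in exps]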
Example #25
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE*2), last_output_embeddings]))
    loss = []
    for char in output:
        vector = dy.concatenate([attend(vectors, s), last_output_embeddings])

        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
Example #26
def create_network_return_best(inputs):
    '''
    inputs is a list of numbers
    '''
    dy.renew_cg()
    W = dy.parameter(pW)
    b = dy.parameter(pB)

    if len(inputs) > documentLength:
        inputs = inputs[0:documentLength]
    
    emb_vectors = [lookup[i] for i in inputs]
    
    while(len(emb_vectors) < documentLength):
        pad = dy.vecInput(embDimension)
        pad.set(np.zeros(embDimension))
        emb_vectors.append(pad)    
    
    net_input = dy.concatenate(emb_vectors)
    net_output = dy.softmax( (W*net_input) + b)
    return np.argmax(net_output.npvalue())
Example #27
File: dy_model.py Project: jcyk/CWS
    def word_repr(self, char_seq):
        # obtain the word representation when given its character sequence
        wlen = len(char_seq)
        if 'rgW%d'%wlen not in self.param_exprs:
            self.param_exprs['rgW%d'%wlen] = dy.parameter(self.params['reset_gate_W'][wlen-1])
            self.param_exprs['rgb%d'%wlen] = dy.parameter(self.params['reset_gate_b'][wlen-1])
            self.param_exprs['cW%d'%wlen] = dy.parameter(self.params['com_W'][wlen-1])
            self.param_exprs['cb%d'%wlen] = dy.parameter(self.params['com_b'][wlen-1])
            self.param_exprs['ugW%d'%wlen] = dy.parameter(self.params['update_gate_W'][wlen-1])
            self.param_exprs['ugb%d'%wlen] = dy.parameter(self.params['update_gate_b'][wlen-1])
          
        chars = dy.concatenate(char_seq)
        reset_gate = dy.logistic(self.param_exprs['rgW%d'%wlen] * chars + self.param_exprs['rgb%d'%wlen])
        comb = dy.concatenate([dy.tanh(self.param_exprs['cW%d'%wlen] * dy.cmult(reset_gate,chars) + self.param_exprs['cb%d'%wlen]),chars])
        update_logits = self.param_exprs['ugW%d'%wlen] * comb + self.param_exprs['ugb%d'%wlen]
        
        update_gate = dy.transpose(dy.concatenate_cols([dy.softmax(dy.pickrange(update_logits,i*(wlen+1),(i+1)*(wlen+1))) for i in range(self.options['ndims'])]))
        
        # The following implementation of the softmax function is not numerically safe, but faster...
        #exp_update_logits = dy.exp(dy.reshape(update_logits,(self.options['ndims'],wlen+1)))
        #update_gate = dy.cdiv(exp_update_logits, dy.concatenate_cols([dy.sum_cols(exp_update_logits)] *(wlen+1)))
        #assert (not np.isnan(update_gate.npvalue()).any())

        word = dy.sum_cols(dy.cmult(update_gate,dy.reshape(comb,(self.options['ndims'],wlen+1))))
        return word
Example #28
 def expr_for_tree(self, tree):
     if tree.isleaf():
         return self.E[self.w2i.get(tree.label,0)]
     if len(tree.children) == 1:
         assert(tree.children[0].isleaf())
         expr = self.expr_for_tree(tree.children[0])
         return expr
     assert(len(tree.children) == 2),tree.children[0]
     e1 = self.expr_for_tree(tree.children[0])
     e2 = self.expr_for_tree(tree.children[1])
     W = dy.parameter(self.W)
     expr = dy.tanh(W*dy.concatenate([e1,e2]))
     return expr
Example #29
def create_network_return_loss(inputs, expected_output):
    '''
    inputs is a list of numbers
    '''
    dy.renew_cg()
    W = dy.parameter(pW) # from parameters to expressions
    b = dy.parameter(pB)
    
    if len(inputs) > documentLength:
        inputs = inputs[0:documentLength]
    
    emb_vectors = [lookup[i] for i in inputs]
    
    while(len(emb_vectors) < documentLength):
        pad = dy.vecInput(embDimension)
        pad.set(np.zeros(embDimension))
        emb_vectors.append(pad)
    
    net_input = dy.concatenate(emb_vectors)
    net_output = dy.softmax( (W*net_input) + b)
    loss = -dy.log(dy.pick(net_output, expected_output))
    return loss
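How the loss helper fits into a training loop (a sketch; the trainer choice and the data iterator are assumptions):

trainer = dy.AdamTrainer(m)
for inputs, label in train_data:
    loss = create_network_return_loss(inputs, label)
    loss.value()     # forward
    loss.backward()  # backward
    trainer.update()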
Example #30
    def sample(self, first=1, nchars=0, stop=-1):
        res = [first]
        dy.renew_cg()
        state = self.builder.initial_state()

        R = dy.parameter(self.R)
        bias = dy.parameter(self.bias)
        cw = first
        while True:
            x_t = dy.lookup(self.lookup, cw)
            state = state.add_input(x_t)
            y_t = state.output()
            r_t = bias + (R * y_t)
            ydist = dy.softmax(r_t)
            dist = ydist.vec_value()
            rnd = random.random()
            for i,p in enumerate(dist):
                rnd -= p
                if rnd <= 0: break
            res.append(i)
            cw = i
            if cw == stop: break
            if nchars and len(res) > nchars: break
        return res
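Usage sketch (the vocab object and its boundary symbols are assumptions):

ids = lm.sample(first=vocab.w2i["<s>"], stop=vocab.w2i["</s>"], nchars=200)
text = ''.join(vocab.i2w[i] for i in ids)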
Example #31
WEMB_DIM = 128
RNN_HIDDEN_DIM = 64
HIDDEN_DIM = 32

pWembs = model.add_lookup_parameters((num_words, WEMB_DIM))
pH = model.add_parameters((HIDDEN_DIM, RNN_HIDDEN_DIM))
pHb = model.add_parameters(HIDDEN_DIM)
pO = model.add_parameters((num_tags, HIDDEN_DIM))
pOb = model.add_parameters(num_tags)

rnn_builder = dy.BiRNNBuilder(1, WEMB_DIM, RNN_HIDDEN_DIM, model,
                              dy.LSTMBuilder)

dy.renew_cg()

H = dy.parameter(pH)
Hb = dy.parameter(pHb)
O = dy.parameter(pO)
Ob = dy.parameter(pOb)

indexed_words, indexed_gold_tags = zip(*[(w2i[w], t2i[t])
                                         for w, t in train_sentence])

wembs = [pWembs[wi] for wi in indexed_words]
noised_wembs = [dy.noise(we, 0.1) for we in wembs]

rnn_outputs = rnn_builder.transduce(noised_wembs)

errs = []
for rnn_output, gold_tag in zip(rnn_outputs, indexed_gold_tags):
    hidden = dy.tanh(dy.affine_transform([Hb, H, rnn_output]))
    # the source is cut off here; a plausible continuation (an assumption),
    # following the pO/pOb pattern defined above:
    errs.append(dy.pickneglogsoftmax(dy.affine_transform([Ob, O, hidden]), gold_tag))
Example #32
    def transduce(self, input, _true_output=None, feats=None):

        # convert _true_output string to list of vocabulary indeces
        if _true_output:
            try:
                true_output = [self.char_vocab.w2i[a] for a in _true_output]
            except KeyError as e:
                print(e)
                print(_true_output)
            true_output += [self.STOP]
            true_output = list(reversed(true_output))

        R = dy.parameter(self.R)  # hidden to vocabulary
        bias = dy.parameter(self.bias)
        W_c = dy.parameter(self.W_c)
        W__a = dy.parameter(self.W__a)
        U__a = dy.parameter(self.U__a)
        v__a = dy.parameter(self.v__a)

        # biLSTM encoder of input string
        input = [BEGIN_CHAR] + [c for c in input] + [STOP_CHAR]

        input_emb = []
        for char_ in reversed(input):
            char_id = self.char_vocab.w2i.get(char_, self.UNK)
            char_embedding = self.VOCAB_LOOKUP[char_id]
            input_emb.append(char_embedding)
        biencoder = self.bilstm_transduce(self.fbuffRNN, self.bbuffRNN,
                                          input_emb)

        losses = []
        output = []
        pred_history = [self.BEGIN]  # <
        s = self.decoder.initial_state()

        while not len(pred_history) == MAX_PRED_SEQ_LEN:
            # compute probability over vocabulary and choose a prediction
            # either from the true prediction at train time or based on the model at test time

            # decoder next state
            prev_pred_id = pred_history[-1]
            s = s.add_input(self.VOCAB_LOOKUP[prev_pred_id])

            # soft attention vector
            scores = [
                v__a * dy.tanh(W__a * s.output() + U__a * h_input)
                for h_input in biencoder
            ]
            alphas = dy.softmax(dy.concatenate(scores))
            c = dy.esum([
                h_input * dy.pick(alphas, j)
                for j, h_input in enumerate(biencoder)
            ])

            # softmax over vocabulary
            h_output = dy.tanh(W_c * dy.concatenate([s.output(), c]))
            probs = dy.softmax(R * h_output + bias)

            if _true_output is None:
                pred_id = np.argmax(probs.npvalue())
            else:
                pred_id = true_output.pop()

            losses.append(-dy.log(dy.pick(probs, pred_id)))
            pred_history.append(pred_id)

            if pred_id == self.STOP:
                break
            else:
                pred_char = self.char_vocab.i2w.get(pred_id, UNK_CHAR)
                output.append(pred_char)

        output = u''.join(output)
        return ((dy.average(losses) if losses else None), output)
Example #33
def run_test(train_path,
             test_path,
             test_output_path,
             hidden_layer,
             embedding_size,
             context_size,
             saved_model_path,
             activate_sub_word=False):

    from time import gmtime, strftime

    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    TRAIN, VOCAB, LABELS = t1.read_data(train_path)  # get training set
    TEST, VOCAB_TEST = read_test_data(test_path)  # get dev set

    VOCAB["<BOS>"] = 2  # a word for representing the beginning of a sentence
    VOCAB["<EOS>"] = 2  # a word for representing the end of a sentence
    VOCAB["<UNK>"] = 1

    # change TEST dataset by replacing words that did not appear in the vocab with <UNK>
    TEST_ORIG = list(TEST)
    TEST = []
    for sentence in TEST_ORIG:
        new_sentence = []
        for word in sentence:
            word = word.lower()
            if word in VOCAB.keys():
                new_sentence.append(word)
            else:
                new_sentence.append("<UNK>")
        TEST.append(new_sentence)

    # if the feature for taking into account sub words is activated
    if activate_sub_word:
        for word in list(VOCAB.keys()):
            if word not in ["<BOS>", "<EOS>", "<UNK>"]:
                words_from_word = get_sub_words(word)
                for part_word in words_from_word:
                    if part_word not in VOCAB:
                        VOCAB[part_word] = 1

    m = dy.ParameterCollection()  # create parameter collection

    #define parameters
    pW1 = m.add_parameters((hidden_layer, embedding_size * context_size))
    pb1 = m.add_parameters(hidden_layer)
    pW2 = m.add_parameters((len(LABELS), hidden_layer))
    pb2 = m.add_parameters(len(LABELS))
    params = [pW1, pb1, pW2, pb2]
    e = m.add_lookup_parameters((len(VOCAB), embedding_size))  #
    # load the parameters
    print(dy.parameter(pW1).value())
    m.populate(saved_model_path)
    print(dy.parameter(pW1).value())

    L2I = {l: i
           for i, l in enumerate(list(sorted(LABELS.keys())))
           }  # enumerate the labels as 0,1,2,...
    F2I = {f: i
           for i, f in enumerate(list(sorted(VOCAB.keys())))
           }  # enumerate the vocabulary as 0,1,2,...

    predictions = prediction(TEST, L2I, F2I, params, e, activate_sub_word)

    # write predictions to file
    with open(test_output_path, "w") as f:
        for idx, sentence in enumerate(predictions):
            for word_idx, (word, label) in enumerate(sentence):
                orig_word = TEST_ORIG[idx][word_idx]
                f.write(orig_word + " " + label + "\n")
            f.write("\n")
Example #34
    def transduce(self, lemma, feats, oracle_actions=None, external_cg=True, sampling=False,
                  unk_avg=True, verbose=False):
        """
        Transduce an encoded lemma and features.
        Args:
            lemma: The input lemma, a list of integer character codes.
            feats: The features determining the morphological transformation. The most common
                   format is a list of integer codes, one code per feature-value pair.
            oracle_actions: `None` means prediction.
                            List of action codes is a static oracle.
                            A dictionary of keys (explained below) is the config for a dynamic oracle.
                                * "target_word": List of action codes for the target word form.
                                * "loss": Which loss function to use (softmax-margin, NLL, MSE).
                                * "rollout_mixin_beta": How to mix reference and learned roll-outs
                                    (1 is only reference, 0 is only model).
                                * "global_rollout": Whether to use one type of roll-out (expert or model)
                                    at the sequence level.
                                * "optimal": Whether to use an optimal or noisy (=buggy) expert
                                * "bias_inserts": Whether to use a buggy roll-out for inserts
                                    (which makes them as cheap as copies)
            external_cg: Whether or not an external computation graph is defined.
            sampling: Whether or not sampling should be used for decoding (e.g. for MRT) or
                      training (e.g. dynamic oracles with exploration / learned roll-ins).
            dynamic: Whether or not `oracle_actions` is a static oracle (list of actions) or a configuration
                     for a dynamic oracle.
            unk_avg: Whether or not to average all char embeddings to produce UNK embedding
                     (see `self._build_lemma`).
            verbose: Whether or not to report on processing steps.
        """
        # Returns an expression of the loss for the sequence of actions.
        # (that is, the oracle_actions if present or the predicted sequence otherwise)
        def _valid_actions(encoder):
            valid_actions = []
            if len(encoder) > 1:
                valid_actions += [COPY, DELETE]
            else:
                valid_actions += [END_WORD]
            valid_actions += self.INSERTS
            return valid_actions

        if not external_cg:
            dy.renew_cg()

        dynamic = None  # indicates prediction or static

        if oracle_actions:
            # if not, then prediction
            if isinstance(oracle_actions, dict):
                # dynamic oracle:
                # @TODO NB target word is not wrapped in boundary tags
                target_word = oracle_actions['target_word']
                generation_errors = set()
                dynamic = oracle_actions
            else:
                # static oracle:
                # reverse to enable simple popping
                oracle_actions = oracle_actions[::-1]
                oracle_actions.pop()  # COPY of BEGIN_WORD_CHAR

        # vectorize lemma
        lemma_enc = self._build_lemma(lemma, unk_avg, is_training=bool(oracle_actions))

        # vectorize features
        features = self._build_features(*feats)

        # add encoder and decoder to computation graph
        encoder = Encoder(self.fbuffRNN, self.bbuffRNN)
        decoder = self.wordRNN.initial_state()

        # add classifier to computation graph
        if self.MLP_DIM:
            # decoder output to hidden
            W_s2h = dy.parameter(self.pW_s2h)
            b_s2h = dy.parameter(self.pb_s2h)
        # hidden to action
        W_act = dy.parameter(self.pW_act)
        b_act = dy.parameter(self.pb_act)

        # encoder is a stack which pops lemma characters and their
        # representations from the top. Thus, to get lemma characters
        # in the right order, the lemma has to be reversed.
        encoder.transduce(lemma_enc, lemma)

        encoder.pop()  # BEGIN_WORD_CHAR
        action_history = [COPY]
        word = []
        losses = []

        if verbose and not dynamic:
            count = 0
            print()
            print(action2string(oracle_actions, self.vocab))
            print(lemma2string(lemma, self.vocab))
            
            
        if dynamic:
            # use model rollout for the whole of this sequence
            rollout_on = dynamic['global_rollout'] and np.random.rand() > dynamic['rollout_mixin_beta']
        
        while len(action_history) <= MAX_ACTION_SEQ_LEN:
            
            if verbose and not dynamic:
                print('Action: ', count, self.vocab.act.i2w[action_history[-1]])
                print('Encoder length, char: ', lemma, len(encoder), self.vocab.char.i2w[encoder.s[-1][-1]])
                print('word: ', ''.join(word))
                print(('Remaining actions: ', oracle_actions, action2string(oracle_actions, self.vocab)))
                count += 1
            
            # compute probability of each of the actions and choose an action
            # either from the oracle or if there is no oracle, based on the model
            valid_actions = _valid_actions(encoder)
            encoder_embedding = encoder.embedding()
            # decoder
            decoder_input = dy.concatenate([encoder_embedding,
                                            features,
                                            self.ACT_LOOKUP[action_history[-1]]
                                           ])
            decoder = decoder.add_input(decoder_input)
            # classifier
            if self.double_feats:
                classifier_input = dy.concatenate([decoder.output(), features])
            else:
                classifier_input = decoder.output()
            if self.MLP_DIM:
                h = self.NONLIN(W_s2h * classifier_input + b_s2h)
            else:
                h = classifier_input
            logits = W_act * h + b_act
            # get action (argmax, sampling, or use oracle actions)
            if oracle_actions is None:
                # predicting by argmax or sampling
                logits_cpu = logits # dy.to_device(logits, 'CPU')
                log_probs = dy.log_softmax(logits, valid_actions)
                log_probs_np = log_probs.npvalue()
                if sampling:
                    action = sample(log_probs_np)
                else:
                    action = np.argmax(log_probs_np)
                losses.append(dy.pick(log_probs, action))
            elif dynamic:
                # training with dynamic oracle
                if rollout_on or (not dynamic['global_rollout'] and np.random.rand() > dynamic['rollout_mixin_beta']):
                    # the second disjunct allows for model roll-out applied locally
                    logits_cpu = logits # dy.to_device(logits, 'CPU')
                    rollout = lambda action: self.rollout(action, dy.log_softmax(logits, valid_actions),
                                                          action_history, features, decoder, encoder, word,
                                                          W_act, b_act)  # @TODO W_s2h ...
                else:
                    rollout = None
                
                optim_actions, costs = oracle_with_rollout(word, target_word, encoder.get_extra(),
                                                           valid_actions, rollout, self.vocab,
                                                           optimal=dynamic['optimal'],
                                                           bias_inserts=dynamic['bias_inserts'],
                                                           errors=generation_errors,
                                                           verbose=verbose)
                logits_cpu = logits # dy.to_device(logits, 'CPU')
                log_probs = dy.log_softmax(logits, valid_actions)
                log_probs_np = log_probs.npvalue()
                if sampling == 1. or np.random.rand() <= sampling:
                    # action is picked by sampling
                    action = sample(log_probs_np)
                    # @TODO IL learned roll-ins are done with policy i.e. greedy / beam search decoding
                    if verbose: print('Rolling in with model: ', action, self.vocab.act.i2w[action]) 
                else:
                    # action is picked from optim_actions
                    action = optim_actions[np.argmax([log_probs_np[a] for a in optim_actions])]
                    #print [log_probs_np[a] for a in optim_actions]
                # loss is over all optimal actions.
                
                if dynamic['loss'] == 'softmax-margin':
                    loss = log_sum_softmax_margin_loss(optim_actions, logits, self.NUM_ACTS,
                                                       costs=costs, valid_actions=None, verbose=verbose)
                elif dynamic['loss'] == 'nll':
                    loss = log_sum_softmax_loss(optim_actions, logits, self.NUM_ACTS, 
                                                valid_actions=valid_actions, verbose=verbose)
                elif dynamic['loss'] == 'mse':
                    loss = cost_sensitive_reg_loss(optim_actions, logits, self.NUM_ACTS,
                                                   # NB expects both costs and valid actions!
                                                   costs=costs, valid_actions=valid_actions, verbose=verbose)
                ################
                else:
                    raise NotImplementedError
                losses.append(loss)
                #print 'Action'
                #print action
                #print self.vocab.act.i2w[action]
            else:
                # training with static oracle
                action = oracle_actions.pop()
                log_probs = dy.log_softmax(logits, valid_actions)
                losses.append(dy.pick(log_probs, action))

            action_history.append(action)

            #print 'action, log_probs: ', action, self.vocab.act.i2w[action], losses[-1].scalar_value(), log_probs.npvalue()
            
            # execute the action to update the transducer state
            if action == COPY:
                # 1. Increment attention index
                try:
                    char_ = encoder.pop()
                except IndexError as e:
                    print(np.exp(log_probs.npvalue()))
                    print('COPY: ', action)
                # 2. Append copied character to the output word
                word.append(self.vocab.char.i2w[char_])
            elif action == DELETE:               
                # 1. Increment attention index
                try:
                    encoder.pop()
                except IndexError as e:
                    print(np.exp(log_probs.npvalue()))
                    print('DELETE: ', action)
            elif action == END_WORD:
                # 1. Finish transduction
                break
            else:
                # one of the INSERT actions
                assert action in self.INSERTS
                # 1. Append inserted character to the output word
                char_ = self.vocab.act.i2w[action]
                word.append(char_)
                
        word = ''.join(word)

        return losses, word, action_history
Example #35
regression_hidden_size = 300

#pretrained embeddings
embedding_dim = emb_matrix_pretrained.shape[1]
embedding_parameters = RNN_model.lookup_parameters_from_numpy(emb_matrix_pretrained)

#add RNN unit
fw_RNN_unit = dy.LSTMBuilder(num_layers, embedding_dim, hidden_size, RNN_model)
bw_RNN_unit = dy.LSTMBuilder(num_layers, embedding_dim, hidden_size, RNN_model)

second_fw_RNN_unit = dy.LSTMBuilder(num_layers, 2*hidden_size, hidden_size, RNN_model)
second_bw_RNN_unit = dy.LSTMBuilder(num_layers, 2*hidden_size, hidden_size, RNN_model)
 
pv1 = RNN_model.add_parameters(
        (regression_hidden_size, 2*hidden_size))
dy.parameter(pv1).npvalue().shape

 
pb1 = RNN_model.add_parameters(
        (regression_hidden_size)        
)
dy.parameter(pb1).npvalue().shape



pv2 = RNN_model.add_parameters(
        (regression_hidden_size))
dy.parameter(pv2).npvalue().shape

 
pb2 = RNN_model.add_parameters(
Example #36
    def transduce(self,
                  lemma,
                  feats,
                  oracle_actions=None,
                  external_cg=True,
                  sampling=False,
                  unk_avg=True,
                  debug_mode=False):
        def _valid_actions(encoder):
            valid_actions = list(self.INSERTS)
            if len(encoder) > 1:
                valid_actions += [STEP]
            else:
                valid_actions += [END_WORD]
            return valid_actions

        if not external_cg:
            dy.renew_cg()

        if oracle_actions:
            # reverse to enable simple popping
            oracle_actions = oracle_actions[::-1]
            oracle_actions.pop()  # Deterministic insertion of BEGIN_WORD

        # vectorize lemma
        lemma_enc = self._build_lemma(lemma,
                                      unk_avg,
                                      is_training=bool(oracle_actions))

        # vectorize features
        features = self._build_features(*feats)

        # add encoder and decoder to computation graph
        encoder = Encoder(self.fbuffRNN, self.bbuffRNN)
        decoder = self.wordRNN.initial_state()

        # add classifier to computation graph
        if self.MLP_DIM:
            # decoder output to hidden
            W_s2h = dy.parameter(self.pW_s2h)
            b_s2h = dy.parameter(self.pb_s2h)
        # hidden to action
        W_act = dy.parameter(self.pW_act)
        b_act = dy.parameter(self.pb_act)

        # encoder is a stack which pops lemma characters
        # and their representations from the top
        encoder.transduce(lemma_enc, lemma)

        action_history = [BEGIN_WORD]
        word = []
        losses = []
        count = 0

        if debug_mode:
            print()
            if oracle_actions:
                print(action2string(oracle_actions, self.vocab))
            print(lemma2string(lemma, self.vocab))

        while len(action_history) <= MAX_ACTION_SEQ_LEN:

            # what is at the top of encoder?
            encoder_embedding, char_enc = encoder.embedding(extra=True)

            if debug_mode:
                print('Action history: ', action_history,
                      action2string(action_history, self.vocab))
                print('Encoder length: ', len(encoder))
                print('Current char: ', char_enc,
                      lemma2string([char_enc], self.vocab))
                print('Word so far: ', u''.join(word))

            # decoder
            decoder_input = dy.concatenate([
                encoder_embedding, features,
                self.ACT_LOOKUP[action_history[-1]]
            ])
            decoder = decoder.add_input(decoder_input)
            decoder_output = decoder.output()
            # classifier
            if self.MLP_DIM:
                h = self.NONLIN(W_s2h * decoder_output + b_s2h)
            else:
                h = decoder_output

            valid_actions = _valid_actions(encoder)
            log_probs = dy.log_softmax(W_act * h + b_act, valid_actions)

            if oracle_actions is None:
                if sampling:
                    dist = np.exp(log_probs.npvalue())
                    # sample according to softmax
                    rand = np.random.rand()
                    for action, p in enumerate(dist):
                        rand -= p
                        if rand <= 0: break
                else:
                    action = np.argmax(log_probs.npvalue())
            else:
                action = oracle_actions.pop()

            losses.append(dy.pick(log_probs, action))
            action_history.append(action)

            if action == STEP:
                # Delete action
                encoder.pop()

            elif action == END_WORD:
                # Finish transduction
                break
            else:
                # Insert action
                char_ = self.vocab.act.i2w[action]
                assert action in self.INSERTS, (char_,
                                                action2string([char_], self.vocab),
                                                self.INSERTS)
                word.append(char_)

        word = u''.join(word)
        return losses, word, action_history
Example #37
 def transform(self, input_expr):
     W1 = dy.parameter(self.embeddings)
     b1 = dy.parameter(self.bias)
     return dy.affine_transform([b1, W1, input_expr])
Example #38
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """
    transduce the sequence

    Args:
      expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
      expression sequence
    """

        Wq, Wk, Wv, Wo = [
            dy.parameter(x) for x in (self.pWq, self.pWk, self.pWv, self.pWo)
        ]
        bq, bk, bv, bo = [
            dy.parameter(x) for x in (self.pbq, self.pbk, self.pbv, self.pbo)
        ]

        # Start with a [(length, model_size) x batch] tensor
        x = expr_seq.as_transposed_tensor()
        x_len = x.dim()[0][0]
        x_batch = x.dim()[1]
        # Get the query key and value vectors
        # TODO: do we need bias broadcasting in DyNet?
        # q = dy.affine_transform([bq, x, Wq])
        # k = dy.affine_transform([bk, x, Wk])
        # v = dy.affine_transform([bv, x, Wv])
        q = bq + x * Wq
        k = bk + x * Wk
        v = bv + x * Wv

        # Split to batches [(length, head_dim) x batch * num_heads] tensor
        q, k, v = [
            dy.reshape(x, (x_len, self.head_dim),
                       batch_size=x_batch * self.num_heads) for x in (q, k, v)
        ]

        # Do scaled dot product [(length, length) x batch * num_heads], rows are queries, columns are keys
        attn_score = q * dy.transpose(k) / sqrt(self.head_dim)
        if expr_seq.mask is not None:
            mask = dy.inputTensor(np.repeat(
                expr_seq.mask.np_arr, self.num_heads, axis=0).transpose(),
                                  batched=True) * -1e10
            attn_score = attn_score + mask
        attn_prob = dy.softmax(attn_score, d=1)
        if self.train and self.dropout > 0.0:
            attn_prob = dy.dropout(attn_prob, self.dropout)
        # Reduce using attention and resize to match [(length, model_size) x batch]
        o = dy.reshape(attn_prob * v, (x_len, self.input_dim),
                       batch_size=x_batch)
        # Final transformation
        # o = dy.affine_transform([bo, attn_prob * v, Wo])
        o = bo + o * Wo

        expr_seq = expression_seqs.ExpressionSequence(expr_transposed_tensor=o,
                                                      mask=expr_seq.mask)

        self._final_states = [
            transducers.FinalTransducerState(expr_seq[-1], None)
        ]

        return expr_seq
Example #39
    def __step_batch(self, batch):
        dy.renew_cg()

        W_s = dy.parameter(self.W_s)
        b_s = dy.parameter(self.b_s)
        W_y = dy.parameter(self.W_y)
        b_y = dy.parameter(self.b_y)
        W_m = dy.parameter(self.W_m)
        b_m = dy.parameter(self.b_m)
        W1_att_f = dy.parameter(self.W1_att_f)
        w2_att = dy.parameter(self.w2_att)

        src_batch = [x[0] for x in batch]
        tgt_batch = [x[1] for x in batch]
        batch_size = len(src_batch)

        attended_batch = []
        for src_sent in src_batch:
            attended = []
            c_t_sense = dy.vecInput(self.embed_size)
            sense_start = dy.concatenate([
                self.lookup_frozen(self.src_lookup,
                                   self.src_token_to_id['<S>'][0]),
                dy.tanh(c_t_sense)
            ])
            sense_state = self.sense_builder.initial_state().add_input(
                sense_start)

            for cw in src_sent:
                cw_sense_ids = self.src_token_to_id[cw]
                cw_senses = [
                    self.lookup_frozen(self.src_lookup, sense_id)
                    for sense_id in cw_sense_ids
                ]
                h_senses = dy.concatenate_cols(cw_senses)
                h_m = sense_state.output()
                c_t_sense = self.__sense_attention_mlp(h_senses, h_m)
                sense_state = sense_state.add_input(
                    dy.concatenate([c_t_sense, dy.tanh(c_t_sense)]))
                attended.append(c_t_sense)

            attended_batch.append(attended)
        attended_batch_rev = [list(reversed(sent)) for sent in attended_batch]

        # Encoder
        src_cws_l2r = []
        src_cws_r2l = []
        src_len = [len(sent) for sent in attended_batch]
        max_src_len = np.max(src_len)

        for i in range(max_src_len):
            src_cws_l2r.append([sent[i] for sent in attended_batch])
            src_cws_r2l.append([sent[i] for sent in attended_batch_rev])

        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []
        for i, (cws_l2r, cws_r2l) in enumerate(zip(src_cws_l2r, src_cws_r2l)):
            l2r_batch = dy.reshape(dy.concatenate_cols(cws_l2r),
                                   (self.embed_size, ),
                                   batch_size=batch_size)
            l2r_state = l2r_state.add_input(l2r_batch)
            r2l_batch = dy.reshape(dy.concatenate_cols(cws_r2l),
                                   (self.embed_size, ),
                                   batch_size=batch_size)
            r2l_state = r2l_state.add_input(r2l_batch)
            l2r_contexts.append(l2r_state.output())
            r2l_contexts.append(r2l_state.output())
        r2l_contexts.reverse()

        h_fs = []
        for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
            h_fs.append(dy.concatenate([l2r_i, r2l_i]))
        h_fs_matrix = dy.concatenate_cols(h_fs)
        fixed_attentional_component = W1_att_f * h_fs_matrix

        losses = []
        num_words = 0

        # Decoder
        tgt_cws = []
        tgt_len = [len(sent) for sent in tgt_batch]
        max_tgt_len = np.max(tgt_len)
        masks = []

        for i in range(max_tgt_len):
            tgt_cws.append([
                self.tgt_token_to_id[sent[i]]
                if len(sent) > i else self.tgt_token_to_id['</S>']
                for sent in tgt_batch
            ])
            mask = [(1 if len(sent) > i else 0) for sent in tgt_batch]
            masks.append(mask)
            num_words += sum(mask)

        c_t = dy.vecInput(self.hidden_size * 2)
        start_state = dy.affine_transform([b_s, W_s, h_fs[-1]])
        dec_state = self.word_dec_builder.initial_state().set_s(
            [start_state, dy.tanh(start_state)])
        for i, (cws, nws, mask) in enumerate(zip(tgt_cws, tgt_cws[1:], masks)):
            embed_t = dy.lookup_batch(self.tgt_lookup, cws)
            x_t = dy.concatenate([embed_t, c_t])
            dec_state = dec_state.add_input(x_t)
            h_e = dec_state.output()
            c_t = self.__word_attention_mlp(h_fs_matrix, h_e,
                                            fixed_attentional_component)
            m_t = dy.tanh(
                dy.affine_transform([b_m, W_m,
                                     dy.concatenate([h_e, c_t])]))
            y_star = dy.affine_transform([b_y, W_y, m_t])
            loss = dy.pickneglogsoftmax_batch(y_star, nws)
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1, ), len(batch))
            mask_loss = loss * mask_expr
            losses.append(mask_loss)

        return dy.sum_batches(dy.esum(losses)), num_words

    def translate_sentence(self, sent):
        dy.renew_cg()

        W_s = dy.parameter(self.W_s)
        b_s = dy.parameter(self.b_s)
        W_y = dy.parameter(self.W_y)
        b_y = dy.parameter(self.b_y)
        W_m = dy.parameter(self.W_m)
        b_m = dy.parameter(self.b_m)
        W1_att_f = dy.parameter(self.W1_att_f)
        w2_att = dy.parameter(self.w2_att)

        # Sense-level attention
        attended = []
        c_t_sense = dy.vecInput(self.embed_size)
        sense_start = dy.concatenate([
            self.lookup_frozen(self.src_lookup,
                               self.src_token_to_id['<S>'][0]), c_t_sense
        ])
        sense_state = self.sense_builder.initial_state().add_input(sense_start)
        for cw in sent:
            cw_sense_ids = self.src_token_to_id[cw]
            cw_senses = [
                self.lookup_frozen(self.src_lookup, sense_id)
                for sense_id in cw_sense_ids
            ]
            h_senses = dy.concatenate_cols(cw_senses)
            h_m = sense_state.output()
            c_t_sense = self.__sense_attention_mlp(h_senses, h_m)
            sense_state = sense_state.add_input(
                dy.concatenate([c_t_sense, dy.tanh(c_t_sense)]))
            attended.append(c_t_sense)

        attended_rev = list(reversed(attended))

        # Bidirectional representations
        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []
        for (cw_l2r, cw_r2l) in zip(attended, attended_rev):
            l2r_state = l2r_state.add_input(cw_l2r)
            r2l_state = r2l_state.add_input(cw_r2l)
            l2r_contexts.append(l2r_state.output())
            r2l_contexts.append(r2l_state.output())
        r2l_contexts.reverse()

        h_fs = []
        for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
            h_fs.append(dy.concatenate([l2r_i, r2l_i]))
        h_fs_matrix = dy.concatenate_cols(h_fs)
        fixed_attentional_component = W1_att_f * h_fs_matrix

        # Decoder
        trans_sentence = ['<S>']
        cw = trans_sentence[-1]
        c_t = dy.vecInput(self.hidden_size * 2)
        start_state = dy.affine_transform([b_s, W_s, h_fs[-1]])
        dec_state = self.word_dec_builder.initial_state().set_s(
            [start_state, dy.tanh(start_state)])
        while len(trans_sentence) < self.max_len:
            embed_t = dy.lookup(self.tgt_lookup, self.tgt_token_to_id[cw])
            x_t = dy.concatenate([embed_t, c_t])
            dec_state = dec_state.add_input(x_t)
            h_e = dec_state.output()
            c_t = self.__word_attention_mlp(h_fs_matrix, h_e,
                                            fixed_attentional_component)
            m_t = dy.tanh(
                dy.affine_transform([b_m, W_m,
                                     dy.concatenate([h_e, c_t])]))
            y_star = dy.affine_transform([b_y, W_y, m_t])
            p = dy.softmax(y_star)
            cw = self.tgt_id_to_token[np.argmax(p.vec_value())]
            if cw == '</S>':
                break
            trans_sentence.append(cw)

        return ' '.join(trans_sentence[1:])

    def __step(self, instance):
        dy.renew_cg()

        W_s = dy.parameter(self.W_s)
        b_s = dy.parameter(self.b_s)
        W_y = dy.parameter(self.W_y)
        b_y = dy.parameter(self.b_y)
        W_m = dy.parameter(self.W_m)
        b_m = dy.parameter(self.b_m)
        W1_att_f = dy.parameter(self.W1_att_f)
        W1_att_e = dy.parameter(self.W1_att_e)
        w2_att = dy.parameter(self.w2_att)

        src_sent, tgt_sent = instance

        # Sense-level attention
        attended = []
        c_t_sense = dy.vecInput(self.embed_size)
        sense_start = dy.concatenate([
            self.lookup_frozen(self.src_lookup,
                               self.src_token_to_id['<S>'][0]),
            dy.tanh(c_t_sense)
        ])
        sense_state = self.sense_builder.initial_state().add_input(sense_start)
        for cw in src_sent:
            cw_sense_ids = self.src_token_to_id[cw]
            cw_senses = [
                self.lookup_frozen(self.src_lookup, sense_id)
                for sense_id in cw_sense_ids
            ]
            h_senses = dy.concatenate_cols(cw_senses)
            h_m = sense_state.output()
            c_t_sense = self.__sense_attention_mlp(h_senses, h_m)
            sense_state = sense_state.add_input(
                dy.concatenate([c_t_sense, dy.tanh(c_t_sense)]))
            attended.append(c_t_sense)

        attended_rev = list(reversed(attended))

        # Bidirectional representations
        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []
        for (cw_l2r, cw_r2l) in zip(attended, attended_rev):
            l2r_state = l2r_state.add_input(cw_l2r)
            r2l_state = r2l_state.add_input(cw_r2l)
            l2r_contexts.append(l2r_state.output())
            r2l_contexts.append(r2l_state.output())
        r2l_contexts.reverse()

        h_fs = []
        for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
            h_fs.append(dy.concatenate([l2r_i, r2l_i]))
        h_fs_matrix = dy.concatenate_cols(h_fs)
        fixed_attentional_component = W1_att_f * h_fs_matrix

        losses = []
        num_words = 0

        # Decoder
        c_t = dy.vecInput(self.hidden_size * 2)
        start_state = dy.affine_transform([b_s, W_s, h_fs[-1]])
        dec_state = self.word_dec_builder.initial_state().set_s(
            [start_state, dy.tanh(start_state)])
        for (cw, nw) in zip(tgt_sent, tgt_sent[1:]):
            embed_t = dy.lookup(self.tgt_lookup, self.tgt_token_to_id[cw])
            x_t = dy.concatenate([embed_t, c_t])
            dec_state = dec_state.add_input(x_t)
            h_e = dec_state.output()
            c_t = self.__word_attention_mlp(h_fs_matrix, h_e,
                                            fixed_attentional_component)
            m_t = dy.tanh(
                dy.affine_transform([b_m, W_m,
                                     dy.concatenate([h_e, c_t])]))
            y_star = dy.affine_transform([b_y, W_y, m_t])
            loss = dy.pickneglogsoftmax(y_star, self.tgt_token_to_id[nw])
            losses.append(loss)
            num_words += 1

        return dy.esum(losses), num_words
Exemplo n.º 42
0
    def generate(self, s_sentence, max_len=150):

        dy.renew_cg()

        global beam_size

        W_y = dy.parameter(self.params["W_y"])
        b_y = dy.parameter(self.params["b_y"])
        s_lookup = self.params["s_lookup"]
        t_lookup = self.params["t_lookup"]

        s_sentence = [self.s_vocab[EOS]] + s_sentence + [self.s_vocab[EOS]]
        s_sentence_rev = list(reversed(s_sentence))

        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []

        for cw_l2r in s_sentence:
            l2r_state = l2r_state.add_input(s_lookup[cw_l2r])
            l2r_contexts.append(l2r_state.output())

        for cw_r2l in s_sentence_rev:
            r2l_state = r2l_state.add_input(s_lookup[cw_r2l])
            r2l_contexts.append(r2l_state.output())

        r2l_contexts.reverse()

        H_f = [dy.concatenate(list(p)) for p in zip(l2r_contexts, r2l_contexts)]

        H_f_mat = dy.concatenate_cols(H_f)
        W1_att = dy.parameter(self.params["W1_att"])
        w1dt = W1_att * H_f_mat

        c_t_init = dy.vecInput(2*self.HIDDEN_DIM)
        # c_t = dy.concatenate([l2r_contexts[-1], r2l_contexts[-1]])

        dec_state_init = self.dec_builder.initial_state()

        possible_list = {("<EOS>", dec_state_init, c_t_init): 0.0}

        for i in range(len(s_sentence)*2):
            t_list = {}

            count_eos = 0

            # .items() rather than .iteritems() so this also runs on Python 3
            for (poss, dec_state, c_t), prob in possible_list.items():
                spl_poss = poss.split(' ')

                if i > 1 and spl_poss[-1] == "<EOS>":
                    count_eos += 1
                    t_list[(poss, dec_state, c_t)] = prob
                    continue

                embedding = t_lookup[self.t_vocab[spl_poss[-1]]]

                x_t = dy.concatenate([c_t, embedding])
                dec_state = dec_state.add_input(x_t)
                c_t = self.attend(H_f_mat, dec_state, w1dt, len(s_sentence), 1)
                probs = dy.softmax(W_y*dy.concatenate([c_t, dec_state.output()]) + b_y).vec_value()

                inds = self.list_nlargest(probs, beam_size)

                for ind in inds:
                    sent = poss + " " + self.t_id_lookup[ind]
                    sent_prob = prob + math.log(probs[ind])

                    # lp = (5 + len(sent.split()))/(5+1)

                    # sent_prob = sent_prob/pow(lp, alpha)

                    t_list[(sent, dec_state, c_t)] = sent_prob

            if count_eos == beam_size:
                break

            possible_list = {}

            for tup in self.dict_nlargest(t_list, beam_size):
                possible_list[tup] = t_list[tup]

        final_sent = self.dict_nlargest(possible_list, 1)[0][0]
        return " ".join(final_sent.replace("<EOS>", "").strip().split())
Exemplo n.º 43
0
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        if expr_seq.dim()[1] > 1:
            raise ValueError(
                f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.dim()[1]}"
            )
        lattice = self.cur_src[0]
        Wx_iog = dy.parameter(self.p_Wx_iog)
        Wh_iog = dy.parameter(self.p_Wh_iog)
        b_iog = dy.parameter(self.p_b_iog)
        Wx_f = dy.parameter(self.p_Wx_f)
        Wh_f = dy.parameter(self.p_Wh_f)
        b_f = dy.parameter(self.p_b_f)
        h = {}
        c = {}
        h_list = []

        batch_size = expr_seq.dim()[1]
        if self.dropout_rate > 0.0 and self.train:
            self.set_dropout_masks(batch_size=batch_size)

        for i, cur_node_id in enumerate(lattice.graph.topo_sort()):
            prev_node = lattice.graph.predecessors(cur_node_id)
            val = expr_seq[i]
            if self.dropout_rate > 0.0 and self.train:
                val = dy.cmult(val, self.dropout_mask_x)
            i_ft_list = []
            if len(prev_node) == 0:
                tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
            else:
                h_tilde = sum(h[pred] for pred in prev_node)
                tmp_iog = dy.affine_transform(
                    [b_iog, Wx_iog, val, Wh_iog, h_tilde])
                for pred in prev_node:
                    i_ft_list.append(
                        dy.logistic(
                            dy.affine_transform(
                                [b_f, Wx_f, val, Wh_f, h[pred]])))
            i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
            i_aot = dy.pick_range(tmp_iog, self.hidden_dim,
                                  self.hidden_dim * 2)
            i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2,
                                  self.hidden_dim * 3)

            i_it = dy.logistic(i_ait)
            i_ot = dy.logistic(i_aot)
            i_gt = dy.tanh(i_agt)
            if len(prev_node) == 0:
                c[cur_node_id] = dy.cmult(i_it, i_gt)
            else:
                fc = dy.cmult(i_ft_list[0], c[prev_node[0]])
                for j in range(1, len(prev_node)):
                    fc += dy.cmult(i_ft_list[j], c[prev_node[j]])
                c[cur_node_id] = fc + dy.cmult(i_it, i_gt)
            h_t = dy.cmult(i_ot, dy.tanh(c[cur_node_id]))
            if self.dropout_rate > 0.0 and self.train:
                h_t = dy.cmult(h_t, self.dropout_mask_h)
            h[cur_node_id] = h_t
            h_list.append(h_t)
        self._final_states = [
            transducers.FinalTransducerState(h_list[-1], h_list[-1])
        ]
        return expression_seqs.ExpressionSequence(expr_list=h_list)
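
    # Aside (not from the original example): a toy sketch of the recurrence
    # pattern above, with a hypothetical predecessor map. Nodes are processed
    # in topological order, and a node with several predecessors sums their
    # hidden states before the gated update:
    #
    #   preds = {0: [], 1: [0], 2: [0], 3: [1, 2]}   # node -> predecessors
    #   h = {}
    #   for node in (0, 1, 2, 3):                    # a topological order
    #       h_tilde = sum(h[p] for p in preds[node]) if preds[node] else None
    #       h[node] = lstm_update(x[node], h_tilde)  # stand-in for the gates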
Exemplo n.º 44
0
    def _decoder_input_embedding(self,
                                 rnn_state,
                                 previous_triple,
                                 encoded_string,
                                 enc_state,
                                 encoded_history,
                                 training=False,
                                 initial_state=None):
        attention_vecs = {}

        # Compute attention over encoded string.
        utterance_attn, utterance_dist = attend(encoded_string,
                                                rnn_state.h()[-1],
                                                dy.parameter(self._utterance_attention_w),
                                                self._dropout if training else 0.)
        attention_vecs['utterance'] = utterance_dist

        # Key for state and history attention.
        attn_key = dy.concatenate([utterance_attn, rnn_state.h()[-1]])
        if training:
            attn_key = dy.dropout(attn_key, self._dropout)

        # Attend on history using current state and utterance attention.
        history_attn, history_dist = attend(encoded_history,
                                            attn_key,
                                            dy.parameter(self._history_attention_w),
                                            self._dropout if training else 0.)
        attention_vecs['history'] = history_dist

        # Attend on state.
        state_attn, state_dist = attend(enc_state,
                                        attn_key,
                                        dy.parameter(self._state_attention_w),
                                        self._dropout if training else 0.)
        state_attn2, state_dist2 = attend(enc_state,
                                          attn_key,
                                          dy.parameter(self._state_attention_w2),
                                          self._dropout if training else 0.)
        attention_vecs['state_1'] = state_dist
        attention_vecs['state_2'] = state_dist2

        # Compute previous embedding
        prev_emb = self._embed_predicted_triple(previous_triple)

        # Concatenate with history and state, and mix with a feed-forward
        # layer.
        situated_embedding = dy.concatenate([utterance_attn,
                                             history_attn,
                                             state_attn,
                                             state_attn2,
                                             prev_emb])

        # Attend on initial state (if provided)
        if self.args.feed_updated_state and self.args.always_initial_state:
            if not initial_state:
                raise ValueError("Encoding the initial state but it was not provided.")
            initial_attn, initial_dist = attend(initial_state,
                                                attn_key,
                                                dy.parameter(self._state_attention_winitial),
                                                self._dropout if training else 0.)
            initial_attn2, initial_dist2 = attend(initial_state,
                                                  attn_key,
                                                  dy.parameter(self._state_attention_winitial2),
                                                  self._dropout if training else 0.)
            attention_vecs['initial_1'] = initial_dist
            attention_vecs['initial_2'] = initial_dist2

            situated_embedding = dy.concatenate([situated_embedding,
                                                 initial_attn,
                                                 initial_attn2])

        # Situated embedding mixing parameters.
        weights = dy.parameter(self._situated_w)
        biases = dy.parameter(self._situated_b)

        situated_embedding = dy.tanh(weights * situated_embedding + biases)

        return situated_embedding, attention_vecs
Exemplo n.º 45
0
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    # get the outputs of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()]) for x, y in
                   LSTM_SRC.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])]
    src_output = src_outputs[-1]

    # gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    # now decode
    all_losses = []

    # Decoder
    # need to mask padding at end of sentence

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the previous target words into the decoder state
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        att_output, _ = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
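
# Aside (not from the original example): a minimal standalone sketch of the
# masking idiom used above. Multiplying the batched per-step loss by a 0/1
# mask zeroes out the loss at padded positions before summing over the batch.
def _mask_demo():
    dy.renew_cg()
    step_loss = dy.inputTensor(np.array([[0.7, 1.2, 0.4]]), batched=True)  # one loss per sentence
    mask_expr = dy.reshape(dy.inputVector([1, 1, 0]), (1,), 3)             # third sentence is padding
    total = dy.sum_batches(step_loss * mask_expr)
    return total.value()  # 1.9: the masked third sentence contributes nothing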
Exemplo n.º 46
0
    def embed_sent(self, sent_len):
        embeddings = dy.strided_select(dy.parameter(self.embeddings), [1, 1],
                                       [0, 0], [self.emb_dim, sent_len])
        return expression_seqs.ExpressionSequence(expr_tensor=embeddings,
                                                  mask=None)
Exemplo n.º 47
0
def one_word_loss(model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn, lemma, feats, word, alphabet_index, aligned_pair,
                  feat_index, feature_types):
    pc.renew_cg()

    # read the parameters
    # char_lookup = model["char_lookup"]
    # feat_lookup = model["feat_lookup"]
    # R = pc.parameter(model["R"])
    # bias = pc.parameter(model["bias"])
    R = pc.parameter(R)
    bias = pc.parameter(bias)

    padded_lemma = BEGIN_WORD + lemma + END_WORD

    # convert characters to matching embeddings
    lemma_char_vecs = encode_lemma(alphabet_index, char_lookup, padded_lemma)

    # convert features to matching embeddings, if UNK handle properly
    feat_vecs = encode_feats(feat_index, feat_lookup, feats, feature_types)

    feats_input = pc.concatenate(feat_vecs)

    blstm_outputs = bilstm_transduce(encoder_frnn, encoder_rrnn, lemma_char_vecs)

    # initialize the decoder rnn
    s_0 = decoder_rnn.initial_state()
    s = s_0

    # set prev_output_vec for first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    loss = []

    # i is input index, j is output index
    i = 0
    j = 0

    # go through alignments, progress j when new output is introduced, progress i when new char is seen on lemma (no ~)
    aligned_lemma, aligned_word = aligned_pair
    aligned_lemma += END_WORD
    aligned_word += END_WORD

    # run through the alignments
    for align_index, (input_char, output_char) in enumerate(zip(aligned_lemma, aligned_word)):
        possible_outputs = []

        # feedback, blstm[i], feats
        decoder_input = pc.concatenate([prev_output_vec, blstm_outputs[i], feats_input])

        # if reached the end word symbol
        if output_char == END_WORD:
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)

            # compute local loss
            loss.append(-pc.log(pc.pick(probs, alphabet_index[END_WORD])))
            continue

        # initially, if there is no prefix in the output (shouldn't delay on current input), step forward
        # TODO: check if can remove this condition entirely by adding '<' to both the aligned lemma/word
        if padded_lemma[i] == BEGIN_WORD and aligned_lemma[align_index] != ALIGN_SYMBOL:

            # perform rnn step
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)

            # compute local loss
            loss.append(-pc.log(pc.pick(probs, alphabet_index[STEP])))

            # prepare for the next iteration - "feedback"
            prev_output_vec = char_lookup[alphabet_index[STEP]]
            prev_char_vec = char_lookup[alphabet_index[EPSILON]]
            i += 1

        # if 0-to-1 or 1-to-1 alignment, compute loss for predicting the output symbol
        if aligned_word[align_index] != ALIGN_SYMBOL:
            decoder_input = pc.concatenate([prev_output_vec, blstm_outputs[i], feats_input])

            # feed new input to decoder
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)

            if aligned_word[align_index] in alphabet_index:
                current_loss = -pc.log(pc.pick(probs, alphabet_index[aligned_word[align_index]]))

                # prepare for the next iteration - "feedback"
                prev_output_vec = char_lookup[alphabet_index[aligned_word[align_index]]]
            else:
                current_loss = -pc.log(pc.pick(probs, alphabet_index[UNK]))

                # prepare for the next iteration - "feedback"
                prev_output_vec = char_lookup[alphabet_index[UNK]]
            loss.append(current_loss)

            j += 1

        # if the input's not done and the next is not a 0-to-1 alignment, perform step
        if i < len(padded_lemma) - 1 and aligned_lemma[align_index + 1] != ALIGN_SYMBOL:
            # perform rnn step
            # feedback, blstm[i], feats
            decoder_input = pc.concatenate([prev_output_vec, blstm_outputs[i], feats_input])

            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)

            # compute local loss for the step action
            loss.append(-pc.log(pc.pick(probs, alphabet_index[STEP])))

            # prepare for the next iteration - "feedback"
            prev_output_vec = char_lookup[alphabet_index[STEP]]
            i += 1

    # loss = esum(loss)
    loss = pc.average(loss)

    return loss
Exemplo n.º 48
0
def calc_scores(words):
    dy.renew_cg()
    b_sm_exp = dy.parameter(b_sm)
    score = dy.esum([dy.lookup(W_sm, x) for x in words])
    return score + b_sm_exp
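
# Aside (not from the original example): calc_scores would typically be paired
# with a softmax loss for training, along these lines (names hypothetical):
#
#   loss = dy.pickneglogsoftmax(calc_scores(words), tag)
#   loss.backward()
#   trainer.update()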
Exemplo n.º 49
0
    def get_output(self, input, h_start=None, c_start=None):
        '''
        apply the LSTM to a matrix or list of vectors
        
        input - a list of vectors of dimension d
        h_start - optional start state for continuation (default is to start at the beginning with h_0)
        c_start - optional start cell for continuation (default is to start at the beginning with c_0)
        '''

        W_f = parameter(self.W_f)
        U_f = parameter(self.U_f)
        b_f = parameter(self.b_f)

        W_i = parameter(self.W_i)
        U_i = parameter(self.U_i)
        b_i = parameter(self.b_i)

        W_o = parameter(self.W_o)
        U_o = parameter(self.U_o)
        b_o = parameter(self.b_o)

        W_c = parameter(self.W_c)
        U_c = parameter(self.U_c)
        b_c = parameter(self.b_c)

        if h_start is None:
            h_0 = parameter(self.h_0)
        else:
            h_0 = h_start
        if c_start is None:
            c_0 = parameter(self.c_0)
        else:
            c_0 = c_start

        # LSTM recurrence; iterate right-to-left when self.reverse is set
        result = []
        hidden = []
        cell = []
        h_t = h_0
        c_t = c_0
        indices = range(len(input) - 1, -1, -1) if self.reverse else range(len(input))
        for i in indices:
            f_t = logistic(W_f * input[i] + U_f * h_t + b_f)
            i_t = logistic(W_i * input[i] + U_i * h_t + b_i)
            o_t = logistic(W_o * input[i] + U_o * h_t + b_o)
            c_t = cmult(f_t, c_t) + cmult(
                i_t, tanh(W_c * input[i] + U_c * h_t + b_c))
            h_t = cmult(o_t, tanh(c_t))
            hidden.append(h_t)
            cell.append(c_t)
        result.append(hidden)
        result.append(cell)
        return result
Exemplo n.º 50
0
    def cal_scores(self, src_encodings, predict=False):

        src_len = len(src_encodings)
        src_encodings = dy.concatenate_cols(
            src_encodings)  # src_ctx_dim, src_len, batch_size
        batch_size = src_encodings.dim()[1]

        W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head)
        b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head)
        W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep)
        b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep)

        W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head)
        b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head)
        W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep)
        b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep)

        U_arc_1 = dy.parameter(self.U_arc_1)
        u_arc_2 = dy.parameter(self.u_arc_2)

        U_label_1 = [dy.parameter(x) for x in self.U_label_1]
        u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1]
        u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2]
        b_label = [dy.parameter(x) for x in self.b_label]

        if predict:
            h_arc_head = self.leaky_ReLu(
                dy.affine_transform([
                    b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings
                ]))  # n_arc_ml_units, src_len, bs
            h_arc_dep = self.leaky_ReLu(
                dy.affine_transform(
                    [b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings]))
            h_label_head = self.leaky_ReLu(
                dy.affine_transform([
                    b_label_hidden_to_head, W_label_hidden_to_head,
                    src_encodings
                ]))
            h_label_dep = self.leaky_ReLu(
                dy.affine_transform([
                    b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings
                ]))
        else:

            src_encodings = dy.dropout_dim(src_encodings, 1,
                                           self.arc_mlp_dropout)

            h_arc_head = dy.dropout_dim(
                self.leaky_ReLu(
                    dy.affine_transform([
                        b_arc_hidden_to_head, W_arc_hidden_to_head,
                        src_encodings
                    ])), 1,
                self.arc_mlp_dropout)  # n_arc_ml_units, src_len, bs
            h_arc_dep = dy.dropout_dim(
                self.leaky_ReLu(
                    dy.affine_transform([
                        b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings
                    ])), 1, self.arc_mlp_dropout)
            h_label_head = dy.dropout_dim(
                self.leaky_ReLu(
                    dy.affine_transform([
                        b_label_hidden_to_head, W_label_hidden_to_head,
                        src_encodings
                    ])), 1, self.label_mlp_dropout)
            h_label_dep = dy.dropout_dim(
                self.leaky_ReLu(
                    dy.affine_transform([
                        b_label_hidden_to_dep, W_label_hidden_to_dep,
                        src_encodings
                    ])), 1, self.label_mlp_dropout)

        h_arc_head_transpose = dy.transpose(h_arc_head)
        h_label_head_transpose = dy.transpose(h_label_head)

        s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep,
                                                      u_arc_2)

        s_label = []
        for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2,
                                        b_label):
            e1 = h_label_head_transpose * U_1 * h_label_dep
            e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len))
            e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
            s_label.append(e1 + e2 + e3 + b)
        return s_arc, s_label
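
    # Aside (not from the original example): the arc score above is the
    # standard biaffine form. In numpy shapes, with one column per word:
    #
    #   H_head, H_dep: (units, n)      # outputs of the two arc MLPs
    #   U_arc_1:       (units, units)
    #   u_arc_2:       (units, 1)      # head-only bias, colwise-added
    #   s_arc = H_head.T @ (U_arc_1 @ H_dep + u_arc_2)   # shape (n, n)
    #
    # so s_arc[h, d] scores word h as the head of dependent d.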
Exemplo n.º 51
0
    def build_tagging_graph(self, words, ltags):
        # parameters -> expressions
        self.w1 = dy.parameter(self.W1)
        self.b1 = dy.parameter(self.B1)
        self.w2 = dy.parameter(self.W2)
        self.b2 = dy.parameter(self.B2)
        self.xw1 = dy.parameter(self.xW1)
        self.xb1 = dy.parameter(self.xB1)
        self.xw2 = dy.parameter(self.xW2)
        self.xb2 = dy.parameter(self.xB2)

        # apply dropout
        if self.eval:
            self.disable_dropout()
        else:
            self.enable_dropout()

        # initialize the RNNs
        f_init = self.fwdRNN.initial_state()
        b_init = self.bwdRNN.initial_state()
        f2_init = self.fwdRNN2.initial_state()
        b2_init = self.bwdRNN2.initial_state()

        self.hcf_init = self.hcfwdRNN.initial_state()
        self.hcb_init = self.hcbwdRNN.initial_state()

        self.ecf_init = self.ecfwdRNN.initial_state()
        self.ecb_init = self.ecbwdRNN.initial_state()

        xf_init = self.xfwdRNN.initial_state()
        xb_init = self.xbwdRNN.initial_state()
        xf2_init = self.xfwdRNN2.initial_state()
        xb2_init = self.xbwdRNN2.initial_state()

        self.xhcf_init = self.xhcfwdRNN.initial_state()
        self.xhcb_init = self.xhcbwdRNN.initial_state()

        self.xecf_init = self.xecfwdRNN.initial_state()
        self.xecb_init = self.xecbwdRNN.initial_state()

        # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word.
        wembs = [self.word_rep(w, l) for w, l in zip(words, ltags)]
        cembs = [
            self.char_rep(w, self.hcf_init, self.hcb_init, self.ecf_init,
                          self.ecb_init, l) for w, l in zip(words, ltags)
        ]
        xembs = [dy.concatenate([w, c]) for w, c in zip(wembs, cembs)]

        # feed word vectors into biLSTM
        fw_exps = f_init.transduce(xembs)
        bw_exps = b_init.transduce(reversed(xembs))

        # biLSTM states
        bi_exps = [
            dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))
        ]

        # feed word vectors into biLSTM
        fw_exps = f2_init.transduce(bi_exps)
        bw_exps = b2_init.transduce(reversed(bi_exps))

        # biLSTM states
        bi_exps = [
            dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))
        ]

        # feed each biLSTM state to an MLP
        pos_hidden = []
        for xi in bi_exps:
            xh = self.w1 * xi
            #xh = self.meta.activation(xh) + self.b1
            pos_hidden.append(xh)

        cembs = [
            self.char_rep(w, self.xhcf_init, self.xhcb_init, self.xecf_init,
                          self.xecb_init, l) for w, l in zip(words, ltags)
        ]
        xembs = [
            dy.concatenate([w, c, p])
            for w, c, p in zip(wembs, cembs, pos_hidden)
        ]
        xfw_exps = xf_init.transduce(xembs)
        xbw_exps = xb_init.transduce(reversed(xembs))

        # biLSTM states
        bi_exps = [
            dy.concatenate([f, b])
            for f, b in zip(xfw_exps, reversed(xbw_exps))
        ]

        # feed word vectors into biLSTM
        fw_exps = xf2_init.transduce(bi_exps)
        bw_exps = xb2_init.transduce(reversed(bi_exps))

        # biLSTM states
        bi_exps = [
            dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))
        ]

        exps = []
        for xi in bi_exps:
            xh = self.xw1 * xi
            xh = self.meta.activation(xh) + self.xb1
            xo = self.xw2 * xh + self.xb2
            exps.append(xo)

        return exps
Exemplo n.º 52
0
def predict_output_sequence(model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn, lemma, feats, alphabet_index,
                            inverse_alphabet_index, feat_index, feature_types):
    pc.renew_cg()

    # read the parameters
    # char_lookup = model["char_lookup"]
    # feat_lookup = model["feat_lookup"]
    # R = pc.parameter(model["R"])
    # bias = pc.parameter(model["bias"])
    R = pc.parameter(R)
    bias = pc.parameter(bias)

    # convert characters to matching embeddings, if UNK handle properly
    padded_lemma = BEGIN_WORD + lemma + END_WORD
    lemma_char_vecs = encode_lemma(alphabet_index, char_lookup, padded_lemma)

    # convert features to matching embeddings, if UNK handle properly
    feat_vecs = encode_feats(feat_index, feat_lookup, feats, feature_types)

    feats_input = pc.concatenate(feat_vecs)

    blstm_outputs = bilstm_transduce(encoder_frnn, encoder_rrnn, lemma_char_vecs)

    # initialize the decoder rnn
    s_0 = decoder_rnn.initial_state()
    s = s_0

    # set prev_output_vec for first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]

    # i is input index, j is output index
    i = 0
    num_outputs = 0
    predicted_output_sequence = []

    # run the decoder through the sequence and predict characters; allow up to
    # three times the max prediction length, since step actions are interleaved with outputs
    while num_outputs < MAX_PREDICTION_LEN * 3:

        # prepare input vector and perform LSTM step
        decoder_input = pc.concatenate([prev_output_vec,
                                        blstm_outputs[i],
                                        feats_input])

        s = s.add_input(decoder_input)

        # compute softmax probs vector and predict with argmax
        decoder_rnn_output = s.output()
        probs = pc.softmax(R * decoder_rnn_output + bias)
        probs = probs.vec_value()
        predicted_output_index = common.argmax(probs)
        predicted_output = inverse_alphabet_index[predicted_output_index]
        predicted_output_sequence.append(predicted_output)

        # check if step or char output to promote i.
        if predicted_output == STEP:
            if i < len(padded_lemma) - 1:
                i += 1

        num_outputs += 1

        # check if reached end of word
        if predicted_output_sequence[-1] == END_WORD:
            break

        # prepare for the next iteration - "feedback"
        prev_output_vec = char_lookup[predicted_output_index]

    # remove the end word symbol

    return u''.join(predicted_output_sequence[0:-1])
Exemplo n.º 53
0
    def initialize_graph_nodes(self):
        #  convert parameters to expressions
        self.pad = dy.parameter(self.PAD)
        self.ps_W1 = dy.parameter(self.ps_pW1)
        self.ps_b1 = dy.parameter(self.ps_pb1)
        self.ps_W2 = dy.parameter(self.ps_pW2)
        self.ps_b2 = dy.parameter(self.ps_pb2)
        self.pr_W1 = dy.parameter(self.pr_pW1)
        self.pr_b1 = dy.parameter(self.pr_pb1)
        self.pr_W2 = dy.parameter(self.pr_pW2)
        self.pr_b2 = dy.parameter(self.pr_pb2)
        #######################################
        self.xpad = dy.parameter(self.XPAD)
        self.xps_W1 = dy.parameter(self.xps_pW1)
        self.xps_b1 = dy.parameter(self.xps_pb1)
        self.xps_W2 = dy.parameter(self.xps_pW2)
        self.xps_b2 = dy.parameter(self.xps_pb2)
        self.xpr_W1 = dy.parameter(self.xpr_pW1)
        self.xpr_b1 = dy.parameter(self.xpr_pb1)
        self.xpr_W2 = dy.parameter(self.xpr_pW2)
        self.xpr_b2 = dy.parameter(self.xpr_pb2)

        # apply dropout
        if self.eval:
            self.disable_dropout()
        else:
            self.enable_dropout()

        # initialize the RNNs
        self.f_init = self.fwdRNN.initial_state()
        self.b_init = self.bwdRNN.initial_state()

        self.cf_init_eng = self.ecfwdRNN.initial_state()
        self.cb_init_eng = self.ecbwdRNN.initial_state()
        self.cf_init_bh = self.bhcfwdRNN.initial_state()
        self.cb_init_bh = self.bhcbwdRNN.initial_state()
        ################################################
        self.xcf_init_eng = self.xecfwdRNN.initial_state()
        self.xcb_init_eng = self.xecbwdRNN.initial_state()
        self.xcf_init_bh = self.xbhcfwdRNN.initial_state()
        self.xcb_init_bh = self.xbhcbwdRNN.initial_state()

        self.ps_f_init = self.ps_fwdRNN.initial_state()
        self.ps_b_init = self.ps_bwdRNN.initial_state()
        ###############################################
        self.xps_f_init = self.xps_fwdRNN.initial_state()
        self.xps_b_init = self.xps_bwdRNN.initial_state()

        self.pr_f_init = self.pr_fwdRNN.initial_state()
        self.pr_b_init = self.pr_bwdRNN.initial_state()
        ###############################################
        self.xpr_f_init = self.xpr_fwdRNN.initial_state()
        self.xpr_b_init = self.xpr_bwdRNN.initial_state()
Exemplo n.º 54
0
    def translate_sentence_beam(self, sent):
        dy.renew_cg()

        W_y = dy.parameter(self.W_y)
        b_y = dy.parameter(self.b_y)

        sent_rev = list(reversed(sent))
        # Bidirectional representations
        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []
        for (cw_l2r, cw_r2l) in zip(sent, sent_rev):
            l2r_state = l2r_state.add_input(dy.lookup(self.src_lookup, self.src_token_to_id[cw_l2r]))
            r2l_state = r2l_state.add_input(dy.lookup(self.src_lookup, self.src_token_to_id[cw_r2l]))
            l2r_contexts.append(l2r_state.output())
            r2l_contexts.append(r2l_state.output())
        r2l_contexts.reverse()

        h_fs = []
        for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
            h_fs.append(dy.concatenate([l2r_i, r2l_i]))
        h_fs_matrix = dy.concatenate_cols(h_fs)

        # Decoder
        cws = []
        c_t = dy.vecInput(self.hidden_size * 2)
        start = dy.concatenate([dy.lookup(self.tgt_lookup, self.tgt_token_to_id['<S>']), c_t])
        dec_state = self.dec_builder.initial_state().add_input(start)
        sentence_dict = defaultdict(lambda : defaultdict(list))
        end_sign = self.tgt_token_to_id['</S>']
        # first decoding step
        h_e = dec_state.output()
        c_t = self.__attention_mlp(h_fs_matrix, h_e)
        embed_t = dy.lookup(self.tgt_lookup, self.tgt_token_to_id['<S>'])
        x_t = dy.concatenate([embed_t, c_t])
        dec_state = dec_state.add_input(x_t)
        y_star = (b_y + W_y * dec_state.output())
        p = dy.log(dy.softmax(y_star)).npvalue()
        cws = np.argpartition(-p,self.beam_size)[:self.beam_size]
        history_path = p[cws]
        
        trans_iter = 0
        for i in range(self.beam_size):
            sentence_dict[trans_iter][i] = [self.tgt_id_to_token[cws[i]]]
        # if </S> is already among the first beam candidates, the translation is empty
        if end_sign in cws:
            return ''
        while trans_iter < self.max_len:
            h_e = dec_state.output()
            c_t = self.__attention_mlp(h_fs_matrix, h_e)
            embed_t = dy.lookup_batch(self.tgt_lookup, cws)
            x_t = dy.concatenate([embed_t, c_t])
            dec_state = dec_state.add_input(x_t)
            y_stars = dy.log(dy.softmax(b_y + W_y * dec_state.output())).npvalue()
            # add each beam's accumulated log-probability to its expansion scores
            y_stars += history_path
            
            y_star_all = np.reshape(y_stars.T, (self.beam_size * self.tgt_vocab_size,))
            beam_indexes = np.argpartition(-y_star_all, self.beam_size)[:self.beam_size]
            # integer division so the beam indices stay ints under Python 3
            beam_sentence_index = [index // self.tgt_vocab_size for index in beam_indexes]
            cws = [index % self.tgt_vocab_size for index in beam_indexes]
            history_path = y_star_all[beam_indexes]
            trans_iter += 1
            max_score = -np.inf
            find_end_sign = False
            for i,cw in enumerate(cws):
                if cw == end_sign:
                    find_end_sign = True
                    if max_score < history_path[i]:
                        max_score = history_path[i]
                        max_index = i 
            if find_end_sign:
                return ' '.join(sentence_dict[trans_iter-1][beam_sentence_index[max_index]])
            
            for i in range(self.beam_size):
                sentence_dict[trans_iter][i] = sentence_dict[trans_iter - 1][beam_sentence_index[i]] + [self.tgt_id_to_token[cws[i]]]
        # hit max_len without emitting </S>: return the best-scoring hypothesis
        return ' '.join(sentence_dict[trans_iter][np.argmax(history_path)])
Exemplo n.º 55
0
    def _get_probs(self, rnn_output):
        output_w = dy.parameter(self.output_w)
        output_b = dy.parameter(self.output_b)
        probs = dy.softmax(output_w * rnn_output + output_b)
        return probs
Exemplo n.º 56
0
# regular lookup
a = lp[1].npvalue()
b = lp[2].npvalue()
c = lp[3].npvalue()

# batch lookup instead of single elements.
# two ways of doing this.
abc1 = dy.lookup_batch(lp, [1,2,3])
print(abc1.npvalue())

abc2 = lp.batch([1,2,3])
print(abc2.npvalue())

print(np.hstack([a,b,c]))


# use pick and pickneglogsoftmax in batch mode
# (must be used in conjunction with lookup_batch):
print("\nPick")
W = dy.parameter( m.add_parameters((5, 10)) )
h = W * lp.batch([1,2,3])
print(h.npvalue())
print(dy.pick_batch(h,[1,2,3]).npvalue())
print(dy.pick(W*lp[1],1).value(), dy.pick(W*lp[2],2).value(), dy.pick(W*lp[3],3).value())

# using pickneglogsoftmax_batch
print("\nPick neg log softmax")
print((-dy.log(dy.softmax(h))).npvalue())
print(dy.pickneglogsoftmax_batch(h,[1,2,3]).npvalue())
Exemplo n.º 57
0
# number of layers in `RNN`
num_layers = 1

#pretrained embeddings
embedding_dim = emb_matrix_pretrained.shape[1]
embedding_parameters = RNN_model.lookup_parameters_from_numpy(
    emb_matrix_pretrained)

#add RNN unit
RNN_unit = dy.VanillaLSTMBuilder(num_layers, embedding_dim, hidden_size,
                                 RNN_model)

#add projection layer
# W (hidden x num_labels)
pW = RNN_model.add_parameters((hidden_size, len(list(l2i.keys()))))
dy.parameter(pW).npvalue().shape

# b (1 x num_labels)
pb = RNN_model.add_parameters((len(list(l2i.keys()))))
# note: we are just giving one dimension (ignoring the "1" dimension)
# this makes manipulating the shapes in forward_pass() easier
dy.parameter(pb).npvalue().shape

RNN_model.populate("trained.model")

batch_size = 256
num_batches_testing = int(np.ceil(len(test_tokens) / batch_size))
predictions = test()
overall_accuracy, unknown_accuracy = evaluate(predictions, test_labels,
                                              unknown_index)
endtime = datetime.datetime.now()
Exemplo n.º 58
0
trainer = dy.AdamTrainer(model)
trainer.set_clip_threshold(-1.0)
trainer.set_sparse_updates(True if args.SPARSE == 1 else False)

print("startup time: %r" % (time.time() - start))
sents = 0
all_time = 0
for ITER in range(100):
    random.shuffle(train)
    closs = 0.0
    cwords = 0
    start = time.time()
    batch = []
    for i, tree in enumerate(train, 1):
        sents += 1
        W = dy.parameter(W_)
        h, c = builder.expr_for_tree(tree, True)
        nodes = tree.nonterms()
        losses = [dy.pickneglogsoftmax(W * nt._e, l2i[nt.label]) for nt in nodes]
        loss = dy.esum(losses)
        batch.append(loss)
        if len(batch) == 50:
            loss = dy.esum(batch)
            closs += loss.value()
            cwords += len(nodes)
            loss.backward()
            trainer.update()
            batch = []
            dy.renew_cg()
        if sents % 1000 == 0:
            trainer.status()
Exemplo n.º 59
0
    def beam_search_decode(self, lemma, feats, external_cg=True, unk_avg=True, beam_width=4):
        # Returns an expression of the loss for the sequence of actions.
        # (that is, the oracle_actions if present or the predicted sequence otherwise)
        def _valid_actions(encoder):
            valid_actions = []
            if len(encoder) > 1:
                valid_actions += [COPY, DELETE]
            else:
                valid_actions += [END_WORD]
            valid_actions += self.INSERTS
            return valid_actions

        if not external_cg:
            dy.renew_cg()

        # vectorize lemma
        lemma_enc = self._build_lemma(lemma, unk_avg, is_training=False)

        # vectorize features
        features = self._build_features(*feats)
            
        # add encoder and decoder to computation graph
        encoder = Encoder(self.fbuffRNN, self.bbuffRNN)
        decoder = self.wordRNN.initial_state()
        
        # encoder is a stack which pops lemma characters and their
        # representations from the top.
        encoder.transduce(lemma_enc, lemma)

        # add classifier to computation graph
        if self.MLP_DIM:
            # decoder output to hidden
            W_s2h = dy.parameter(self.pW_s2h)
            b_s2h = dy.parameter(self.pb_s2h)
        # hidden to action
        W_act = dy.parameter(self.pW_act)
        b_act = dy.parameter(self.pb_act)
    
        encoder.pop()  # BEGIN_WORD_CHAR
        
        # a list of tuples:
        #    (decoder state, encoder state, list of previous actions,
        #     log prob of previous actions, log prob of previous actions as dynet object,
        #     word generated so far)
        beam = [(decoder, encoder, [COPY], 0., 0., [])]

        beam_length = 0
        complete_hypotheses = []
        
        while beam_length <= MAX_ACTION_SEQ_LEN:
            
            if not beam or beam_width == 0:
                break
            
            
            # compute probability of each of the actions and choose an action
            # either from the oracle or if there is no oracle, based on the model
            expansion = []
            for decoder, encoder, prev_actions, log_p, log_p_expr, word in beam:
                valid_actions = _valid_actions(encoder)
                # decoder
                decoder_input = dy.concatenate([encoder.embedding(),
                                                features,
                                                self.ACT_LOOKUP[prev_actions[-1]]
                                               ])
                decoder = decoder.add_input(decoder_input)
                # classifier
                if self.double_feats:
                    classifier_input = dy.concatenate([decoder.output(), features])
                else:
                    classifier_input = decoder.output()
                if self.MLP_DIM:
                    h = self.NONLIN(W_s2h * classifier_input + b_s2h)
                else:
                    h = classifier_input
                logits = W_act * h + b_act
                log_probs_expr = dy.log_softmax(logits, valid_actions)
                log_probs = log_probs_expr.npvalue()
                top_actions = np.argsort(log_probs)[-beam_width:]
                prev_actions_int = list(int(x) for x in prev_actions)
                top_actions_int = list(int(x) for x in top_actions)
                expansion.extend((
                    (decoder, encoder.copy(),
                     list(prev_actions_int), a, log_p + log_probs[a],
                     log_p_expr + log_probs_expr[a], list(word)) for a in list(top_actions_int)))

            beam = []
            expansion.sort(key=lambda e: e[4])
            for e in expansion[-beam_width:]:
                decoder, encoder, prev_actions, action, log_p, log_p_expr, word = e
            
                prev_actions.append(action)

                # execute the action to update the transducer state
                if action == END_WORD:
                    # 1. Finish transduction:
                    #  * beam width should be decremented
                    #  * expansion should be taken off the beam and
                    # stored to final hypotheses set
                    beam_width -= 1
                    complete_hypotheses.append((log_p, log_p_expr, ''.join(word), prev_actions))
                else:
                    if action == COPY:
                        # 1. Increment attention index
                        char_ = encoder.pop()
                        # 2. Append copied character to the output word
                        word.append(self.vocab.char.i2w[char_])
                    elif action == DELETE:               
                        # 1. Increment attention index
                        encoder.pop()
                    else:
                        # one of the INSERT actions
                        assert action in self.INSERTS
                        # 1. Append inserted character to the output word
                        char_ = self.vocab.act.i2w[action]
                        word.append(char_)
                    beam.append((decoder, encoder, prev_actions, log_p, log_p_expr, word))
            
            beam_length += 1

        if not complete_hypotheses:
            # no hypothesis finished within the step limit; fall back to the current beam
            complete_hypotheses = [(log_p, log_p_expr, ''.join(word), prev_actions)
                                   for _, _, prev_actions, log_p, log_p_expr, word in beam]

        complete_hypotheses.sort(key=lambda h: h[0], reverse=True)
            
        return complete_hypotheses
Exemplo n.º 60
0
        network_nodes[n] = s
    return network_nodes, nodes[-1]


m = dy.ParameterCollection()
initial_values = [0.2, 0.8]
p = {}
for idx, val in enumerate(initial_values):
    p[idx] = m.add_parameters((1), init=dy.ConstInitializer(val))
trainer = dy.AdamTrainer(m, alpha=0.01)

for i in range(1, 4):
    print("\nSTART of Epoch", i, "\n")
    counts = {}
    for idx in p:
        print("rule", idx, "prob:", dy.parameter(p[idx]).value())
        counts[idx] = 0.0
    print()

    for elem in corpus:
        hypergraph = build_hypergraph_rec(elem, {})
        print("hypergraph for", elem, ":", hypergraph)
        network, output = build_network(p, hypergraph)
        loss = 1 * network[output]
        loss.backward()
        for n in range(1, output + 1):
            print("node", n, "value", network[n].value(), "gradient",
                  network[n].gradient())
        for r in p:
            rv = dy.parameter(p[r])
            count = (rv.gradient() * rv.value() / loss.value())[0]