Example #1
    def decomp_attend(self, vecsA, vecsB):
        # Fq^T Fc -> should be sped up with native matrix/tensor multiplication
        Fq = vecsA  # the original word vectors, not yet passed through an NN as in Eq. 1; a function F is still needed
        Fc = vecsB  # likewise still needs a function F

        expE = []
        for fq in Fq:
            row = []
            for fc in Fc:
                row.append(dt.exp(dt.dot_product(fq, fc)))
            expE.append(row)
        #print ("debug: expE", expE[0][0].value())

        invSumExpEi = []
        for i in xrange(len(Fq)):
            invSumExpEi.append(dt.pow(dt.esum(expE[i]), dt.scalarInput(-1)))

        invSumExpEj = []
        for j in xrange(len(Fc)):
            invSumExpEj.append(
                dt.pow(dt.esum([expE[i][j] for i in xrange(len(Fq))]),
                       dt.scalarInput(-1)))

        beta = []
        for i in xrange(len(Fq)):
            s = dt.esum([Fc[j] * expE[i][j] for j in xrange(len(Fc))])
            beta.append(s * invSumExpEi[i])
        #print("debug: beta", beta[0].value())

        alpha = []
        for j in xrange(len(Fc)):
            s = dt.esum([Fq[i] * expE[i][j] for i in xrange(len(Fq))])  # attend over Fq for each position j
            alpha.append(s * invSumExpEj[j])
        #print("debug: alpha", alpha[0].value())

        # Compare
        v1i = [
            dt.logistic(dt.concatenate([Fq[i], beta[i]]))
            for i in xrange(len(Fq))
        ]  # need a function G
        v2j = [
            dt.logistic(dt.concatenate([Fc[j], alpha[j]]))
            for j in xrange(len(Fc))
        ]  # need a function G

        #print ("debug: v1i", v1i[0].value())
        #print ("debug: v2j", v2j[0].value())

        # Aggregate

        v1 = dt.esum(v1i)
        v2 = dt.esum(v2j)

        #print ("debug: v1.value()", v1.value())
        #print ("debug: v2.value()", v2.value())

        #colScore = dt.logistic(dt.dot_product(self.SelHW, dt.concatenate([v1,v2])))
        return dt.dot_product(v1, v2)
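The nested exp/esum loops above compute the attend step of a decomposable-attention-style model term by term. As the first comment notes, the same quantities can be obtained with native matrix operations. A minimal sketch, assuming DyNet's column-wise softmax and the same dt alias (decomp_attend_vectorized is a hypothetical name, and it returns matrices rather than lists of vectors):

    def decomp_attend_vectorized(self, vecsA, vecsB):
        A = dt.concatenate_cols(vecsA)    # d x m, columns are the vectors of A
        B = dt.concatenate_cols(vecsB)    # d x n, columns are the vectors of B
        E = dt.transpose(A) * B           # m x n alignment scores e_ij
        # beta_i = sum_j softmax_over_j(e_ij) * b_j -> columns of a d x m matrix
        beta = B * dt.softmax(dt.transpose(E))
        # alpha_j = sum_i softmax_over_i(e_ij) * a_i -> columns of a d x n matrix
        alpha = A * dt.softmax(E)
        return beta, alpha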
Example #2
    def intra_sent_attend(self, vecs):
        numVecs = len(vecs)
        fVecs = [dt.tanh(self.SelIntraFW * v) for v in vecs]
        expE = []
        for i, fq in enumerate(fVecs):
            row = []
            for j, fc in enumerate(fVecs):
                row.append(
                    dt.exp(
                        dt.dot_product(fq, fc) +
                        self.SelIntraBias[i - j +
                                          int(config.d["DIST_BIAS_DIM"] / 2)]))
            expE.append(row)

        invSumExpE = []
        for i in xrange(numVecs):
            invSumExpE.append(dt.pow(dt.esum(expE[i]), dt.scalarInput(-1)))

        alpha = []
        for i in xrange(numVecs):
            s = dt.esum([vecs[j] * expE[i][j] for j in xrange(numVecs)])
            alpha.append(s * invSumExpE[i])

        return [
            dt.tanh(self.SelIntraHW * dt.concatenate([v, a]))
            for v, a in zip(vecs, alpha)
        ]
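The distance-sensitive bias in this intra-sentence attention follows Parikh et al. (2016): the score between positions i and j is the dot product of the transformed vectors plus a learned bias indexed by the relative distance i - j (the offset of DIST_BIAS_DIM / 2 shifts negative distances into a valid index range):

\[
f_{ij} = F_{\text{intra}}(a_i)^{\top} F_{\text{intra}}(a_j) + d_{i-j},
\qquad
\bar{a}_i = \sum_{j=1}^{n} \frac{\exp(f_{ij})}{\sum_{k=1}^{n} \exp(f_{ik})}\, a_j .
\]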
Example #3
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x]
                                            for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    W_mean = dy.parameter(W_mean_p)
    V_mean = dy.parameter(V_mean_p)
    b_mean = dy.parameter(b_mean_p)

    W_var = dy.parameter(W_var_p)
    V_var = dy.parameter(V_var_p)
    b_var = dy.parameter(b_var_p)

    # The mean vector from the encoder.
    mu = mlp(src_output, W_mean, V_mean, b_mean)
    # The diagonal of the log-covariance matrix from the encoder
    # (treating it as the log variance simplifies the implementation below).
    log_var = mlp(src_output, W_var, V_var, b_var)

    # Compute KL[N(u(x), sigma(x)) || N(0, I)]
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(1 + log_var -
                                  dy.pow(mu, dy.inputVector([2])) -
                                  dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        # feed the previous word into the decoder and advance the state
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        prev_word = next_word

    softmax_loss = dy.esum(all_losses)

    return kl_loss, softmax_loss
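The mlp and reparameterize helpers called above are used but not shown in this excerpt. A minimal sketch of typical definitions, under the assumption of a one-hidden-layer MLP and the standard (unbatched) Gaussian reparameterization trick; this is not necessarily the project's exact code:

    def mlp(x, W, V, b):
        # one hidden layer: V * tanh(W x + b)
        return V * dy.tanh(dy.affine_transform([b, W, x]))

    def reparameterize(mu, log_var):
        # z = mu + sigma * eps with eps ~ N(0, I); keeps the sample differentiable w.r.t. mu and log_var
        std = dy.exp(log_var * 0.5)
        eps = dy.random_normal(mu.dim()[0][0])
        return mu + dy.cmult(std, eps)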
Example #4
File: vae.py Project: danielhers/cnn
def loss_function(recon_x, x, mu, logvar):
    BCE = dy.binary_log_loss(recon_x, x)  # equiv to torch.nn.functional.binary_cross_entropy(?,?, size_average=False)
    # see Appendix B from VAE paper:
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KLD = -0.5 * dy.sum_elems(1 + logvar - dy.pow(mu, dy.scalarInput(2)) - dy.exp(logvar))

    return BCE + KLD
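The KLD term is the closed-form KL divergence between the encoder's diagonal Gaussian and the standard normal prior, as derived in Appendix B of the cited paper:

\[
\mathrm{KL}\bigl(\mathcal{N}(\mu, \operatorname{diag}(\sigma^2)) \,\|\, \mathcal{N}(0, I)\bigr)
  = -\tfrac{1}{2} \sum_{k} \bigl(1 + \log \sigma_k^2 - \mu_k^2 - \sigma_k^2\bigr),
\]

with logvar playing the role of log sigma^2; this is exactly the expression dy.sum_elems evaluates above.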
Example #5
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    W_mean = dy.parameter(W_mean_p)
    V_mean = dy.parameter(V_mean_p)
    b_mean = dy.parameter(b_mean_p)

    W_var = dy.parameter(W_var_p)
    V_var = dy.parameter(V_var_p)
    b_var = dy.parameter(b_var_p)

    # The mean vector from the encoder.
    mu = mlp(src_output, W_mean, V_mean, b_mean)
    # The diagonal of the log-covariance matrix from the encoder
    # (treating it as the log variance simplifies the implementation below).
    log_var = mlp(src_output, W_var, V_var, b_var)

    # Compute KL[N(u(x), sigma(x)) || N(0, I)]
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        # feed the previous word into the decoder and advance the state
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        prev_word = next_word

    softmax_loss = dy.esum(all_losses)

    return kl_loss, softmax_loss
Example #6
def loss_function(recon_x, x, mu, logvar):
    BCE = dy.binary_log_loss(
        recon_x, x
    )  # equiv to torch.nn.functional.binary_cross_entropy(?,?, size_average=False)
    # see Appendix B from VAE paper:
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KLD = -0.5 * dy.sum_elems(1 + logvar - dy.pow(mu, dy.scalarInput(2)) -
                              dy.exp(logvar))

    return BCE + KLD
Example #7
    def learn(self, batch_size):
        if self.prioritized:
            if not self.memory.is_full(): return -np.inf
            indices, exps, weights = self.memory.sample(batch_size, self.beta)
        else:
            exps = self.memory.sample(batch_size)
        obss, actions, rewards, obs_nexts, dones = self._process(exps)

        dy.renew_cg()
        target_network = self.target_network if self.use_double_dqn else self.network
        if self.dueling:
            target_values, v = target_network(obs_nexts, batched=True)
            target_values = target_values.npvalue() + v.npvalue()
        else:
            target_values = target_network(obs_nexts, batched=True)
            target_values = target_values.npvalue()
        target_values = np.max(target_values, axis=0)
        target_values = rewards + self.reward_decay * (target_values *
                                                       (1 - dones))

        dy.renew_cg()
        if self.dueling:
            all_values_expr, v = self.network(obss, batched=True)
        else:
            all_values_expr = self.network(obss, batched=True)
        picked_values = dy.pick_batch(all_values_expr, actions)
        diff = (picked_values + v if self.dueling else
                picked_values) - dy.inputTensor(target_values, batched=True)
        if self.prioritized:
            self.memory.update(indices, np.transpose(np.abs(diff.npvalue())))
        losses = dy.pow(diff, dy.constant(1, 2))
        if self.prioritized:
            losses = dy.cmult(losses, dy.inputTensor(weights, batched=True))
        loss = dy.sum_batches(losses)
        loss_value = loss.npvalue()
        loss.backward()
        self.trainer.update()

        self.epsilon = max(self.epsilon - self.epsilon_decrease,
                           self.epsilon_lower)
        if self.prioritized:
            self.beta = min(self.beta + self.beta_increase, 1.)

        self.learn_step += 1
        if self.use_double_dqn and self.learn_step % self.n_replace_target == 0:
            self.target_network.update(self.network)
        return loss_value
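Here dy.pow(diff, dy.constant(1, 2)) squares the batched TD error elementwise; dy.constant(1, 2) just builds a length-1 vector holding the exponent 2. Assuming a DyNet build that also exposes dy.square, an equivalent one-liner would be:

    losses = dy.square(diff)  # elementwise square of the TD error, same as dy.pow(diff, dy.constant(1, 2))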
Example #8
    def calc_loss_basic(self, frames, label):

        # Renew the computation graph
        dy.renew_cg()

        # Initialize LSTM
        init_state_src = self.lstm_builder.initial_state()

        # Instantiate the params
        W_mean = dy.parameter(self.W_mean_p)
        V_mean = dy.parameter(self.V_mean_p)
        b_mean = dy.parameter(self.b_mean_p)
        W_var = dy.parameter(self.W_var_p)
        V_var = dy.parameter(self.V_var_p)
        b_var = dy.parameter(self.b_var_p)

        input_frames = dy.inputTensor(frames)
        output_label = label

        # Get the LSTM embeddings
        src_output = init_state_src.add_inputs(
            [frame for frame in input_frames])[-1].output()

        # Get the mean and diagonal log covariance from the encoder
        mu = self.mlp(src_output, W_mean, V_mean, b_mean)
        log_var = self.mlp(src_output, W_var, V_var, b_var)

        # Compute the KL Divergence loss
        kl_loss = -0.5 * dy.sum_elems(1 + log_var -
                                      dy.pow(mu, dy.inputVector([2])) -
                                      dy.exp(log_var))

        # Reparameterize
        z = self.reparameterize(mu, log_var)

        W_sm = dy.parameter(self.W_sm_p)
        b_sm = dy.parameter(self.b_sm_p)

        # Calculate the reconstruction loss
        pred = dy.affine_transform([b_sm, W_sm, z])
        label_embedding = self.lookup[label]
        #print label, label_embedding
        recons_loss = dy.pickneglogsoftmax(pred, label)

        return kl_loss, recons_loss
Example #9
File: dqn.py Project: danielhers/cnn
    def learn(self, batch_size):
        if self.prioritized:
            if not self.memory.is_full(): return -np.inf
            indices, exps, weights = self.memory.sample(batch_size, self.beta)
        else:
            exps = self.memory.sample(batch_size)
        obss, actions, rewards, obs_nexts, dones = self._process(exps)

        dy.renew_cg()
        target_network = self.target_network if self.use_double_dqn else self.network
        if self.dueling:
            target_values, v = target_network(obs_nexts, batched=True)
            target_values = target_values.npvalue() + v.npvalue()
        else:
            target_values = target_network(obs_nexts, batched=True)
            target_values = target_values.npvalue()
        target_values = np.max(target_values, axis=0)
        target_values = rewards + self.reward_decay * (target_values * (1 - dones))

        dy.renew_cg()
        if self.dueling:
            all_values_expr, v = self.network(obss, batched=True)
        else:
            all_values_expr = self.network(obss, batched=True)
        picked_values = dy.pick_batch(all_values_expr, actions)
        diff = (picked_values + v if self.dueling else picked_values) - dy.inputTensor(target_values, batched=True)
        if self.prioritized:
            self.memory.update(indices, np.transpose(np.abs(diff.npvalue())))
        losses = dy.pow(diff, dy.constant(1, 2))
        if self.prioritized:
            losses = dy.cmult(losses, dy.inputTensor(weights, batched=True))
        loss = dy.sum_batches(losses)
        loss_value = loss.npvalue()
        loss.backward()
        self.trainer.update()

        self.epsilon = max(self.epsilon - self.epsilon_decrease, self.epsilon_lower)
        if self.prioritized:
            self.beta = min(self.beta + self.beta_increase, 1.)

        self.learn_step += 1
        if self.use_double_dqn and self.learn_step % self.n_replace_target == 0:
            self.target_network.update(self.network)
        return loss_value
Example #10
    def learn(self, src, dst):
        softmax_list, aux_list = self._predict(src, dst=dst,  num_predictions=len(dst) + 1, runtime=False)
        for softmax, aux, entry in zip(softmax_list, aux_list, dst):
            word = entry.word.decode('utf-8').lower()
            if word in self.output_encodings.word2int:
                w_index = self.output_encodings.word2int[word]
            else:
                w_index = self.output_encodings.word2int["<UNK>"]

            w_emb, found = self.dst_we.get_word_embeddings(entry.word.decode('utf-8'))
            self.losses.append(-dy.log(dy.pick(softmax, w_index)))
            if found:
                vec1=aux
                vec2=dy.inputVector(w_emb)
                cosine = dy.dot_product(vec1, vec2) * dy.pow(dy.l2_norm(vec1) * dy.l2_norm(vec2),
                                                                       dy.scalarInput(-1))
                self.losses.append(dy.squared_distance(cosine, dy.scalarInput(1.0)))


        self.losses.append(-dy.log(dy.pick(softmax_list[-1], self.EOS)))
Example #11
    def calc_loss_basic(self, embedding, label):

        # Renew the computation graph
        dy.renew_cg()

        # Instantiate the params
        W_mean = dy.parameter(self.W_mean_p)
        V_mean = dy.parameter(self.V_mean_p)
        b_mean = dy.parameter(self.b_mean_p)
        W_var = dy.parameter(self.W_var_p)
        V_var = dy.parameter(self.V_var_p)
        b_var = dy.parameter(self.b_var_p)

        input_embedding = dy.inputTensor(embedding)
        output_label = label

        # Get the LSTM embeddings
        src_output = self.dnn.predict(input_embedding)

        # Get the mean and diagonal log covariance from the encoder
        mu = self.mlp(src_output, W_mean, V_mean, b_mean)
        log_var = self.mlp(src_output, W_var, V_var, b_var)

        # Compute the KL Divergence loss
        kl_loss = -0.5 * dy.sum_elems(1 + log_var -
                                      dy.pow(mu, dy.inputVector([2])) -
                                      dy.exp(log_var))

        # Reparameterize
        z = self.reparameterize(mu, log_var)

        W_sm = dy.parameter(self.W_sm_p)
        b_sm = dy.parameter(self.b_sm_p)

        # Calculate the reconstruction loss
        pred = dy.selu(dy.affine_transform([b_sm, W_sm, z]))
        label_embedding = self.lookup[label]
        recons_loss = dy.pickneglogsoftmax(pred, label)

        return kl_loss, recons_loss
Example #12
    def get_regularization(self, batch_preds, word2int):
        reg = 0
        # compute regularization on the parameters
        for key, value in self.params.iteritems():
            if key != "b" and key != "lookup":
                expression = dy.parameter(value)
                reg += dy.sum_elems(dy.pow(expression, dy.scalarInput(2)))
            if key == "lookup":
                for example in batch_preds:
                    premise = example[0]
                    hypothesis = example[1]
                    premise_seq = [value[word2int.get(i)] for i in premise]
                    hypothesis_seq = [
                        value[word2int.get(i)] for i in hypothesis
                    ]
                    for exp in premise_seq:
                        reg += dy.sum_elems(dy.pow(exp, dy.scalarInput(2)))
                    for exp in hypothesis_seq:
                        reg += dy.sum_elems(dy.pow(exp, dy.scalarInput(2)))

        # compute regularization on the bilstm terms
        for i in range(2):
            weights_prem_fw = self.fw_premise_builder.get_parameter_expressions(
            )[0][i]
            weights_prem_bw = self.bw_premise_builder.get_parameter_expressions(
            )[0][i]
            weights_hypo_fw = self.fw_hypo_builder.get_parameter_expressions(
            )[0][i]
            weights_hypo_bw = self.bw_hypo_builder.get_parameter_expressions(
            )[0][i]
            reg += dy.sum_elems(dy.pow(weights_prem_fw, dy.scalarInput(2)))
            reg += dy.sum_elems(dy.pow(weights_prem_bw, dy.scalarInput(2)))
            reg += dy.sum_elems(dy.pow(weights_hypo_fw, dy.scalarInput(2)))
            reg += dy.sum_elems(dy.pow(weights_hypo_bw, dy.scalarInput(2)))

        return reg
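The returned reg expression is a sum of squared parameter entries (an L2 penalty), so it is meant to be scaled and added to the main objective before the backward pass. A minimal usage sketch, where model, task_loss, and l2_lambda are hypothetical names for the model instance, its primary loss expression, and the regularization weight:

    reg = model.get_regularization(batch_preds, word2int)
    total_loss = task_loss + l2_lambda * reg
    total_loss.backward()
    trainer.update()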
Example #13
        def MRT_batch_update(batch, epoch):

            dy.renew_cg()

            alpha = dy.scalarInput(alpha_p)

            batch_loss = []
            rewards = []
            for sample in batch:

                lemma = sample.lemma
                word = sample.word
                word_str = sample.word_str
                feats = sample.pos, sample.feats
                actions = sample.actions

                # ORACLE PREDICTION
                #loss, prediction_b, predicted_actions_b = \
                gold_loss, _, _ = \
                    self.transducer.transduce(lemma, feats, actions, external_cg=True)
                gold_loss = dy.esum(gold_loss)
                #if gold_loss.scalar_value() < -50.:  # Sum log P
                #    print 'Dangerously low prob of gold action seq: ', gold_loss.scalar_value(), word_str
                #    hypotheses = []
                #else:
                #    hypotheses = [ (_, gold_loss, word_str, actions) ]

                # BEAM-SEARCH-BASED PREDICTION
                #hypotheses += self.transducer.beam_search_decode(lemma, feats, external_cg=True,
                #                                                 beam_width=beam_width)
                sample_rewards = [-1.]
                sample_losses = [gold_loss]
                predictions = [word_str]
                seen_predicted_acts = {tuple(actions)}
                #for _, loss, prediction, predicted_actions in hypotheses:
                for _ in range(sample_size):
                    loss, prediction, predicted_actions = \
                        self.transducer.transduce(lemma, feats, sampling=True, external_cg=True)
                    predicted_actions = tuple(predicted_actions)
                    if predicted_actions in seen_predicted_acts:
                        #if verbose: print 'already sampled this action sequence: ', predicted_actions
                        continue
                    loss = dy.esum(loss)
                    if loss.scalar_value() < -20:  # log P
                        continue
                    else:
                        seen_predicted_acts.add(predicted_actions)
                #for _, loss, prediction, predicted_actions in hypotheses:

                # COMPUTE REWARDS
                    reward = compute_reward(word, word_str, prediction)

                    sample_rewards.append(reward)
                    sample_losses.append(loss)
                    predictions.append(prediction)

                # SCALE & RENORMALIZE: (these are log P)
                if len(sample_rewards) == 1 and sample_rewards[0] == -1.:
                    if verbose: print 'Nothing to update with.'
                    continue
                else:
                    #if verbose: print 'sample_losses', sample_losses
                    sample_losses = dy.concatenate(sample_losses)
                    sample_rewards = dy.inputVector(sample_rewards)
                    q_unnorm = dy.pow(dy.exp(sample_losses), alpha)
                    q = dy.cdiv(q_unnorm, dy.sum_elems(q_unnorm))

                    if verbose:
                        print 'q', q.npvalue()
                        print 'sample_rewards', sample_rewards.npvalue()
                        print 'word', word_str
                        print 'predictions: ', u', '.join(predictions)
                    batch_loss.append(dy.dot_product(q, sample_rewards))
            if batch_loss:
                batch_loss = dy.esum(batch_loss)
                loss = batch_loss.scalar_value()  # forward
                try:
                    batch_loss.backward()
                    self.trainer.update()
                except Exception, e:
                    print 'Batch loss: ', loss
                    print 'q', q.npvalue()
                    print 'q_unnorm', q_unnorm.npvalue()
                    print 'gold_loss', gold_loss.scalar_value()
                    print 'sample_rewards', sample_rewards.npvalue()
                    print 'word', word_str
                    print 'predictions: ', u', '.join(predictions)
                    raise e
                if verbose: print 'Batch loss: ', loss
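The q / reward dot product at the end of each sample loop is a minimum-risk-training estimate: the per-sequence log-probabilities are exponentiated, sharpened by alpha, renormalized over the sampled set S, and used to weight the rewards,

\[
q_\alpha(y \mid x) = \frac{p(y \mid x)^{\alpha}}{\sum_{y' \in S} p(y' \mid x)^{\alpha}},
\qquad
\text{batch term} = \sum_{y \in S} q_\alpha(y \mid x)\, r(y),
\]

where S contains the gold action sequence (with its reward fixed to -1) plus the unique sampled sequences, and r(y) comes from compute_reward.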
Example #14
File: gelu.py Project: chrikoehn/antu
    def __call__(self, x):
        return 0.5 * x * (1 + dy.tanh(
            math.sqrt(2 / math.pi) * (x + 0.044715 * dy.pow(x, 3))))
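This is the tanh approximation of the GELU activation from Hendrycks & Gimpel (2016):

\[
\mathrm{GELU}(x) \approx 0.5\,x\left(1 + \tanh\!\left(\sqrt{2/\pi}\,\bigl(x + 0.044715\,x^{3}\bigr)\right)\right).
\]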
def hallucinate_tags(tweet, sample=False, print_loss=False):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = tweet

    # initialize the LSTM
    init_state_src = lstm_encode.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([embed[x]
                                            for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    W_mu_tweet = dy.parameter(W_mu_tweet_p)
    V_mu_tweet = dy.parameter(V_mu_tweet_p)
    b_mu_tweet = dy.parameter(b_mu_tweet_p)

    W_sig_tweet = dy.parameter(W_sig_tweet_p)
    V_sig_tweet = dy.parameter(V_sig_tweet_p)
    b_sig_tweet = dy.parameter(b_sig_tweet_p)

    # Compute tweet encoding
    mu_tweet = mlp(src_output, W_mu_tweet, V_mu_tweet, b_mu_tweet)
    log_var_tweet = mlp(src_output, W_sig_tweet, V_sig_tweet, b_sig_tweet)

    #W_mu_tag = dy.parameter(W_mu_tag_p)
    #V_mu_tag = dy.parameter(V_mu_tag_p)
    #b_mu_tag = dy.parameter(b_mu_tag_p)

    #W_sig_tag = dy.parameter(W_sig_tag_p)
    #V_sig_tag = dy.parameter(V_sig_tag_p)
    #b_sig_tag = dy.parameter(b_sig_tag_p)

    # Compute tag encoding
    #tags_tensor = dy.sparse_inputTensor([tags], np.ones((len(tags),)), (NUM_TAGS,))

    #mu_tag      = dy.dropout(mlp(tags_tensor, W_mu_tag,  V_mu_tag,  b_mu_tag), DROPOUT)
    #log_var_tag = dy.dropout(mlp(tags_tensor, W_sig_tag, V_sig_tag, b_sig_tag), DROPOUT)

    # Combine encodings for mean and diagonal covariance
    W_mu = dy.parameter(W_mu_p)
    b_mu = dy.parameter(b_mu_p)

    W_sig = dy.parameter(W_sig_p)
    b_sig = dy.parameter(b_sig_p)

    mu_tag = dy.zeros(HIDDEN_DIM)
    log_var_tag = dy.zeros(HIDDEN_DIM)

    mu = dy.affine_transform([b_mu, W_mu, dy.concatenate([mu_tweet, mu_tag])])
    log_var = dy.affine_transform(
        [b_sig, W_sig,
         dy.concatenate([log_var_tweet, log_var_tag])])

    # KL-Divergence loss computation
    kl_loss = -0.5 * dy.sum_elems(1 + log_var -
                                  dy.pow(mu, dy.inputVector([2])) -
                                  dy.exp(log_var))

    if print_loss:
        print("kl loss/word={:.4f}".format(kl_loss.value() / len(tweet)))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    #     all_losses = []

    #current_state = lstm_decode.initial_state().set_s([z, dy.tanh(z)])
    #prev_word = src[0]
    #W_sm = dy.parameter(W_tweet_softmax_p)
    #b_sm = dy.parameter(b_tweet_softmax_p)

    #for next_word in src[1:]:
    #    # feed the current state into the
    #    current_state = current_state.add_input(embed[prev_word])
    #    output_embedding = current_state.output()

    #    s = dy.affine_transform([b_sm, W_sm, output_embedding])
    #    all_losses.append(dy.pickneglogsoftmax(s, next_word))

    #    prev_word = next_word

    #softmax_loss = dy.esum(all_losses)

    W_hidden = dy.parameter(W_hidden_p)
    b_hidden = dy.parameter(b_hidden_p)

    W_out = dy.parameter(W_tag_output_p)
    b_out = dy.parameter(b_tag_output_p)

    h = dy.tanh(b_hidden + W_hidden * z)
    o = dy.logistic(b_out + W_out * h)

    tag_ranks = o.value()

    # Sample from tags
    if sample:
        print('Sampling')
        gen_tags = []
        for i, p in enumerate(tag_ranks):
            if random.random() < p:
                gen_tags.append(i)

        return gen_tags

    else:
        return tag_ranks
def calc_loss(sent, epsilon=0.0):
    #dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    tags = sent[1]

    # initialize the LSTM
    init_state_src = lstm_encode.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([embed[x]
                                            for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    W_mu_tweet = dy.parameter(W_mu_tweet_p)
    V_mu_tweet = dy.parameter(V_mu_tweet_p)
    b_mu_tweet = dy.parameter(b_mu_tweet_p)

    W_sig_tweet = dy.parameter(W_sig_tweet_p)
    V_sig_tweet = dy.parameter(V_sig_tweet_p)
    b_sig_tweet = dy.parameter(b_sig_tweet_p)

    # Compute tweet encoding
    mu_tweet = dy.dropout(mlp(src_output, W_mu_tweet, V_mu_tweet, b_mu_tweet),
                          DROPOUT)
    log_var_tweet = dy.dropout(
        mlp(src_output, W_sig_tweet, V_sig_tweet, b_sig_tweet), DROPOUT)

    W_mu_tag = dy.parameter(W_mu_tag_p)
    V_mu_tag = dy.parameter(V_mu_tag_p)
    b_mu_tag = dy.parameter(b_mu_tag_p)

    W_sig_tag = dy.parameter(W_sig_tag_p)
    V_sig_tag = dy.parameter(V_sig_tag_p)
    b_sig_tag = dy.parameter(b_sig_tag_p)

    # Compute tag encoding
    tags_tensor = dy.sparse_inputTensor([tags], np.ones((len(tags), )),
                                        (NUM_TAGS, ))

    mu_tag = dy.dropout(mlp(tags_tensor, W_mu_tag, V_mu_tag, b_mu_tag),
                        DROPOUT)
    log_var_tag = dy.dropout(mlp(tags_tensor, W_sig_tag, V_sig_tag, b_sig_tag),
                             DROPOUT)

    # Combine encodings for mean and diagonal covariance
    W_mu = dy.parameter(W_mu_p)
    b_mu = dy.parameter(b_mu_p)

    W_sig = dy.parameter(W_sig_p)
    b_sig = dy.parameter(b_sig_p)

    # Slowly phase out getting both inputs
    if random.random() < epsilon:
        mask = dy.zeros(HIDDEN_DIM)
    else:
        mask = dy.ones(HIDDEN_DIM)

    if random.random() < 0.5:
        mu_tweet = dy.cmult(mu_tweet, mask)
        log_var_tweet = dy.cmult(log_var_tweet, mask)
    else:
        mu_tag = dy.cmult(mu_tag, mask)
        log_var_tag = dy.cmult(log_var_tag, mask)

    mu = dy.affine_transform([b_mu, W_mu, dy.concatenate([mu_tweet, mu_tag])])
    log_var = dy.affine_transform(
        [b_sig, W_sig,
         dy.concatenate([log_var_tweet, log_var_tag])])

    # KL-Divergence loss computation
    kl_loss = -0.5 * dy.sum_elems(1 + log_var -
                                  dy.pow(mu, dy.inputVector([2])) -
                                  dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = lstm_decode.initial_state().set_s([z, dy.tanh(z)])
    prev_word = src[0]
    W_sm = dy.parameter(W_tweet_softmax_p)
    b_sm = dy.parameter(b_tweet_softmax_p)

    for next_word in src[1:]:
        # feed the previous word into the decoder and advance the state

        current_state = current_state.add_input(embed[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])

        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        # Slowly phase out teacher forcing (this may be slow??)
        if random.random() < epsilon:
            p = dy.softmax(s).npvalue()
            prev_word = np.random.choice(VOCAB_SIZE, p=p / p.sum())
        else:
            prev_word = next_word

    softmax_loss = dy.esum(all_losses)

    W_hidden = dy.parameter(W_hidden_p)
    b_hidden = dy.parameter(b_hidden_p)

    W_out = dy.parameter(W_tag_output_p)
    b_out = dy.parameter(b_tag_output_p)

    h = dy.dropout(dy.tanh(b_hidden + W_hidden * z), DROPOUT)
    o = dy.logistic(b_out + W_out * h)

    crossentropy_loss = dy.binary_log_loss(o, tags_tensor)

    return kl_loss, softmax_loss, crossentropy_loss
def hallucinate_tweet(given_tags):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    tags = given_tags

    # initialize the LSTM
    #init_state_src = lstm_encode.initial_state()

    # get the output of the first LSTM
    #src_output = init_state_src.add_inputs([embed[x] for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    #W_mu_tweet = dy.parameter(W_mu_tweet_p)
    #V_mu_tweet = dy.parameter(V_mu_tweet_p)
    #b_mu_tweet = dy.parameter(b_mu_tweet_p)

    #W_sig_tweet = dy.parameter(W_sig_tweet_p)
    #V_sig_tweet = dy.parameter(V_sig_tweet_p)
    #b_sig_tweet = dy.parameter(b_sig_tweet_p)

    # Compute tweet encoding
    #mu_tweet      = mlp(src_output, W_mu_tweet,  V_mu_tweet,  b_mu_tweet)
    #log_var_tweet = mlp(src_output, W_sig_tweet, V_sig_tweet, b_sig_tweet)

    W_mu_tag = dy.parameter(W_mu_tag_p)
    V_mu_tag = dy.parameter(V_mu_tag_p)
    b_mu_tag = dy.parameter(b_mu_tag_p)

    W_sig_tag = dy.parameter(W_sig_tag_p)
    V_sig_tag = dy.parameter(V_sig_tag_p)
    b_sig_tag = dy.parameter(b_sig_tag_p)

    # Compute tag encoding
    tags_tensor = dy.sparse_inputTensor([tags], np.ones((len(tags), )),
                                        (NUM_TAGS, ))

    mu_tag = dy.dropout(mlp(tags_tensor, W_mu_tag, V_mu_tag, b_mu_tag),
                        DROPOUT)
    log_var_tag = dy.dropout(mlp(tags_tensor, W_sig_tag, V_sig_tag, b_sig_tag),
                             DROPOUT)

    # Combine encodings for mean and diagonal covariance
    W_mu = dy.parameter(W_mu_p)
    b_mu = dy.parameter(b_mu_p)

    W_sig = dy.parameter(W_sig_p)
    b_sig = dy.parameter(b_sig_p)

    mu_tweet = dy.zeros(HIDDEN_DIM)
    log_var_tweet = dy.zeros(HIDDEN_DIM)

    mu = dy.affine_transform([b_mu, W_mu, dy.concatenate([mu_tweet, mu_tag])])
    log_var = dy.affine_transform(
        [b_sig, W_sig,
         dy.concatenate([log_var_tweet, log_var_tag])])

    # KL-Divergence loss computation
    kl_loss = -0.5 * dy.sum_elems(1 + log_var -
                                  dy.pow(mu, dy.inputVector([2])) -
                                  dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = lstm_decode.initial_state().set_s([z, dy.tanh(z)])
    prev_word = vocab[START]
    W_sm = dy.parameter(W_tweet_softmax_p)
    b_sm = dy.parameter(b_tweet_softmax_p)

    gen_tweet = []
    for i in range(20):
        # feed the previous word into the decoder and advance the state
        current_state = current_state.add_input(embed[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        p = dy.softmax(s).npvalue()
        next_word = np.random.choice(VOCAB_SIZE, p=p / p.sum())
        gen_tweet.append(next_word)
        prev_word = next_word

    return gen_tweet
        def MRT_batch_update(batch, epoch):

            dy.renew_cg()

            alpha = dy.scalarInput(alpha_p)

            batch_loss = []
            rewards = []
            for sample in batch:

                lemma = sample.lemma
                word = sample.word
                word_str = sample.word_str
                feats = sample.pos, sample.feats
                actions = sample.actions

                # ORACLE PREDICTION
                #loss, prediction_b, predicted_actions_b = \
                gold_loss, _, _ = \
                    self.transducer.transduce(lemma, feats, actions, external_cg=True)
                gold_loss = dy.esum(gold_loss)
                #if gold_loss.scalar_value() < -50.:  # Sum log P
                #    print 'Dangerously low prob of gold action seq: ', gold_loss.scalar_value(), word_str
                #    hypotheses = []
                #else:
                #    hypotheses = [ (_, gold_loss, word_str, actions) ]

                # BEAM-SEARCH-BASED PREDICTION
                #hypotheses += self.transducer.beam_search_decode(lemma, feats, external_cg=True,
                #                                                 beam_width=beam_width)
                if action_penalty:
                    # we will add edit cost to penalize long and intuitively wasteful actions
                    sample_actions = [cost_actions(actions)]
                sample_rewards = [-1.]
                sample_losses = [gold_loss]
                predictions = [word_str]
                seen_predicted_acts = {tuple(actions)}
                #for _, loss, prediction, predicted_actions in hypotheses:
                for _ in range(sample_size):
                    loss, prediction, predicted_actions = \
                        self.transducer.transduce(lemma, feats, sampling=True, external_cg=True)
                    predicted_actions = tuple(predicted_actions)
                    if predicted_actions in seen_predicted_acts:
                        #if verbose: print 'already sampled this action sequence: ', predicted_actions
                        continue
                    loss = dy.esum(loss)
                    if loss.scalar_value() < -20:  # log P
                        continue
                    else:
                        seen_predicted_acts.add(predicted_actions)
                #for _, loss, prediction, predicted_actions in hypotheses:

                # COMPUTE REWARDS
                    reward = compute_reward(word, word_str, prediction)

                    sample_rewards.append(reward)
                    sample_losses.append(loss)
                    predictions.append(prediction)
                    if action_penalty:
                        sample_actions.append(cost_actions(predicted_actions))

                # SCALE & RENORMALIZE: (these are log P)
                if len(sample_rewards) == 1 and sample_rewards[0] == -1.:
                    if verbose: print('Nothing to update with.')
                    continue
                else:
                    if action_penalty:
                        #X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
                        # min-max scaling to [0, 1]
                        #print 'Sampled actions: ', sample_actions
                        if len(set(sample_actions)) == 1:
                            sample_actions = np.zeros_like(sample_actions)
                        else:
                            sample_actions = np.array(sample_actions)
                            min_score = np.min(sample_actions)
                            max_score = np.max(sample_actions)
                            sample_actions = (sample_actions - min_score) / (
                                max_score - min_score)
                        #print 'Sampled actions: ', sample_actions
                        sample_rewards = (1 - action_penalty) * np.array(
                            sample_rewards) + action_penalty * sample_actions
                        #print 'Sampled rewards: ', sample_rewards
                    #if verbose: print 'sample_losses', sample_losses
                    sample_losses = dy.concatenate(sample_losses)
                    sample_rewards = dy.inputVector(sample_rewards)
                    q_unnorm = dy.pow(dy.exp(sample_losses), alpha)
                    q = dy.cdiv(q_unnorm, dy.sum_elems(q_unnorm))

                    if verbose:
                        print('q', q.npvalue())
                        print('sample_rewards', sample_rewards.npvalue())
                        print('word', word_str)
                        print('predictions: ', u', '.join(predictions))
                    batch_loss.append(dy.dot_product(q, sample_rewards))
            if batch_loss:
                batch_loss = dy.esum(batch_loss)
                loss = batch_loss.scalar_value()  # forward
                try:
                    batch_loss.backward()
                    self.trainer.update()
                except Exception as e:
                    print('Batch loss: ', loss)
                    print('q', q.npvalue())
                    print('q_unnorm', q_unnorm.npvalue())
                    print('gold_loss', gold_loss.scalar_value())
                    print('sample_rewards', sample_rewards.npvalue())
                    print('word', word_str)
                    print('predictions: ', u', '.join(predictions))
                    raise e
                if verbose: print('Batch loss: ', loss)
            else:
                if verbose: print('Batch loss is zero.')
                loss = 0.
            return loss