Example #1
import torch
import torch.nn as nn
class EncoderDecoder(nn.Module):
    def __init__(self, embedding_size, hidden_size, vocab_size, gru_layers):
        super(EncoderDecoder, self).__init__()
        self.encoder = Encoder(embedding_size)
        self.decoder = Decoder(embedding_size, hidden_size, vocab_size,
                               gru_layers)

    def forward(self, images, captions):
        features = self.encoder(images)
        outputs = self.decoder(features, captions)
        return outputs

    def caption_image(self, image, vocabulary, max_length=50):
        result_caption = []

        with torch.no_grad():
            # The image features act as the first input "token" to the GRU.
            x = self.encoder(image).unsqueeze(0)
            states = None

            for _ in range(max_length):
                hiddens, states = self.decoder.gru(x, states)
                output = self.decoder.linear(hiddens.squeeze(0))
                # Greedy decoding: pick the highest-scoring word at each step.
                predicted = output.argmax(1)
                item = predicted.item()
                result_caption.append(item)
                # Feed the predicted word's embedding back in as the next input.
                x = self.decoder.embed(predicted).unsqueeze(0)

                if vocabulary.itos[item] == "<end>":
                    break

            return [vocabulary.itos[idx] for idx in result_caption]
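
For context, here is a minimal sketch of the Encoder and Decoder modules this snippet assumes but does not show. The attribute names (decoder.embed, decoder.gru, decoder.linear) and call signatures are taken from the usage above; the internals (a tiny stand-in CNN backbone, feature-prepended teacher forcing) are assumptions, not the original code.

import torch
import torch.nn as nn

class Encoder(nn.Module):
    # Hypothetical stand-in: in practice this would usually be a pretrained
    # CNN (e.g. a ResNet) with its classification head replaced by a
    # projection to embedding_size.
    def __init__(self, embedding_size):
        super(Encoder, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
        )
        self.fc = nn.Linear(16, embedding_size)

    def forward(self, images):
        features = self.conv(images).flatten(1)  # (batch, 16)
        return self.fc(features)                 # (batch, embedding_size)

class Decoder(nn.Module):
    # Hypothetical sketch exposing the embed/gru/linear attributes that
    # caption_image relies on.
    def __init__(self, embedding_size, hidden_size, vocab_size, gru_layers):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, gru_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        # Teacher forcing: prepend the image features as the first input,
        # then feed the ground-truth caption embeddings.
        embeddings = self.embed(captions)  # (seq_len, batch, embedding_size)
        inputs = torch.cat((features.unsqueeze(0), embeddings), dim=0)
        hiddens, _ = self.gru(inputs)
        return self.linear(hiddens)  # (seq_len + 1, batch, vocab_size)

# Illustrative usage with made-up shapes:
# model = EncoderDecoder(embedding_size=256, hidden_size=512,
#                        vocab_size=5000, gru_layers=1)
# outputs = model(torch.randn(8, 3, 224, 224),
#                 torch.randint(0, 5000, (20, 8)))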
Example #2
import torch
import torch.nn as nn
from torch.distributions import Normal

# EOS_TOKEN and MAX_PARA_LENGTH are assumed to be defined at module level.
class ParaphraseModel(nn.Module):
    '''
        dictionary_size = the size of the dictionary in the dataset
        embedding_dim = each word in the dictionary is embedded into a vector space of this dimension
        rnn_hidden_size = the size of the hidden state of each LSTM in the model
        rnn_num_layers = the number of layers in each LSTM in the model
        z_dim = the encoder encodes the sentence into a z-vector space of this dimension
    '''
    def __init__(self,
                 dictionary_size=100,
                 embedding_dim=1100,
                 rnn_hidden_size=600,
                 rnn_num_layers=2,
                 z_dim=1100):  # Should embedding_dim be the same as z_dim?
        super(ParaphraseModel, self).__init__()
        self.embedding = nn.Embedding(
            dictionary_size,
            embedding_dim)  # could be replaced with pretrained word embeddings such as word2vec
        self.encoder = Encoder(embedding_dim, rnn_hidden_size, rnn_num_layers,
                               z_dim)
        self.decoder = Decoder(embedding_dim, rnn_hidden_size, rnn_num_layers,
                               dictionary_size)
        self.cel = nn.CrossEntropyLoss(ignore_index=-1)  # cross-entropy loss
        self.dictionary_size = dictionary_size
        self.embedding_dim = embedding_dim

    def train_model(self, xo, xp, xo_len, xp_len, kld_coef=1):
        logits, z, mu, logvar = self.AE_forward(xo, xp, xo_len, xp_len)

        cel_loss = self.cel(
            logits.contiguous().view(-1, self.dictionary_size),
            xp.cuda().view(-1))
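        # Closed-form KL divergence between N(mu, sigma^2) and N(0, 1),
        # summed over latent dimensions and averaged over the batch:
        # KL = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)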
        kl_loss = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum(1).mean()
        total_loss = cel_loss + kld_coef * kl_loss
        #print(cel_loss, kl_loss)
        return total_loss

    def AE_forward(self, xo, xp, xo_len, xp_len):
        xo_embed = self.embedding(xo.cuda())
        xp_embed = self.embedding(xp.cuda())

        mu, logvar = self.encoder(xo_embed, xp_embed, xo_len, xp_len)
        std = torch.exp(0.5 * logvar)
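        # Reparameterization trick: draw eps ~ N(0, 1) and set z = mu + eps * std,
        # keeping the sampling step differentiable w.r.t. mu and logvar.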
        nd = Normal(torch.zeros_like(mu), torch.ones_like(std))
        z = nd.sample() * std + mu
        logits = self.decoder(xo_embed, z, xo_len, xp_len)
        return logits, z, mu, logvar

    def infer(self, xo, xo_len):
        xo_embed = self.embedding(xo.cuda())
        _, (hT, cT) = self.decoder.ose(xo_embed, xo_len)  # encode the original sentence
        completed_sentences = torch.zeros(len(xo_embed))
        sentences = []
        # At inference time, sample z directly from the standard normal prior.
        # (Note this uses embedding_dim for z, which assumes z_dim == embedding_dim.)
        mu = torch.zeros(len(xo), self.embedding_dim)
        sigma = torch.ones(len(xo), self.embedding_dim)
        nd = Normal(mu, sigma)
        z = nd.sample().cuda()
        out = hT[-1]
        steps = 0
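        # Greedily decode one word per step until every sentence in the batch
        # has produced EOS_TOKEN, or MAX_PARA_LENGTH steps have elapsed.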
        while not all(completed_sentences):
            real_inp = torch.cat((z, out), 1).unsqueeze(1)
            output, (hT, cT) = self.decoder.pse(real_inp,
                                                torch.tensor([1] * len(z)),
                                                h0=hT,
                                                c0=cT)
            out = hT[-1]
            probs = self.decoder.linear(out)

            topwords = [word_probs.topk(1)[1] for word_probs in probs]
            for j, result in enumerate(topwords):
                if int(result) == EOS_TOKEN:
                    completed_sentences[j] = 1

            sentences.append(topwords)
            steps += 1
            if steps == MAX_PARA_LENGTH:
                break

        return sentences
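
As a sanity check on the two VAE ingredients used above, here is a self-contained sketch of the reparameterization trick and the closed-form KL term on toy tensors (the shapes are illustrative only, not taken from the model):

import torch
from torch.distributions import Normal

batch, z_dim = 4, 8
mu = torch.randn(batch, z_dim)
logvar = torch.randn(batch, z_dim)

# Reparameterization: eps ~ N(0, 1), z = mu + eps * std, which keeps the
# sampling step differentiable with respect to mu and logvar.
std = torch.exp(0.5 * logvar)
eps = Normal(torch.zeros_like(mu), torch.ones_like(std)).sample()
z = mu + eps * std

# KL(N(mu, sigma^2) || N(0, 1)), summed over latent dims, batch-averaged.
kl_loss = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum(1).mean()
print(z.shape, kl_loss.item())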