Example #1
    def __init__(self):
        super(Seq2Seq, self).__init__()

        full_text, tokens = self.read_data()
        dataset = Seq2SeqDataset()
        self.dataloader = DataLoader(dataset=dataset,
                                     batch_size=1,
                                     shuffle=True)
        self.SOS = '_'
        self.char2index = {}
        self.char2index[self.SOS] = 0
        token_idx = {w: i for i, w in enumerate(tokens)}
        self.char2index.update(token_idx)
        self.index2char = {i: c for c, i in self.char2index.items()}

        use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda:0' if use_cuda else 'cpu')

        self.input_size = len(self.char2index)
        self.hidden_size = 64
        self.n_layers = 1
        self.learning_rate = 0.01
        self.n_epoch = 1000

        self.encoder = EncoderRNN(self.input_size, self.hidden_size,
                                  self.n_layers)
        self.decoder = DecoderRNN(self.hidden_size, len(tokens), self.n_layers)

        self.encoder = self.encoder.to(self.device)
        self.decoder = self.decoder.to(self.device)
Example #2
    def __init__(self,
                 sos_id,
                 eou_id,
                 src_vocab_size,
                 tgt_vocab_size,
                 hidden_size,
                 embed_size,
                 max_len,
                 beam_size=3,
                 enc_n_layers=2,
                 enc_dropout=0.2,
                 enc_bidirectional=True,
                 dec_n_layers=2,
                 dec_dropout=0.2,
                 dec_bidirectional=True,
                 teacher_forcing_ratio=0):
        super(Generator, self).__init__()

        self.sos_id = sos_id
        self.src_vocab_size = src_vocab_size
        self.tgt_vocab_size = tgt_vocab_size
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.max_len = max_len

        self.encoder = EncoderRNN(src_vocab_size, max_len - 1, hidden_size, 0,
                                  enc_dropout, enc_n_layers, True, 'gru',
                                  False, None)
        self.decoder = DecoderRNN(
            tgt_vocab_size, max_len - 1,
            hidden_size * 2 if dec_bidirectional else hidden_size, sos_id,
            eou_id, dec_n_layers, 'gru', dec_bidirectional, 0, dec_dropout,
            True)
        # self.beam_decoder = TopKDecoder(self.decoder, beam_size)
        self.seq2seq = Seq2seq(self.encoder, self.decoder)
Example #3
    def __init__(self):
        self.data = PrepareData()
        self.dataset = Seq2SeqDataset()
        self.data_loader = DataLoader(dataset=self.dataset,
                                      batch_size=1,
                                      shuffle=True)
        self.lang_1 = self.data.lang_1
        self.lang_2 = self.data.lang_2
        self.char2index = self.data.char2index
        self.index2char = self.data.index2char

        self.input_size = 100
        self.hidden_size = 64
        self.output_size = 100
        self.learning_rate = 0.01
        self.num_epoch = 500
        self.teacher_forcing = True
        self.use_cuda = torch.cuda.is_available()
        self.device = 'cuda:0' if self.use_cuda else 'cpu'

        self.encoder = EncoderRNN(input_size=self.input_size, hidden_size=self.hidden_size)
        self.decoder = DecoderRNN(output_size=self.output_size, hidden_size=self.hidden_size)
        self.attn_decoder = AttnDecoder(self.hidden_size, self.output_size)

        if self.use_cuda:
            self.encoder = self.encoder.to(self.device)
            self.decoder = self.decoder.to(self.device)

        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), lr=self.learning_rate)
        self.decoder_optimizer = torch.optim.Adam(self.decoder.parameters(), lr=self.learning_rate)

        self.loss_function = nn.NLLLoss()
Example #4
def create_model(sourceVocabClass, targetVocabClass):
    """Create translation model and initialize or load parameters in session."""
    # Prepare src char vocabulary and target vocabulary dataset
    max_len = 1000

    # Initialize model
    hidden_size = FLAGS.size
    bidirectional = False
    srcField = SourceField(use_vocab=True)
    tgtField = TargetField(use_vocab=True)

    srcField.vocab = sourceVocabClass
    tgtField.vocab = targetVocabClass

    srcField.set_specials(_SPECIALS)
    tgtField.set_specials(_SPECIALS)

    encoder = EncoderRNN(FLAGS.char_vocab_size,
                         max_len,
                         hidden_size,
                         n_layers=FLAGS.num_layers,
                         bidirectional=bidirectional,
                         variable_lengths=True)
    decoder = DecoderRNN(FLAGS.lang_vocab_size,
                         max_len,
                         hidden_size,
                         dropout_p=0.2,
                         use_attention=True,
                         bidirectional=bidirectional,
                         eos_id=tgtField.vocab.stoi[tgtField.eos_token],
                         sos_id=tgtField.vocab.stoi[tgtField.sos_token],
                         n_layers=FLAGS.num_layers)

    seq2seqModel = seq2seq(encoder, decoder)
    if torch.cuda.is_available():
        seq2seqModel.cuda()

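    # Initialize all model parameters uniformly in [-0.08, 0.08].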
    for param in seq2seqModel.parameters():
        param.data.uniform_(-0.08, 0.08)

    # Prepare loss
    weight = torch.ones(FLAGS.lang_vocab_size)
    # loss = NLLLoss(weight, tgtField.vocab.stoi[tgtField.pad_token])
    loss = Perplexity(weight, tgtField.vocab.stoi[tgtField.pad_token])

    return seq2seqModel, loss, srcField, tgtField
Example #5
class Generator(nn.Module):
    def __init__(self,
                 sos_id,
                 eou_id,
                 src_vocab_size,
                 tgt_vocab_size,
                 hidden_size,
                 embed_size,
                 max_len,
                 beam_size=3,
                 enc_n_layers=2,
                 enc_dropout=0.2,
                 enc_bidirectional=True,
                 dec_n_layers=2,
                 dec_dropout=0.2,
                 dec_bidirectional=True,
                 teacher_forcing_ratio=0):
        super(Generator, self).__init__()

        self.sos_id = sos_id
        self.src_vocab_size = src_vocab_size
        self.tgt_vocab_size = tgt_vocab_size
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.max_len = max_len

        self.encoder = EncoderRNN(src_vocab_size, max_len - 1, hidden_size, 0,
                                  enc_dropout, enc_n_layers, True, 'gru',
                                  False, None)
        self.decoder = DecoderRNN(
            tgt_vocab_size, max_len - 1,
            hidden_size * 2 if dec_bidirectional else hidden_size, sos_id,
            eou_id, dec_n_layers, 'gru', dec_bidirectional, 0, dec_dropout,
            True)
        # self.beam_decoder = TopKDecoder(self.decoder, beam_size)
        self.seq2seq = Seq2seq(self.encoder, self.decoder)

    def sample(self, src, tgt, TF=0.5):
        sentences, probabilities, hiddens = self.seq2seq(
            src, target_variable=tgt, teacher_forcing_ratio=TF, sample=True)
        #print("dgsh")
        return sentences, probabilities, hiddens

    def forward(self, src, tgt, hack=False):
        src = src.t()
        tgt = tgt.t()

        outputs, _, meta_data = self.seq2seq(
            src,
            target_variable=tgt,
            teacher_forcing_ratio=self.teacher_forcing_ratio)

        batch_size = outputs[0].size(0)
        start_tokens = torch.zeros(batch_size,
                                   self.tgt_vocab_size,
                                   device=outputs[0].device)
        start_tokens[:, self.sos_id] = 1

        outputs = [start_tokens] + outputs
        outputs = torch.stack(outputs)
        if hack:
            return outputs, meta_data
        return outputs

    # Note: DISCOUNT_FACTOR is 1 here, so the return is effectively undiscounted.
    def compute_reinforce_loss(self, rewards, probabilities):
        rewards = rewards.to(DEVICE)
        probabilities = probabilities.to(DEVICE)
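        # REINFORCE-style objective: each step's log-probability is weighted by
        # the reward-to-go R_k (sum of per-token rewards from step k onward),
        # with the discount factor applied once per step.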
        sentences_level_reward = torch.mean(rewards, 1)
        R_s_w = rewards

        sent_len = rewards.size(1)
        J = 0
        for k in range(sent_len):
            R_k = torch.sum(R_s_w[:, k:], 1)
            prob = probabilities[:, k]
            J = (J + R_k * prob) * DISCOUNT_FACTOR

        loss = -torch.mean(J)
        return loss

    def try_get_state_dicts(self,
                            directory='./',
                            prefix='generator_checkpoint',
                            postfix='.pth.tar'):
        files = os.listdir(directory)
        files = [f for f in files if f.startswith(prefix)]
        files = [f for f in files if f.endswith(postfix)]

        epoch_nums = []
        for file in files:
            number = file[len(prefix):-len(postfix)]
            try:
                epoch_nums.append(int(number))
            except ValueError:
                pass

        if len(epoch_nums) < 2:
            return None

        last_complete_epoch = sorted(epoch_nums)[-2]
        filename = prefix + str(last_complete_epoch) + postfix

        data = torch.load(filename)
        return data

    def train_generator_MLE_batch(self, context, reply, optimizer, pad_id):
        context = context.t()
        reply = reply.t()
        loss_func = torch.nn.NLLLoss(ignore_index=pad_id)  # TO DEVICE?
        output = self.forward(context, reply)
        pred_dist = output[1:].view(-1, self.tgt_vocab_size)
        tgt_tokens = reply[1:].contiguous().view(-1)
        loss = loss_func(pred_dist, tgt_tokens)

        # Backpropagate loss
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.parameters(),
                                 10)  # might be something to check
        optimizer.step()

    def train_generator_MLE(self, optimizer, data_loader, vld_data_loader,
                            epochs, device):

        pad_id = TEXT.vocab.stoi['<pad>']

        loss_func = torch.nn.NLLLoss(ignore_index=pad_id)
        loss_func.to(device)

        start_epoch = 0
        # saved_data = try_get_state_dicts()
        # if saved_data is not None:
        #     start_epoch = saved_data['epoch']
        #     self.load_state_dict(saved_data['state_dict'])
        #     optimizer.load_state_dict(saved_data['optimizer'])

        loss_per_epoch = []
        for epoch in range(start_epoch, epochs):
            print('epoch %d : ' % (epoch + 1))

            total_loss = 0
            losses = []
            for (iters, result) in enumerate(data_loader):
                optimizer.zero_grad()
                context = result.text
                reply = result.headlines
                context = context.to(device)
                reply = reply.to(device)

                output = self.forward(context, reply)

                #print(output.size())
                # Compute loss
                pred_dist = output[1:].view(-1, self.tgt_vocab_size)
                tgt_tokens = reply[1:].contiguous().view(-1)

                loss = loss_func(pred_dist, tgt_tokens)

                # Backpropagate loss
                loss.backward()
                clip_grad_norm_(self.parameters(), 10)
                optimizer.step()
                total_loss += loss.item()
                losses.append(loss)

                #print(total_loss)

                # Print updates
                if iters % 25 == 0 and iters != 0:
                    print('[Epoch {} iter {}] loss: {}'.format(
                        epoch, iters, total_loss / 25))
                    total_loss = 0
                    torch.save(
                        {
                            'epoch': epoch + 1,
                            'state_dict': self.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'loss': losses,
                        }, 'generator_checkpoint12{}.pth.tar'.format(epoch))

            loss_per_epoch.append(total_loss)
        torch.save(loss_per_epoch, "generator_final_loss.pth.tar")
        return losses

    def monte_carlo(self, dis, context, reply, hiddens, num_samples):
        """
        Samples the network using a batch of source input sequence. Passes these inputs
        through the decoder and instead of taking the top1 (like in forward), sample
        using the distribution over the vocabulary
        Inputs: start of sequence, maximum sample sequence length and num of samples
        Outputs: samples
        samples: num_samples x max_seq_length (a sampled sequence in each row)
        Inputs: dialogue context (and maximum sample sequence length
        Outputs: samples
            - samples: batch_size x reply_length x num_samples x max_seq_length"""

        # Initialize sample
        batch_size = reply.size(0)
        vocab_size = self.decoder.output_size
        # samples_prob = torch.zeros(batch_size, self.max_len)
        encoder_output, _ = self.encoder(context)
        rewards = torch.zeros(reply.size(1), num_samples, batch_size)
        function = F.log_softmax
        reply = reply.to(DEVICE)
        #print(reply.size())
        for t in range(reply.size(1)):
            # Hidden state from the original generated sequence up to step t
            for n in range(num_samples):
                samples = reply.clone()
                hidden = hiddens[t].to(DEVICE)
                output = reply[:, t].to(DEVICE)
                # samples_prob[:,0] = torch.ones(output.size())
                # Pass through decoder and sample from resulting vocab distribution
                for next_t in range(t + 1, samples.size(1)):
                    decoder_output, hidden, step_attn = self.decoder.forward_step(
                        output.reshape(-1, 1).long(),
                        hidden,
                        encoder_output,
                        function=function)
                    # Sample token for entire batch from predicted vocab distribution
                    decoder_output = decoder_output.reshape(
                        batch_size, self.tgt_vocab_size).detach()
                    batch_token_sample = torch.multinomial(
                        torch.exp(decoder_output), 1).view(-1)
                    # prob = torch.exp(decoder_output)[np.arange(batch_size), batch_token_sample]
                    # samples_prob[:, next_t] = prob
                    samples[:, next_t] = batch_token_sample
                    output = batch_token_sample
                reward = dis.batchClassify(
                    samples.long().to(DEVICE),
                    context.long().to(DEVICE)).detach()  ## FIX CONTENT

                rewards[t, n, :] = reward

        reward_per_word = torch.mean(rewards, dim=1).permute(1, 0)
        return reward_per_word
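
The sample, monte_carlo and compute_reinforce_loss methods above are meant to be combined into a SeqGAN-style policy-gradient update: sampled replies are scored token by token through Monte Carlo rollouts and a discriminator, and the resulting rewards weight the sampled log-probabilities. A minimal sketch of one such update step follows. It is an illustration under assumptions, not code from this example: the pg_step helper, the optimizer, and the batch shapes are hypothetical; gen.sample is assumed to return the generated replies as a (batch, length) tensor together with per-step log-probabilities and hidden states; and dis is assumed to expose the batchClassify method used inside monte_carlo.

import torch

def pg_step(gen, dis, context, reply, optimizer, num_rollouts=3):
    # Hypothetical helper: one policy-gradient step for the Generator above.
    # Sample replies, keeping per-step log-probabilities and hidden states.
    sentences, probabilities, hiddens = gen.sample(context, reply)
    # Estimate a reward for every generated token via Monte Carlo rollouts
    # scored by the discriminator.
    rewards = gen.monte_carlo(dis, context, sentences, hiddens,
                              num_samples=num_rollouts)
    # REINFORCE loss: reward-to-go weighted negative log-likelihood.
    loss = gen.compute_reinforce_loss(rewards.detach(), probabilities)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(gen.parameters(), 10)
    optimizer.step()
    return loss.item()
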
Example #6
class Translate():
    def __init__(self):
        self.data = PrepareData()
        self.dataset = Seq2SeqDataset()
        self.data_loader = DataLoader(dataset=self.dataset,
                                      batch_size=1,
                                      shuffle=True)
        self.lang_1 = self.data.lang_1
        self.lang_2 = self.data.lang_2
        self.char2index = self.data.char2index
        self.index2char = self.data.index2char

        self.input_size = 100
        self.hidden_size = 64
        self.output_size = 100
        self.learning_rate = 0.01
        self.num_epoch = 500
        self.teacher_forcing = True
        self.use_cuda = torch.cuda.is_available()
        self.device = 'cuda:0' if self.use_cuda else 'cpu'

        self.encoder = EncoderRNN(input_size=self.input_size, hidden_size=self.hidden_size)
        self.decoder = DecoderRNN(output_size=self.output_size, hidden_size=self.hidden_size)
        self.attn_decoder = AttnDecoder(self.hidden_size, self.output_size)

        if self.use_cuda:
            self.encoder = self.encoder.to(self.device)
            self.decoder = self.decoder.to(self.device)

        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), lr=self.learning_rate)
        self.decoder_optimizer = torch.optim.Adam(self.decoder.parameters(), lr=self.learning_rate)

        self.loss_function = nn.NLLLoss()

    def create_variable(self, tensor):
        return Variable(tensor.to(self.device))

    def convert2ind(self, sent):
        inds = [self.char2index[w] for w in sent]
        return self.create_variable(torch.LongTensor([inds]))

    # train on single sentence -- return loss + decoder_outputs
    def step(self, input_sent, target_tensor):

        input_tensor = self.convert2ind(list(input_sent[0]))
        target_tensor = self.convert2ind(list(target_tensor[0]))
        target_tensor = target_tensor.squeeze(0)
        clip = 5.0
        loss = 0

        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        target_length = target_tensor.size()[0]

        encoder_hidden = self.encoder(input_tensor)

        decoder_input = self.create_variable(torch.LongTensor([[SOS]])).to(self.device)
        decoder_hidden = encoder_hidden

        decoder_outputs = []

        # scheduled sampling
        for i in range(target_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            decoder_output = decoder_output.squeeze(0)
            model_output = torch.max(decoder_output, 1)[1].unsqueeze(0)
            decoder_input = model_output if random.random() > 0.5 else target_tensor[i].unsqueeze(0).unsqueeze(0)
            target = target_tensor[i].unsqueeze(0)
            loss += self.loss_function(decoder_output, target)
            decoder_outputs.append(model_output.cpu().numpy()[0][0])

        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), clip)

        self.decoder_optimizer.step()
        self.encoder_optimizer.step()

        return loss.item(), decoder_outputs

    def train(self):
        for j in range(self.num_epoch):
            for i, (x_data, y_data) in enumerate(self.data_loader):
                loss, result = self.step(x_data, y_data)
            _, x = self.step(['Be fair.'], ['Sois équitable !'])

            print('Epoch' , j)
            print(x)
            print('-------> ', self.convert2ind('Sois équitable !').cpu().numpy()[0])
Example #7
    def __init__(self, word_size_encoder, emb_dim, hidden_size,
                 word_size_decoder, vector_size):
        super(seq2seq, self).__init__()
        self.encoder = Encoder(vector_size, word_size_encoder, emb_dim,
                               hidden_size)
        self.decoder = DecoderRNN(hidden_size, word_size_decoder, vector_size)
Example #8
class Seq2Seq(nn.Module):
    def __init__(self):
        super(Seq2Seq, self).__init__()

        full_text, tokens = self.read_data()
        dataset = Seq2SeqDataset()
        self.dataloader = DataLoader(dataset=dataset,
                                     batch_size=1,
                                     shuffle=True)
        self.SOS = '_'
        self.char2index = {}
        self.char2index[self.SOS] = 0
        token_idx = {w: i for i, w in enumerate(tokens)}
        self.char2index.update(token_idx)
        self.index2char = {i: c for c, i in self.char2index.items()}

        use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda:0' if use_cuda else 'cpu')

        self.input_size = len(self.char2index)
        self.hidden_size = 64
        self.n_layers = 1
        self.learning_rate = 0.01
        self.n_epoch = 1000

        self.encoder = EncoderRNN(self.input_size, self.hidden_size,
                                  self.n_layers)
        self.decoder = DecoderRNN(self.hidden_size, len(tokens), self.n_layers)

        self.encoder = self.encoder.to(self.device)
        self.decoder = self.decoder.to(self.device)

    def forward(self, input, target):
        input_tensor = self.convert_to_tensor(input)
        target_tensor = self.convert_to_tensor(target)
        decoder_input = target_tensor

        # Full teacher forcing: the decoder is fed the target sequence
        # (scheduled sampling is not used here).
        encoder_hidden = self.encoder(input_tensor)
        decoder_output, (hidden_state,
                         cell_state) = self.decoder(decoder_input,
                                                    encoder_hidden)

        return decoder_output

    # set configuration for single sentence
    def convert_to_tensor(self, sent):
        sent = self.SOS + sent[0] + self.SOS
        seq = self.create_variable(
            torch.LongTensor([[self.char2index[w] for w in list(sent)]]))
        return seq

    def create_variable(self, seq):
        return seq.to(self.device)

    def read_data(self):
        with open('data/data.txt', 'rt', encoding='utf-8') as file_reader:
            full_text = file_reader.read()
        tokens = sorted(set(full_text))
        return full_text, tokens

    def train(self, model, optimizer, loss_func):
        epoch_loss = 0
        for i, (x_data, y_data) in enumerate(self.dataloader):
            optimizer.zero_grad()
            output = model(x_data, y_data)
            output = output.squeeze(0)
            target_tensor = self.convert_to_tensor(y_data).squeeze(0)
            loss = loss_func(output, target_tensor)
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
        return epoch_loss

    def epoch_train(self):
        model = Seq2Seq()
        optimizer = torch.optim.SGD(model.parameters(), lr=self.learning_rate)
        loss_func = NLLLoss()

        for i in range(self.n_epoch):
            loss = self.train(model, optimizer, loss_func)
            self.eval('G')
            print(loss)

    def eval(self, sent):
        seq = self.convert_to_tensor(sent)
        decoder_hidden = self.encoder(seq)
        decoder_input = torch.LongTensor([[self.char2index[self.SOS]]
                                          ]).to(self.device)
        output_seq, (hidden_state,
                     cell_state) = self.decoder(decoder_input, decoder_hidden)

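        # Greedy decoding for a fixed number of steps: the argmax token is fed
        # back in as the next decoder input.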
        for i in range(13):
            output_seq, (hidden_state,
                         cell_state) = self.decoder(decoder_input,
                                                    decoder_hidden)
            decoder_input = torch.max(output_seq, 2)[1]
            print(decoder_input)
        return output_seq, hidden_state, cell_state
Example #9
embedding = nn.Embedding(num_embeddings=vocab_size,
                         embedding_dim=embedding_dim,
                         padding_idx=0)

encoder = EncoderRNN(embedding=embedding,
                     vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     hidden_dim=hidden_dim,
                     device=device,
                     num_layer=num_layer,
                     dropout=dropout).to(device)

decoder = DecoderRNN(embedding=embedding,
                     attn_model=attn,
                     vocab_size=vocab_size,
                     embedding_dim=embedding_dim,
                     hidden_dim=hidden_dim,
                     device=device,
                     num_layer=num_layer,
                     dropout=dropout).to(device)

textCNN = textCNN(seq_length=MAX_LEN,
                  embedding_size=embedding_dim,
                  num_labels=2,
                  embedding=embedding,
                  filter_sizes=[3, 4, 5],
                  drop_out_rate=0.5,
                  num_feature_maps=500).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)

decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
Example #10
    def __init__(self, embedder, num_highways, num_lstm, hidden_size, dropout,
                 max_len, vocab_size):
        """
        Create a BiDAF model. The input is a tensor of indices, or a tuple of
        such tensors. The outputs are start and end log-probability vectors.

        Overall model, assuming no batches:
            1. The passage and question are encoded independently using a
            shared set of embeddings, highway layers and a bidirectional
            LSTM layer.
            2. The passage and question are combined into an attention matrix.
            3. The attention matrix is applied to the question, to get a
            question-in-passage matrix, with one row per token in the passage.
            4. The same attention matrix is applied to the passage, to get a
            passage-in-question vector, which is then tiled to get one row per
            token in the passage.
            5. The resulting matrices are concatenated with the passage, and
            with their product with the passage.
            6. This is then passed through a stack of bidirectional LSTMs.
            7. The result is projected down to 1 dimension, to get the start
            logits.
            8. This is also used as attention, and combined with the LSTM stack
            inputs and outputs, and passed through a final LSTM.
            9. The output is again concatenated with step 5, and projected down
            to 1 dimension, to get the end logits.
            10. A log-softmax is then applied to the logits.

        Parameters:
            :param: embedder (Module): the module in that will embed the
            passage and question
            :param: num_highways (int): the number of highway layers to use
            :param: num_lstm (int): the number of LSTM layers to use
            :param: hidden_size (int): The size of the hidden layers;
            effectively doubled for bidirectional LSTMs
            :param: dropout (float, >=0 or None): dropout probability
            :param: max_len (int): maximum decoded sequence length for the
            decoder
            :param: vocab_size (int): output vocabulary size for the decoder

        Variables/sub-modules:
            embedder: the embeddings
            highways: the highway layers
            seq_encoder: the LSTM used after the highway layers to get the
            passage and question representations
            attention: the module used to get the attention matrix
            extractor: the stack of LSTM following attention
            end_encoder: the final LSTM, used to get the end logits
            start_projection: the projection to get the start logits
            end_projection: the projection to get the end logits

        Input:
            :param: passage: features sent to embedder for the passages
            :param: p_lengths: vector containing the passage lengths
            :param: question: features sent to embedder for the questions
            :param: q_lengths: vector containing the question lengths

        Return:
            :return: start_log_probs: (batch, passage_size) float tensor
            containing the log probabilities of the start points
            :return: end_log_probs: (batch, passage_size) float tensor
            containing the log probabilities of the end points
        """
        super(BidafModel, self).__init__()
        self.hidden_size = hidden_size
        self.bidir_hidden_size = 2 * hidden_size
        self.embedder = embedder
        self.highways = Highways(embedder.output_dim, num_highways)
        self.seq_encoder = nn.LSTM(embedder.output_dim,
                                   hidden_size,
                                   num_layers=1,
                                   batch_first=True,
                                   dropout=0,
                                   bidirectional=True)
        self.extractor = nn.LSTM(4 * self.bidir_hidden_size,
                                 hidden_size,
                                 num_layers=num_lstm,
                                 batch_first=True,
                                 dropout=0,
                                 bidirectional=True)
        self.end_encoder = nn.LSTM(7 * self.bidir_hidden_size,
                                   hidden_size,
                                   num_layers=1,
                                   batch_first=True,
                                   dropout=dropout,
                                   bidirectional=True)
        self.attention = AttentionMatrix(self.bidir_hidden_size)

        # Decoder

        self.decoder = DecoderRNN(vocab_size,
                                  max_len,
                                  4 * self.bidir_hidden_size +
                                  self.bidir_hidden_size,
                                  C.SOS_INDEX,
                                  C.EOS_INDEX,
                                  n_layers=1,
                                  rnn_cell='lstm',
                                  bidirectional=True,
                                  input_dropout_p=0.2,
                                  dropout_p=0.2,
                                  use_attention=True)

        # Second hidden_size is for extractor.
        self.start_projection = nn.Linear(
            4 * self.bidir_hidden_size + self.bidir_hidden_size, 1)
        self.end_projection = nn.Linear(
            4 * self.bidir_hidden_size + self.bidir_hidden_size, 1)

        if dropout and dropout > 0:
            self.dropout = nn.Dropout(p=dropout)
        else:
            self.dropout = lambda nop: nop
        return
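
As a sanity check on the widths used in this constructor (steps 3-5 of the docstring above), the standalone sketch below shows why the extractor consumes 4 * bidir_hidden_size features per passage token. The tensor names and sizes are made up for illustration; this is not the model's actual forward pass, whose attention combination lives in the AttentionMatrix module.

import torch

batch, p_len, hidden_size = 2, 7, 16
bidir_hidden_size = 2 * hidden_size                          # width of a BiLSTM output

passage_enc = torch.randn(batch, p_len, bidir_hidden_size)   # step 1: encoded passage
q_in_p = torch.randn(batch, p_len, bidir_hidden_size)        # step 3: question-in-passage
p_in_q = torch.randn(batch, p_len, bidir_hidden_size)        # step 4: tiled passage-in-question

# Step 5: concatenate the passage encoding with the attended matrix and the
# element-wise products of both attended matrices with the passage.
merged = torch.cat([passage_enc,
                    q_in_p,
                    passage_enc * q_in_p,
                    passage_enc * p_in_q], dim=-1)
assert merged.size(-1) == 4 * bidir_hidden_size              # the extractor's input size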