Example #1
    def encode_table_header(self, tables):
        # input, ids of table word: (batch_size, max_column_num)
        # encode_output: (max_head_word_num, batch_size, max_column_num, hidden_size)

        # (batch_size, max_column_num, max_head_word_num)
        # table_head_mask: (batch_size, max_column_num)
        # table_col_lens: (batch_size, max_column_num)
        table_head_wids, table_col_lens = WikiSqlBatch.get_table_header_input_tensor(tables,
                                                                                     self.vocab.source,
                                                                                     cuda=self.args.cuda)

        # hack: pack_padded_sequence requires every sequence length to be at least 1
        for tbl in table_col_lens:
            for i in range(len(tbl)):
                if tbl[i] == 0: tbl[i] = 1

        table_header_mask = WikiSqlBatch.get_table_header_mask(tables, cuda=self.args.cuda)

        # (batch_size, max_column_num, max_head_word_num, word_embed_size)
        table_head_word_embeds = self.src_embed(table_head_wids.view(-1)).view(list(table_head_wids.size()) + [self.src_embed.embedding_dim])

        batch_size = table_head_word_embeds.size(0)
        max_col_num = table_head_word_embeds.size(1)
        max_col_word_num = table_head_word_embeds.size(2)

        # (batch_size * max_column_num, max_head_word_num, word_embed_size)
        table_head_word_embeds_flatten = table_head_word_embeds.view(batch_size * max_col_num,
                                                                     max_col_word_num, -1)
        table_col_lens_flatten = list(chain.from_iterable(table_col_lens))
        sorted_col_ids = sorted(list(range(len(table_col_lens_flatten))), key=lambda x: -table_col_lens_flatten[x])
        sorted_table_col_lens_flatten = [table_col_lens_flatten[i] for i in sorted_col_ids]

        col_old_pos_map = [-1] * len(sorted_col_ids)
        for new_pos, old_pos in enumerate(sorted_col_ids):
            col_old_pos_map[old_pos] = new_pos

        # (batch_size * max_column_num, max_head_word_num, word_embed_size)
        sorted_table_head_word_embeds = table_head_word_embeds_flatten[sorted_col_ids, :, :]

        packed_table_head_word_embeds = pack_padded_sequence(sorted_table_head_word_embeds, sorted_table_col_lens_flatten, batch_first=True)

        # column_word_encodings: (batch_size * max_column_num, max_head_word_num, hidden_size)
        column_word_encodings, (table_header_encoding, table_head_last_cell) = self.table_header_lstm(packed_table_head_word_embeds)
        column_word_encodings, _ = pad_packed_sequence(column_word_encodings, batch_first=True)

        # (batch_size * max_column_num, max_head_word_num, hidden_size)
        column_word_encodings = column_word_encodings[col_old_pos_map]
        # (batch_size, max_column_num, max_head_word_num, hidden_size)
        column_word_encodings = column_word_encodings.view(batch_size, max_col_num, max_col_word_num, -1)

        # (batch_size * max_column_num, hidden_size * 2)
        table_header_encoding = torch.cat([table_header_encoding[0], table_header_encoding[1]], -1)
        # table_head_last_cell = torch.cat([table_head_last_cell[0], table_head_last_cell[1]], -1)

        # restore the original column order, as above
        table_header_encoding = table_header_encoding[col_old_pos_map]
        # (batch_size, max_column_num, hidden_size)
        table_header_encoding = table_header_encoding.view(batch_size, max_col_num, -1)

        return column_word_encodings, table_header_encoding, table_header_mask
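
A minimal, self-contained sketch of the sort -> pack -> encode -> unpack -> unsort pattern used above (all sizes and variable names are illustrative, not taken from the repository):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

lstm = nn.LSTM(input_size=8, hidden_size=16, batch_first=True)
embeds = torch.randn(6, 5, 8)                      # (batch * max_column_num, max_head_word_num, embed)
lengths = torch.tensor([5, 3, 1, 4, 2, 1])

sorted_lens, sort_idx = lengths.sort(descending=True)
_, unsort_idx = sort_idx.sort()                    # permutation that restores the original order

packed = pack_padded_sequence(embeds[sort_idx], sorted_lens, batch_first=True)
word_encodings, (last_state, last_cell) = lstm(packed)
word_encodings, _ = pad_packed_sequence(word_encodings, batch_first=True)

word_encodings = word_encodings[unsort_idx]        # (batch * max_column_num, max_head_word_num, hidden)
header_encoding = last_state[-1][unsort_idx]       # (batch * max_column_num, hidden)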
Example #2
    def forward(self, sentences, sentences_len, hidden):
        sentences_len = sentences_len.cpu().data.numpy()

        idx = np.argsort(sentences_len).tolist()[::-1]
        ridx = np.argsort(idx).tolist()

        sentences = sentences[idx, :]
        sentences_len = sentences_len[idx, ]
        embedding = self.embedding(sentences)
        embedding = nn.Dropout(0.1)(embedding)

        packed_embedding = pack_padded_sequence(embedding, sentences_len, batch_first=True)
        packed_rnn_feature, hidden = self.rnn_feature(packed_embedding, hidden)
        sentence_feature, _ = pad_packed_sequence(packed_rnn_feature, batch_first=True)

        idx = Variable(LongTensor(sentences_len - 1))
        idx = idx.view(-1, 1).expand(sentence_feature.size(0), sentence_feature.size(2)).unsqueeze(1)
        if sentence_feature.is_cuda:
            idx = idx.cuda()
        sentence_feature = sentence_feature.gather(1, idx).squeeze()

        sentence_feature = sentence_feature[ridx, :]
        sentences_len = sentences_len[ridx, ]

        logits = self.classifier(sentence_feature)
        pred = F.log_softmax(logits, dim=1)  # normalize over classes, not over the batch
        return pred
    def forward(self, question,length):
        length = list(length.data.cpu().numpy())
        
        
        emb = self.drop(self.encoder(question))
        emb = self.tanh(emb)

        hidden = self.init_hidden(len(length))
        seqs = trnn.pack_padded_sequence(emb, length, batch_first=True)

        seqs, hidden = self.rnn(seqs, hidden)
        h,_ = trnn.pad_packed_sequence(seqs, batch_first=True)

        #attention
        weights = self.softmax(self.att2(torch.transpose(h, 1, 2)).squeeze(1)).unsqueeze(-1)
        weights = weights.expand_as(h)
        
        bilstmout = torch.sum(h*weights, 1).squeeze(1)


        #bilstmout = torch.cat([hidden[0][0],hidden[0][1]],-1)


        fc1fea = self.fc1(bilstmout)

        return fc1fea
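
The attention above does not mask padded timesteps before the softmax. A hedged sketch of how padding could be excluded, assuming a boolean mask built from the true lengths (the function and variable names are hypothetical):

import torch

def masked_attention(scores: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    # scores: (batch, seq_len) raw attention scores; lengths: (batch,) true lengths
    max_len = scores.size(1)
    mask = torch.arange(max_len, device=scores.device)[None, :] < lengths[:, None]
    scores = scores.masked_fill(~mask, float('-inf'))
    return torch.softmax(scores, dim=1)

weights = masked_attention(torch.randn(4, 7), torch.tensor([7, 5, 3, 2]))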
Example #4
    def postprocess_sequence(self, X):
        """Embed (variable-length) sequences

        Parameters
        ----------
        X : list
            List of input sequences

        Returns
        -------
        fX : numpy array
            Batch of sequence embeddings.
        """

        lengths = torch.tensor([len(x) for x in X])
        sorted_lengths, sort = torch.sort(lengths, descending=True)
        _, unsort = torch.sort(sort)

        sequences = [torch.tensor(X[i],
                                  dtype=torch.float32,
                                  device=self.device) for i in sort]
        padded = pad_sequence(sequences, batch_first=True, padding_value=0)
        packed = pack_padded_sequence(padded, sorted_lengths,
                                      batch_first=True)

        cpu = torch.device('cpu')
        fX = self.model(packed).detach().to(cpu).numpy()
        return fX[unsort]
 def forward(self, input, *args):
     args, seq_lengths = args[:-1], args[-1]
     input = rnn_utils.pack_padded_sequence(input, seq_lengths, self.batch_first)
     rets = self.model(input, *args)
     ret, rets = rets[0], rets[1:]
     ret, _ = rnn_utils.pad_packed_sequence(ret, self.batch_first)
     return tuple([ret] + list(rets))
    def test_forward_pulls_out_correct_tensor_with_unsorted_batches(self):
        lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
        encoder = PytorchSeq2VecWrapper(lstm)

        input_tensor = torch.rand([5, 7, 3])
        input_tensor[0, 3:, :] = 0
        input_tensor[1, 4:, :] = 0
        input_tensor[2, 2:, :] = 0
        input_tensor[3, 6:, :] = 0
        mask = torch.ones(5, 7)
        mask[0, 3:] = 0
        mask[1, 4:] = 0
        mask[2, 2:] = 0
        mask[3, 6:] = 0

        sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
        sorted_inputs, sorted_sequence_lengths, restoration_indices, _ = sort_batch_by_length(input_tensor,
                                                                                              sequence_lengths)
        packed_sequence = pack_padded_sequence(sorted_inputs,
                                               sorted_sequence_lengths.tolist(),
                                               batch_first=True)
        _, state = lstm(packed_sequence)
        # Transpose output state, extract the last forward and backward states and
        # reshape to be of dimension (batch_size, 2 * hidden_size).
        sorted_transposed_state = state[0].transpose(0, 1).index_select(0, restoration_indices)
        reshaped_state = sorted_transposed_state[:, -2:, :].contiguous()
        explicitly_concatenated_state = torch.cat([reshaped_state[:, 0, :].squeeze(1),
                                                   reshaped_state[:, 1, :].squeeze(1)], -1)
        encoder_output = encoder(input_tensor, mask)
        assert_almost_equal(encoder_output.data.numpy(), explicitly_concatenated_state.data.numpy())
    def test_forward_pulls_out_correct_tensor_for_unsorted_batches(self):
        lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
        encoder = PytorchSeq2SeqWrapper(lstm)
        input_tensor = torch.rand([5, 7, 3])
        input_tensor[0, 3:, :] = 0
        input_tensor[1, 4:, :] = 0
        input_tensor[2, 2:, :] = 0
        input_tensor[3, 6:, :] = 0
        mask = torch.ones(5, 7)
        mask[0, 3:] = 0
        mask[1, 4:] = 0
        mask[2, 2:] = 0
        mask[3, 6:] = 0

        sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
        sorted_inputs, sorted_sequence_lengths, restoration_indices, _ = sort_batch_by_length(input_tensor,
                                                                                              sequence_lengths)
        packed_sequence = pack_padded_sequence(sorted_inputs,
                                               sorted_sequence_lengths.data.tolist(),
                                               batch_first=True)
        lstm_output, _ = lstm(packed_sequence)
        encoder_output = encoder(input_tensor, mask)
        lstm_tensor, _ = pad_packed_sequence(lstm_output, batch_first=True)
        assert_almost_equal(encoder_output.data.numpy(),
                            lstm_tensor.index_select(0, restoration_indices).data.numpy())
Example #8
    def forward(self, xs):
        bsz = len(xs)

        # embed input tokens
        xes = F.dropout(self.lt(xs), p=self.dropout, training=self.training)
        x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data]
        xes_packed = pack_padded_sequence(xes, x_lens, batch_first=True)

        zeros = self.zeros(xs)
        if zeros.size(1) != bsz:
            zeros.resize_(self.layers * self.dirs, bsz, self.hsz).fill_(0)
        h0 = Variable(zeros, requires_grad=False)

        if type(self.rnn) == nn.LSTM:
            encoder_output_packed, hidden = self.rnn(xes_packed, (h0, h0))
            # take elementwise max between forward and backward hidden states
            hidden = (hidden[0].view(-1, self.dirs, bsz, self.hsz).max(1)[0],
                      hidden[1].view(-1, self.dirs, bsz, self.hsz).max(1)[0])
        else:
            encoder_output_packed, hidden = self.rnn(xes_packed, h0)

            # take elementwise max between forward and backward hidden states
            hidden = hidden.view(-1, self.dirs, bsz, self.hsz).max(1)[0]
        encoder_output, _ = pad_packed_sequence(encoder_output_packed,
                                                batch_first=True)
        return encoder_output, hidden
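
The direction-wise max relies on how PyTorch lays out the final hidden state of a bidirectional RNN. A standalone sketch of just that reshape, with illustrative sizes:

import torch

# h_n has shape (num_layers * num_dirs, batch, hsz) and can be viewed as
# (num_layers, num_dirs, batch, hsz) before reducing over the direction axis.
num_layers, dirs, bsz, hsz = 2, 2, 3, 5
h_n = torch.randn(num_layers * dirs, bsz, hsz)
merged = h_n.view(num_layers, dirs, bsz, hsz).max(1)[0]   # (num_layers, batch, hsz)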
Example #9
    def encode(self, src_sents_var: torch.Tensor, src_sent_lens: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Use a GRU/LSTM to encode source sentences into hidden states

        Args:
            src_sents_var: a tensor of shape (src_sent_len, batch_size) containing word ids of the source sentences
            src_sent_lens: a list of lengths of the input source sentences

        Returns:
            src_encodings: hidden states of tokens in source sentences, this could be a variable
                with shape (batch_size, source_sentence_length, encoding_dim), or in other formats
            decoder_init_state: decoder GRU/LSTM's initial state, computed from source encodings
        """

        # (src_sent_len, batch_size, embed_size)
        src_word_embeds = self.src_embed(src_sents_var)
        packed_src_embed = pack_padded_sequence(src_word_embeds, src_sent_lens)

        # src_encodings: (src_sent_len, batch_size, hidden_size * 2)
        src_encodings, (last_state, last_cell) = self.encoder_lstm(packed_src_embed)
        src_encodings, _ = pad_packed_sequence(src_encodings)

        # (batch_size, src_sent_len, hidden_size * 2)
        src_encodings = src_encodings.permute(1, 0, 2)

        dec_init_cell = self.decoder_cell_init(torch.cat([last_cell[0], last_cell[1]], dim=1))
        dec_init_state = torch.tanh(dec_init_cell)

        return src_encodings, (dec_init_state, dec_init_cell)
Example #10
    def forward(self, embs, lengths):
        """
        This is the heart of the model. This function, defines how the data
        passes through the network.
        Args:
            embs (): word embeddings
            lengths (): the lengths of each sentence

        Returns: the logits for each class

        """
        # pack the batch
        packed = pack_padded_sequence(embs, list(lengths.data),
                                      batch_first=True)

        out_packed, _ = self.rnn(packed)

        # unpack output - no need if we are going to use only the last outputs
        outputs, _ = pad_packed_sequence(out_packed, batch_first=True)

        # get the outputs from the last *non-masked* timestep for each sentence
        last_outputs = self.last_timestep(outputs, lengths,
                                          self.rnn.bidirectional)

        # apply dropout to the outputs of the RNN
        last_outputs = self.drop_rnn(last_outputs)

        return outputs, last_outputs
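
The last_timestep helper is not shown in this example. A hypothetical sketch of such a helper for the unidirectional case, using gather to pull out the output at position length - 1 for each sequence (a bidirectional version would additionally take the first backward output):

import torch

def last_timestep(outputs: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    # outputs: (batch, seq_len, hidden), lengths: (batch,)
    idx = (lengths - 1).view(-1, 1, 1).expand(-1, 1, outputs.size(2))
    return outputs.gather(1, idx).squeeze(1)       # (batch, hidden)

out = last_timestep(torch.randn(4, 7, 6), torch.tensor([7, 5, 3, 2]))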
Example #11
    def forward(self, vocab):
        with torch.no_grad():
            batch_shape = vocab['sentence'].shape
            s_embedding = self.embedding(vocab['sentence'].cuda())
            a_embedding = self.embedding(vocab['aspect'].cuda())

            packed_s = pack_padded_sequence(s_embedding, vocab['sent_len'], batch_first=True)

        out_s, (h_s, c1) = self.lstm_s(packed_s) # packed output
        out_a, (h_a, c2) = self.lstm_a(a_embedding)

        with torch.no_grad():
            unpacked_out_s, _ = pad_packed_sequence(out_s, batch_first=True)

        # Pair-wise interaction matrix
        I_matrix = torch.bmm(unpacked_out_s, out_a.permute(0,2,1))

        # Column-wise softmax
        a2s_attn = F.softmax(I_matrix, dim=1)

        # Row-wise softmax => Column-wise average => aspect attention
        s2a_attn = F.softmax(I_matrix, dim=2)
        a_attn = torch.mean(s2a_attn, dim=1)

        # Final sentence attn => weighted sum of each individual a2s_attn
        s_attn = torch.bmm(a2s_attn, a_attn.unsqueeze(-1))

        final_rep = torch.bmm(unpacked_out_s.permute(0,2,1), s_attn).squeeze(-1)
        pred = self.fc(final_rep)
        return pred
Example #12
    def encode(self, src_sents_var, src_sents_len):
        """Encode the input natural language utterance

        Args:
            src_sents_var: a variable of shape (src_sent_len, batch_size), representing word ids of the input
            src_sents_len: a list of lengths of input source sentences, sorted by descending order

        Returns:
            src_encodings: source encodings of shape (batch_size, src_sent_len, hidden_size * 2)
            last_state, last_cell: the last hidden state and cell state of the encoder,
                                   of shape (batch_size, hidden_size)
        """

        # (tgt_query_len, batch_size, embed_size)
        # apply word dropout
        if self.training and self.args.word_dropout:
            mask = Variable(self.new_tensor(src_sents_var.size()).fill_(1. - self.args.word_dropout).bernoulli().long())
            src_sents_var = src_sents_var * mask + (1 - mask) * self.vocab.source.unk_id

        src_token_embed = self.src_embed(src_sents_var)
        packed_src_token_embed = pack_padded_sequence(src_token_embed, src_sents_len)

        # src_encodings: (tgt_query_len, batch_size, hidden_size)
        src_encodings, (last_state, last_cell) = self.encoder_lstm(packed_src_token_embed)
        src_encodings, _ = pad_packed_sequence(src_encodings)
        # src_encodings: (batch_size, tgt_query_len, hidden_size)
        src_encodings = src_encodings.permute(1, 0, 2)

        # (batch_size, hidden_size * 2)
        last_state = torch.cat([last_state[0], last_state[1]], 1)
        last_cell = torch.cat([last_cell[0], last_cell[1]], 1)

        return src_encodings, (last_state, last_cell)
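
A standalone sketch of the word-dropout trick used at the top of this example (sizes and the unk id are illustrative): each token id is kept with probability 1 - word_dropout and otherwise replaced by the unknown-word id.

import torch

word_dropout, unk_id = 0.3, 1
src = torch.randint(2, 100, (5, 4))                           # (src_sent_len, batch)
keep = torch.full(src.shape, 1.0 - word_dropout).bernoulli().long()
src = src * keep + (1 - keep) * unk_id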
Example #13
    def forward(self, x):
        """Receives a Variable of indices (n_timesteps, n_samples) and
        returns their recurrent representations."""
        # sort the batch by decreasing length of sequences
        # oidxs: to recover original order
        # sidxs: idxs to sort the batch
        # slens: lengths in sorted order for pack_padded_sequence()
        oidxs, sidxs, slens, mask = sort_batch(x)

        # Fetch embeddings for the sorted batch
        embs = self.emb(x[:, sidxs])

        if self.dropout_emb > 0:
            embs = self.do_emb(embs)

        # Pack and encode
        packed_emb = pack_padded_sequence(embs, slens)
        packed_hs, h_t = self.enc(packed_emb)

        # Get hidden states and revert the order
        hs = pad_packed_sequence(packed_hs)[0][:, oidxs]

        if self.dropout_ctx > 0:
            hs = self.do_ctx(hs)

        return hs, mask
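
sort_batch is not shown here. A hypothetical sketch of what such a helper could compute for a time-major batch of token ids, assuming index 0 marks padding (the real helper in the repository may differ):

import torch

def sort_batch(x: torch.Tensor):
    # x: (n_timesteps, n_samples) of token ids, padded with 0
    mask = (x != 0).long()                        # (n_timesteps, n_samples)
    lengths = mask.sum(0)                         # (n_samples,)
    slens, sidxs = lengths.sort(descending=True)
    _, oidxs = sidxs.sort()                       # indices that restore the original order
    return oidxs, sidxs, slens, mask

oidxs, sidxs, slens, mask = sort_batch(torch.tensor([[4, 2, 7], [5, 3, 0], [6, 0, 0]]))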
    def test_forward_pulls_out_correct_tensor_with_sequence_lengths(self):
        lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
        encoder = PytorchSeq2VecWrapper(lstm)

        tensor = torch.rand([5, 7, 3])
        tensor[1, 6:, :] = 0
        tensor[2, 4:, :] = 0
        tensor[3, 2:, :] = 0
        tensor[4, 1:, :] = 0
        mask = torch.ones(5, 7)
        mask[1, 6:] = 0
        mask[2, 4:] = 0
        mask[3, 2:] = 0
        mask[4, 1:] = 0

        input_tensor = Variable(tensor)
        mask = Variable(mask)
        sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
        packed_sequence = pack_padded_sequence(input_tensor, list(sequence_lengths.data), batch_first=True)
        _, state = lstm(packed_sequence)
        # Transpose output state, extract the last forward and backward states and
        # reshape to be of dimension (batch_size, 2 * hidden_size).
        reshaped_state = state[0].transpose(0, 1)[:, -2:, :].contiguous()
        explicitly_concatenated_state = torch.cat([reshaped_state[:, 0, :].squeeze(1),
                                                   reshaped_state[:, 1, :].squeeze(1)], -1)
        encoder_output = encoder(input_tensor, mask)
        assert_almost_equal(encoder_output.data.numpy(), explicitly_concatenated_state.data.numpy())
Example #15
    def test_augmented_lstm_computes_same_function_as_pytorch_lstm(self):
        augmented_lstm = AugmentedLstm(10, 11)
        pytorch_lstm = LSTM(10, 11, num_layers=1, batch_first=True)
        # Initialize all weights to be == 1.
        initializer = InitializerApplicator([(".*", lambda tensor: torch.nn.init.constant_(tensor, 1.))])
        initializer(augmented_lstm)
        initializer(pytorch_lstm)

        initial_state = torch.zeros([1, 5, 11])
        initial_memory = torch.zeros([1, 5, 11])

        # Use bigger numbers to avoid floating point instability.
        sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor * 5., self.sequence_lengths)
        lstm_input = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)

        augmented_output, augmented_state = augmented_lstm(lstm_input, (initial_state, initial_memory))
        pytorch_output, pytorch_state = pytorch_lstm(lstm_input, (initial_state, initial_memory))
        pytorch_output_sequence, _ = pad_packed_sequence(pytorch_output, batch_first=True)
        augmented_output_sequence, _ = pad_packed_sequence(augmented_output, batch_first=True)

        numpy.testing.assert_array_almost_equal(pytorch_output_sequence.data.numpy(),
                                                augmented_output_sequence.data.numpy(), decimal=4)
        numpy.testing.assert_array_almost_equal(pytorch_state[0].data.numpy(),
                                                augmented_state[0].data.numpy(), decimal=4)
        numpy.testing.assert_array_almost_equal(pytorch_state[1].data.numpy(),
                                                augmented_state[1].data.numpy(), decimal=4)
Example #16
 def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
     """
         input:
             word_inputs: (batch_size, sent_len)
             word_seq_lengths: list of batch_size, (batch_size,1)
             char_inputs: (batch_size*sent_len, word_length)
             char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
             char_seq_recover: variable which records the char order information, used to recover char order
         output:
             Variable(batch_size, sent_len, hidden_dim)
     """
     word_represent = self.wordrep(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover)
     ## word_embs (batch_size, seq_len, embed_size)
     if self.word_feature_extractor == "CNN":
         word_in = F.tanh(self.word2cnn(word_represent)).transpose(2,1).contiguous()
         for idx in range(self.cnn_layer):
             if idx == 0:
                 cnn_feature = F.relu(self.cnn_list[idx](word_in))
             else:
                 cnn_feature = F.relu(self.cnn_list[idx](cnn_feature))
             cnn_feature = self.cnn_drop_list[idx](cnn_feature)
             cnn_feature = self.cnn_batchnorm_list[idx](cnn_feature)
         feature_out = cnn_feature.transpose(2,1).contiguous()
     else:
         packed_words = pack_padded_sequence(word_represent, word_seq_lengths.cpu().numpy(), True)
         hidden = None
         lstm_out, hidden = self.lstm(packed_words, hidden)
         lstm_out, _ = pad_packed_sequence(lstm_out)
          ## lstm_out (seq_len, batch_size, hidden_size)
         feature_out = self.droplstm(lstm_out.transpose(1,0))
     ## feature_out (batch_size, seq_len, hidden_size)
     outputs = self.hidden2tag(feature_out)
     return outputs
Example #17
    def encode(self, indices, lengths, noise):
        embeddings = self.embedding(indices)
        packed_embeddings = pack_padded_sequence(input=embeddings,
                                                 lengths=lengths,
                                                 batch_first=True)

        # Encode
        packed_output, state = self.encoder(packed_embeddings)

        hidden, cell = state
        # batch_size x nhidden
        hidden = hidden[-1]  # get hidden state of last layer of encoder

        # normalize to unit ball (l2 norm of 1) - p=2, dim=1
        norms = torch.norm(hidden, 2, 1)
        
        # For older versions of PyTorch use:
        hidden = torch.div(hidden, norms.expand_as(hidden))
        # For newest version of PyTorch (as of 8/25) use this:
        # hidden = torch.div(hidden, norms.unsqueeze(1).expand_as(hidden))

        if noise and self.noise_radius > 0:
            gauss_noise = torch.normal(means=torch.zeros(hidden.size()),
                                       std=self.noise_radius)
            hidden = hidden + to_gpu(self.gpu, Variable(gauss_noise))

        return hidden
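
On current PyTorch, the unit-ball normalization and the Gaussian noise above can be written more compactly; a sketch, not the original code:

import torch
import torch.nn.functional as F

hidden = torch.randn(4, 32)
hidden = F.normalize(hidden, p=2, dim=1)                    # project onto the unit ball
noise_radius = 0.2
hidden = hidden + torch.randn_like(hidden) * noise_radius   # additive Gaussian noise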
Example #18
 def forward(self, features, captions, lengths):
     """Decode image feature vectors and generates captions."""
     embeddings = self.embed(captions)
     embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
     packed = pack_padded_sequence(embeddings, lengths, batch_first=True) 
     hiddens, _ = self.lstm(packed)
     outputs = self.linear(hiddens[0])
     return outputs
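
A hedged sketch of the companion step usually paired with such a decoder: the caption targets are packed with the same lengths so they align one-to-one with the packed outputs returned above (names and sizes are illustrative):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

captions = torch.randint(1, 100, (4, 12))       # (batch, max_caption_len)
lengths = [12, 9, 7, 5]                         # sorted, longest first
targets = pack_padded_sequence(captions, lengths, batch_first=True).data
# criterion(outputs, targets) then compares (sum(lengths), vocab_size) logits
# against sum(lengths) target ids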
Example #19
    def _run_rnns(self, inputs, structures, lengths):
        '''
            Run desired rnns
        '''
        for rnn, structure in zip(self.rnns, [structures]):
            if isinstance(rnn, ChildSumTreeLSTM):
                h_all, h_last = rnn(inputs, structure)
            elif isinstance(rnn, LSTM):
                packed = pack_padded_sequence(inputs, list(lengths.data), batch_first=True)
                h_all, (h_last, c_last) = rnn(packed)
                h_all, _ = pad_packed_sequence(h_all, batch_first=True)
            elif isinstance(rnn, GRU):
                packed = pack_padded_sequence(inputs, list(lengths.data), batch_first=True)
                h_all, h_last = rnn(packed)
                h_all, _ = pad_packed_sequence(h_all, batch_first=True)
            inputs = h_all.squeeze()

        return h_all, h_last
Example #20
    def test_variable_length_sequences_run_backward_return_correctly_padded_outputs(self):
        sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor, self.sequence_lengths)
        tensor = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)
        lstm = AugmentedLstm(10, 11, go_forward=False)
        output, _ = lstm(tensor)
        output_sequence, _ = pad_packed_sequence(output, batch_first=True)

        numpy.testing.assert_array_equal(output_sequence.data[1, 6:, :].numpy(), 0.0)
        numpy.testing.assert_array_equal(output_sequence.data[2, 4:, :].numpy(), 0.0)
        numpy.testing.assert_array_equal(output_sequence.data[3, 3:, :].numpy(), 0.0)
        numpy.testing.assert_array_equal(output_sequence.data[4, 2:, :].numpy(), 0.0)
Example #21
    def forward(self, input, seq_lens):
        embedded = self.embedding(input)

        packed = pack_padded_sequence(embedded, seq_lens, batch_first=True)
        output, hidden = self.lstm(packed)

        h, _ = pad_packed_sequence(output, batch_first=True)  # h dim = B x t_k x n
        h = h.contiguous()
        max_h, _ = h.max(dim=1)

        return h, hidden, max_h
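
Because pad_packed_sequence fills padding with zeros, the max above could be won by a padded position when every real activation is negative. One possible fix, sketched here on a fresh toy batch rather than the example's own tensors, is to unpack with a very negative padding_value just for the pooling step:

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

lstm = nn.LSTM(8, 16, batch_first=True)
packed = pack_padded_sequence(torch.randn(3, 6, 8), [6, 4, 2], batch_first=True)
output, hidden = lstm(packed)

h_for_pool, _ = pad_packed_sequence(output, batch_first=True, padding_value=float('-inf'))
max_h, _ = h_for_pool.max(dim=1)                 # padded steps can never win the max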
 def test_stacked_bidirectional_lstm_completes_forward_pass(self):
     input_tensor = torch.rand(4, 5, 3)
     input_tensor[1, 4:, :] = 0.
     input_tensor[2, 2:, :] = 0.
     input_tensor[3, 1:, :] = 0.
     input_tensor = pack_padded_sequence(input_tensor, [5, 4, 2, 1], batch_first=True)
     lstm = StackedBidirectionalLstm(3, 7, 3)
     output, _ = lstm(input_tensor)
     output_sequence, _ = pad_packed_sequence(output, batch_first=True)
     numpy.testing.assert_array_equal(output_sequence.data[1, 4:, :].numpy(), 0.0)
     numpy.testing.assert_array_equal(output_sequence.data[2, 2:, :].numpy(), 0.0)
     numpy.testing.assert_array_equal(output_sequence.data[3, 1:, :].numpy(), 0.0)
    def forward(self, inputs: PackedSequence,  # pylint: disable=arguments-differ
                # pylint: disable=unused-argument
                initial_state: torch.Tensor = None) -> Tuple[PackedSequence, torch.Tensor]:
        """
        Parameters
        ----------
        inputs : ``PackedSequence``, required.
            A batch first ``PackedSequence`` to run the stacked LSTM over.
        initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
            Currently, this is ignored.

        Returns
        -------
        output_sequence : ``PackedSequence``
            The encoded sequence of shape (batch_size, sequence_length, hidden_size)
        final_states: ``torch.Tensor``
            The per-layer final (state, memory) states of the LSTM, each with shape
            (num_layers, batch_size, hidden_size).
        """
        inputs, lengths = pad_packed_sequence(inputs, batch_first=True)

        # Kernel takes sequence length first tensors.
        inputs = inputs.transpose(0, 1)

        sequence_length, batch_size, _ = inputs.size()
        accumulator_shape = [self.num_layers, sequence_length + 1, batch_size, self.hidden_size]
        state_accumulator = Variable(inputs.data.new(*accumulator_shape).zero_(), requires_grad=False)
        memory_accumulator = Variable(inputs.data.new(*accumulator_shape).zero_(), requires_grad=False)

        dropout_weights = inputs.data.new().resize_(self.num_layers, batch_size, self.hidden_size).fill_(1.0)
        if self.training:
            # Normalize by 1 - dropout_prob to preserve the output statistics of the layer.
            dropout_weights.bernoulli_(1 - self.recurrent_dropout_probability)\
                .div_((1 - self.recurrent_dropout_probability))

        dropout_weights = Variable(dropout_weights, requires_grad=False)
        gates = Variable(inputs.data.new().resize_(self.num_layers,
                                                   sequence_length,
                                                   batch_size, 6 * self.hidden_size))

        lengths_variable = Variable(torch.IntTensor(lengths))
        implementation = _AlternatingHighwayLSTMFunction(self.input_size,
                                                         self.hidden_size,
                                                         num_layers=self.num_layers,
                                                         train=self.training)
        output, _ = implementation(inputs, self.weight, self.bias, state_accumulator,
                                   memory_accumulator, dropout_weights, lengths_variable, gates)

        # TODO(Mark): Also return the state here by using index_select with the lengths so we can use
        # it as a Seq2VecEncoder.
        output = output.transpose(0, 1)
        output = pack_padded_sequence(output, lengths, batch_first=True)
        return output, None
 def test_stacked_alternating_lstm_completes_forward_pass(self):
     input_tensor = torch.autograd.Variable(torch.rand(4, 5, 3))
     input_tensor[1, 4:, :] = 0.
     input_tensor[2, 2:, :] = 0.
     input_tensor[3, 1:, :] = 0.
     input_tensor = pack_padded_sequence(input_tensor, [5, 4, 2, 1], batch_first=True)
     lstm = StackedAlternatingLstm(3, 7, 3)
     output, _ = lstm(input_tensor)
     output_sequence, _ = pad_packed_sequence(output, batch_first=True)
     numpy.testing.assert_array_equal(output_sequence.data[1, 4:, :].numpy(), 0.0)
     numpy.testing.assert_array_equal(output_sequence.data[2, 2:, :].numpy(), 0.0)
     numpy.testing.assert_array_equal(output_sequence.data[3, 1:, :].numpy(), 0.0)
Example #25
    def forward(self, x, lens):
        B, T = x.shape
        # look up the word embeddings
        x = self.embed(x)
        x = self.drop(x)

        x = pack_padded_sequence(x, lens, True)
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x, True)
        x = self.drop(x)

        return self.out(x)
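
Many of the examples above sort the batch by hand because older PyTorch required it. Since PyTorch 1.1, enforce_sorted=False lets pack_padded_sequence handle the sorting and the later unsorting internally; a sketch with illustrative sizes:

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

lstm = nn.LSTM(8, 16, batch_first=True)
x = torch.randn(4, 7, 8)
lens = torch.tensor([3, 7, 2, 5])                # not sorted
packed = pack_padded_sequence(x, lens, batch_first=True, enforce_sorted=False)
out, _ = lstm(packed)
out, _ = pad_packed_sequence(out, batch_first=True)   # rows come back in the original order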
    def sort_and_run_forward(self, module, inputs, mask):
        batch_size = mask.size(0)
        sequence_lengths = mask.long().sum(-1)
        sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices = sort_batch_by_length(inputs,
                                                                                                            sequence_lengths)

        packed_sequence_input = pack_padded_sequence(sorted_inputs[:, :, :],
                                                     sorted_sequence_lengths[:].data.tolist(),
                                                     batch_first=True)

        module_output, final_states = module(packed_sequence_input, None)
        return module_output, final_states, restoration_indices
Example #27
    def forward(self,  # pylint: disable=arguments-differ
                inputs: PackedSequence,
                initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
        """
        Parameters
        ----------
        inputs : ``PackedSequence``, required.
            A batch first ``PackedSequence`` to run the stacked LSTM over.
        initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
            A tuple (state, memory) representing the initial hidden state and memory
            of the LSTM. Each tensor has shape (1, batch_size, output_dimension * 2).

        Returns
        -------
        output_sequence : PackedSequence
            The encoded sequence of shape (batch_size, sequence_length, hidden_size * 2)
        final_states: torch.Tensor
            The per-layer final (state, memory) states of the LSTM, each with shape
            (num_layers, batch_size, hidden_size * 2).
        """
        if not initial_state:
            hidden_states = [None] * len(self.lstm_layers)
        elif initial_state[0].size()[0] != len(self.lstm_layers):
            raise ConfigurationError("Initial states were passed to forward() but the number of "
                                     "initial states does not match the number of layers.")
        else:
            hidden_states = list(zip(initial_state[0].split(1, 0),
                                     initial_state[1].split(1, 0)))

        output_sequence = inputs
        final_h = []
        final_c = []
        for i, state in enumerate(hidden_states):
            forward_layer = getattr(self, 'forward_layer_{}'.format(i))
            backward_layer = getattr(self, 'backward_layer_{}'.format(i))
            # The state is duplicated to mirror the Pytorch API for LSTMs.
            forward_output, final_forward_state = forward_layer(output_sequence, state)
            backward_output, final_backward_state = backward_layer(output_sequence, state)

            forward_output, lengths = pad_packed_sequence(forward_output, batch_first=True)
            backward_output, _ = pad_packed_sequence(backward_output, batch_first=True)

            output_sequence = torch.cat([forward_output, backward_output], -1)
            output_sequence = pack_padded_sequence(output_sequence, lengths, batch_first=True)

            final_h.extend([final_forward_state[0], final_backward_state[0]])
            final_c.extend([final_forward_state[1], final_backward_state[1]])

        final_h = torch.cat(final_h, dim=0)
        final_c = torch.cat(final_c, dim=0)
        final_state_tuple = (final_h, final_c)
        return output_sequence, final_state_tuple
Example #28
 def forward(self, input, hidden, no_pack=False):
     emb = self.drop(self.encoder(input))
     # if eval, pack padded sequence (we don't pack during training because
     # we have no padding in our input samples)
     if not self.training and not no_pack:
         emb_lens = [x for x in torch.sum((input > 0).int(), dim=0).data]
         emb_packed = pack_padded_sequence(emb, emb_lens, batch_first=False)
         packed_output, hidden = self.rnn(emb_packed, hidden)
         output, _ = pad_packed_sequence(packed_output, batch_first=False)
     else:
         output, hidden = self.rnn(emb, hidden)
     output = self.drop(output)
     decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
     return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
    def _sort_and_run_forward(self, module, inputs, mask):
        batch_size = mask.size(0)
        num_valid = torch.sum(mask[:, 0]).int().item() # just in case some instances may be of zero length.

        sequence_lengths = mask.long().sum(-1)
        sorted_inputs, sorted_sequence_lengths, restoration, sorting = sort_batch_by_length(inputs, sequence_lengths)

        packed_sequence_input = pack_padded_sequence(sorted_inputs[:num_valid, :, :],
                                                     sorted_sequence_lengths[:num_valid].data.tolist(),
                                                     batch_first=True)
        initial_states = self._get_initial_states(batch_size, num_valid, sorting)
        module_output, final_states = module(packed_sequence_input, initial_states)

        return module_output, final_states, restoration
Example #30
 def forward(self, img_feats, captions, lengths):
     embeddings = self.embedding(captions)
      # img_feats is a 2048-dim vector; the fully connected layer maps it to a 256-dim vector, the same size as a word embedding
     img_feats = self.fc(img_feats).unsqueeze(0)
      # treat img_feats as the embedding of the first word
     embeddings = t.cat([img_feats, embeddings], 0)
     # PackedSequence
     packed_embeddings = pack_padded_sequence(embeddings, lengths)
     outputs, state = self.rnn(packed_embeddings)
      # the LSTM outputs are used as features to predict the index of the next word
      # since the input is a PackedSequence, the output is also a PackedSequence;
      # its first element is the data Variable and the second is batch_sizes,
      # i.e. the number of sequences still active at each timestep
     pred = self.classifier(outputs[0])
     return pred, state
Example #31
    def _reorder_by_length_and_package(
            self, pred_out, molecule_graphs, mol_to_graph_idx,
            num_initial_reactants_in_mol_to_graph_idx, original_syn_trees):

        # Now package everything together!
        # --> compute the order
        PAD_VALUE = settings.PAD_VALUE
        seq_sizes = np.array([p.sequence_choices.size for p in pred_out])
        array_containing_original_indcs = np.argsort(
            seq_sizes)[::-1]  # we need to put the largest sequence first.
        seq_size_with_padding = seq_sizes.max()
        new_seq_sizes = seq_sizes[array_containing_original_indcs]

        # --> the input DoGs can just be stacked together.
        dags_for_input = [
            pred_out[i].dag_for_input for i in array_containing_original_indcs
        ]
        dags_for_input = dags_for_input[0].concatenate(dags_for_input)
        dags_for_input.inplace_from_np_to_torch()

        # We also record where the root molecule for each of these lives in the graphs (last position of each DAG)
        final_molecule_indcs = np.cumsum(
            np.bincount(dags_for_input.node_to_graph_id,
                        minlength=dags_for_input.max_num_graphs)) - 1
        final_molecule_indcs = torch.tensor(final_molecule_indcs,
                                            dtype=settings.TORCH_INT)

        # --> the other parts we want to put in a PackedSequence, or to have a clearer indication of where they live
        # inside the other parts
        construction_dags: typing.List[grphs.DirectedGraphAsAdjList] = []
        dags_id_at_index = []
        sequence_action_kinds = []
        sequence_choices = []
        number_edge_choices_including_both_stops = len(mol_to_graph_idx) + 2
        edge_masks = np.full(
            (len(array_containing_original_indcs), seq_size_with_padding,
             number_edge_choices_including_both_stops), PAD_VALUE)
        reactant_masks = np.full(
            (len(array_containing_original_indcs), seq_size_with_padding,
             num_initial_reactants_in_mol_to_graph_idx), PAD_VALUE)

        for new_idx, old_idx in enumerate(array_containing_original_indcs):
            p = pred_out[old_idx]
            p_seq_size = p.sequence_choices.size

            # --> We will deal with the construction DAGS first. The empty DAG is the same for all of them (this is
            # at index 0 and should be None so can be shared)
            num_construction_dags_seen_so_far = len(construction_dags)
            construction_dags.extend(
                filter(lambda x: x is not None, p.dags_at_construction_stages))

            dags_id_with_correct_shift = p.dags_id_at_index
            dags_id_with_correct_shift[dags_id_with_correct_shift !=
                                       0] += num_construction_dags_seen_so_far
            # ^ The DAG ID will get shifted when we concatenate them but not the index for 0 as this is the empty DAG.
            assert dags_id_with_correct_shift.size == p_seq_size
            dags_id_at_index.append(
                np.pad(dags_id_with_correct_shift,
                       (0, seq_size_with_padding - p_seq_size),
                       'constant',
                       constant_values=PAD_VALUE))

            assert p.sequence_action_kinds.size == p_seq_size
            new_seq_action_kinds = np.pad(
                p.sequence_action_kinds,
                (0, seq_size_with_padding - p_seq_size),
                'constant',
                constant_values=PAD_VALUE)
            sequence_action_kinds.append(new_seq_action_kinds)

            assert p.sequence_choices.size == p_seq_size
            sequence_choices.append(
                np.pad(p.sequence_choices,
                       (0, seq_size_with_padding - p_seq_size),
                       'constant',
                       constant_values=PAD_VALUE))

            if p.sequence_masks_for_edge_steps is not None:
                edge_masks[
                    new_idx, new_seq_action_kinds ==
                    EDGE_ADD_STEP_VAL, :] = p.sequence_masks_for_edge_steps
            else:
                assert (new_seq_action_kinds == EDGE_ADD_STEP_VAL).sum() == 0

            reactant_masks[
                new_idx, new_seq_action_kinds ==
                REACTANT_CHOOSE_STEP_VAL, :] = p.sequence_masks_for_reactant_steps

        # --> Put the construction DAGs together
        construction_dags: grphs.DirectedGraphAsAdjList = construction_dags[
            0].concatenate(construction_dags)
        construction_dags.inplace_from_np_to_torch()

        # --> Pack the padded sequences together
        seq_sizes = torch.tensor(new_seq_sizes)

        dags_id_at_index = torch.tensor(np.stack(dags_id_at_index),
                                        dtype=settings.TORCH_INT)
        dags_id_at_index = rnn.pack_padded_sequence(dags_id_at_index,
                                                    seq_sizes,
                                                    batch_first=True)

        sequence_action_kinds = torch.tensor(np.stack(sequence_action_kinds),
                                             dtype=settings.TORCH_INT)
        sequence_action_kinds = rnn.pack_padded_sequence(sequence_action_kinds,
                                                         seq_sizes,
                                                         batch_first=True)

        sequence_choices = torch.tensor(np.stack(sequence_choices),
                                        dtype=settings.TORCH_INT)
        sequence_choices = rnn.pack_padded_sequence(sequence_choices,
                                                    seq_sizes,
                                                    batch_first=True)

        edge_masks = rnn.pack_padded_sequence(torch.tensor(edge_masks,
                                                           dtype=torch.bool),
                                              seq_sizes,
                                              batch_first=True)

        reactant_masks = rnn.pack_padded_sequence(torch.tensor(
            reactant_masks, dtype=torch.bool),
                                                  seq_sizes,
                                                  batch_first=True)

        original_syn_trees = [
            original_syn_trees[i] for i in array_containing_original_indcs
        ]

        return PredOutBatch(dags_for_inputs=dags_for_input, dags_at_construction_stages=construction_dags,
                            molecular_graphs=molecule_graphs, dags_id_at_index=dags_id_at_index,
                            sequence_action_kinds=sequence_action_kinds, sequence_choices=sequence_choices,
                            sequence_masks_for_edge_steps=edge_masks,
                            sequence_masks_for_reactant_steps=reactant_masks,
                            mol_to_graph_idx=mol_to_graph_idx,
                            num_that_are_initial_reactants=num_initial_reactants_in_mol_to_graph_idx,
                            final_molecule_indcs=final_molecule_indcs, syn_trees=original_syn_trees), \
               array_containing_original_indcs
Example #32
    def forward(self,
                sentence,
                p_sentence,
                pos_tags,
                lengths,
                target_idx_in,
                region_marks,
                local_roles_voc,
                frames,
                local_roles_mask,
                sent_pred_lemmas_idx,
                dep_tags,
                dep_heads,
                targets,
                P_identification,
                all_l_ids,
                Predicate_link,
                Predicate_Labels_nd,
                Predicate_Labels,
                Chars_in,
                unlabeled_sentence=None,
                p_unlabeled_sentence=None,
                unlabeled_lengths=None,
                test=False,
                cvt_train=False):

        if cvt_train:
            CVT_SRL_Loss = self.CVT_train(unlabeled_sentence,
                                          p_unlabeled_sentence,
                                          unlabeled_lengths)
            return CVT_SRL_Loss
        """
        perform predicate Identification first
        """
        Predicate_Identification_Space = self.Predicate_Id(
            sentence, p_sentence, lengths)

        # +++++++++++++++++++++++
        wrong_l_nums = 0.0
        all_l_nums = 0.0

        right_noNull_predict = 0.0
        noNull_predict = 0.0
        noNUll_truth = 0.0

        PI_labels = np.argmax(
            Predicate_Identification_Space.cpu().data.numpy(), axis=1)
        for predict_l, gold_l in zip(
                PI_labels,
                P_identification.cpu().view(-1).data.numpy()):
            if predict_l > 1 and gold_l != 0:
                noNull_predict += 1
            if gold_l != 0:
                all_l_nums += 1
                if gold_l != 1:
                    noNUll_truth += 1
                    if gold_l == predict_l:
                        right_noNull_predict += 1
            if predict_l != gold_l and gold_l != 0:
                wrong_l_nums += 1
        """
        construct DEP_input
        """

        embeds_SRL = self.word_embeddings_SRL(sentence)
        embeds_SRL = embeds_SRL.view(self.batch_size, len(sentence[0]),
                                     self.word_emb_dim)
        region_marks = self.region_embeddings(region_marks).view(
            self.batch_size, len(sentence[0]), 16)

        embeds_Memory = F.tanh(torch.matmul(embeds_SRL, self.Memory_space))

        SRL_hidden_states = torch.cat((embeds_Memory, region_marks), 2)
        SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states)

        # SRL layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            SRL_hidden_states, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort,
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_SRL_base = self.BiLSTM_1(
            embeds_sort, self.hidden_SRL_base)
        # it seems the hidden states are already batch-first, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states_0 = hidden_states[unsort_idx]

        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            hidden_states_0, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort.cpu().numpy(),
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_SRL = self.BiLSTM_SRL(
            embeds_sort, self.hidden_SRL)
        # it seems the hidden states are already batch-first, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states = hidden_states[unsort_idx]
        hidden_states = self.hidden_state_dropout_SRL(hidden_states)

        # B * H
        hidden_states_3 = hidden_states
        hidden_states_word = self.dropout_1(
            F.relu(self.Non_Predicate_Proj(hidden_states_3)))
        predicate_embeds = hidden_states_3[
            np.arange(0,
                      hidden_states_3.size()[0]), target_idx_in]
        hidden_states_predicate = self.dropout_2(
            F.relu(self.Predicate_Proj(predicate_embeds)))

        bias_one = torch.ones(
            (self.batch_size, len(sentence[0]), 1)).to(device)
        hidden_states_word = torch.cat(
            (hidden_states_word, Variable(bias_one)), 2)

        bias_one = torch.ones((self.batch_size, 1)).to(device)
        hidden_states_predicate = torch.cat(
            (hidden_states_predicate, Variable(bias_one)), 1)

        left_part = torch.mm(
            hidden_states_word.view(self.batch_size * len(sentence[0]), -1),
            self.W_R)
        left_part = left_part.view(self.batch_size,
                                   len(sentence[0]) * self.tagset_size, -1)
        hidden_states_predicate = hidden_states_predicate.view(
            self.batch_size, -1, 1)
        tag_space = torch.bmm(left_part, hidden_states_predicate).view(
            len(sentence[0]) * self.batch_size, -1)
        SRLprobs = F.softmax(tag_space, dim=1)

        loss_function = nn.CrossEntropyLoss(ignore_index=0)

        SRLloss = loss_function(tag_space, targets.view(-1))

        IDloss = loss_function(Predicate_Identification_Space,
                               P_identification.view(-1))

        return SRLloss, IDloss, IDloss, SRLprobs, wrong_l_nums, all_l_nums, wrong_l_nums, all_l_nums,  \
               right_noNull_predict, noNull_predict, noNUll_truth,\
               right_noNull_predict, noNull_predict, noNUll_truth
Example #33
    def forward(self, src_tokens, src_lengths):
        if self.left_pad:
            # convert left-padding to right-padding
            src_tokens = utils.convert_padding_direction(
                src_tokens, self.padding_idx, left_to_right=True
            )
        if self.word_dropout_module is not None:
            src_tokens = self.word_dropout_module(src_tokens)
        bsz, seqlen = src_tokens.size()

        # embed tokens
        x = self.embed_tokens(src_tokens)
        x = F.dropout(x, p=self.dropout_in, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        embedded_words = x

        # Generate packed seq to deal with varying source seq length
        packed_input, batch_sizes = pack_padded_sequence(x, src_lengths)
        final_hiddens, final_cells = [], []
        next_hiddens = []
        for i, rnn_layer in enumerate(self.layers):
            current_hidden_size = (
                self.hidden_dim // 2 if rnn_layer.is_bidirectional else self.hidden_dim
            )

            if self.cell_type in ["lstm", "milstm", "layer_norm_lstm"]:
                prev_hidden = (
                    x.new(bsz, current_hidden_size).zero_(),
                    x.new(bsz, current_hidden_size).zero_(),
                )
            else:
                raise Exception(f"{self.cell_type} not implemented")

            hidden, current_output = rnn_layer.forward(
                packed_input, prev_hidden, batch_sizes
            )
            next_hiddens.append(hidden)
            prev_hidden = next_hiddens[-1]

            if self.dropout_out != 0:
                current_output = F.dropout(
                    current_output, p=self.dropout_out, training=self.training
                )

            if self.residual_level is not None and i >= self.residual_level:
                packed_input = packed_input.clone() + current_output
            else:
                packed_input = current_output

        final_hiddens, final_cells = zip(*next_hiddens)
        # Reshape to [num_layer, batch_size, hidden_dim]
        final_hiddens = torch.cat(final_hiddens, dim=0).view(
            self.num_layers, *final_hiddens[0].size()
        )
        final_cells = torch.cat(final_cells, dim=0).view(
            self.num_layers, *final_cells[0].size()
        )

        #  [max_seqlen, batch_size, hidden_dim]
        unpacked_output, _ = pad_packed_sequence(
            PackedSequence(packed_input, batch_sizes), padding_value=self.padding_value
        )

        return (
            unpacked_output,
            final_hiddens,
            final_cells,
            src_lengths,
            src_tokens,
            embedded_words,
        )
Example #34
    def sort_and_run_forward(self,
                             module: Callable[[PackedSequence, Optional[RnnState]],
                                              Tuple[Union[PackedSequence, torch.Tensor], RnnState]],
                             inputs: torch.Tensor,
                             mask: torch.Tensor,
                             hidden_state: Optional[RnnState] = None):
        """
        This function exists because Pytorch RNNs require that their inputs be sorted
        before being passed as input. As all of our Seq2xxxEncoders use this functionality,
        it is provided in a base class. This method can be called on any module which
        takes as input a ``PackedSequence`` and some ``hidden_state``, which can either be a
        tuple of tensors or a tensor.
        As all of our Seq2xxxEncoders have different return types, we return `sorted`
        outputs from the module, which is called directly. Additionally, we return the
        indices into the batch dimension required to restore the tensor to its correct,
        unsorted order and the number of valid batch elements (i.e. the number of elements
        in the batch which are not completely masked). This un-sorting and re-padding
        of the module outputs is left to the subclasses because their outputs have different
        types and handling them smoothly here is difficult.
        Parameters
        ----------
        module : ``Callable[[PackedSequence, Optional[RnnState]],
                            Tuple[Union[PackedSequence, torch.Tensor], RnnState]]``, required.
            A function to run on the inputs. In most cases, this is a ``torch.nn.Module``.
        inputs : ``torch.Tensor``, required.
            A tensor of shape ``(batch_size, sequence_length, embedding_size)`` representing
            the inputs to the Encoder.
        mask : ``torch.Tensor``, required.
            A tensor of shape ``(batch_size, sequence_length)``, representing masked and
            non-masked elements of the sequence for each element in the batch.
        hidden_state : ``Optional[RnnState]``, (default = None).
            A single tensor of shape (num_layers, batch_size, hidden_size) representing the
            state of an RNN, or a tuple of
            tensors of shapes (num_layers, batch_size, hidden_size) and
            (num_layers, batch_size, memory_size), representing the hidden state and memory
            state of an LSTM-like RNN.
        Returns
        -------
        module_output : ``Union[torch.Tensor, PackedSequence]``.
            A Tensor or PackedSequence representing the output of the Pytorch Module.
            The batch size dimension will be equal to ``num_valid``, as sequences of zero
            length are clipped off before the module is called, as Pytorch cannot handle
            zero length sequences.
        final_states : ``Optional[RnnState]``
            A Tensor representing the hidden state of the Pytorch Module. This can either
            be a single tensor of shape (num_layers, num_valid, hidden_size), for instance in
            the case of a GRU, or a tuple of tensors, such as those required for an LSTM.
        restoration_indices : ``torch.LongTensor``
            A tensor of shape ``(batch_size,)``, describing the re-indexing required to transform
            the outputs back to their original batch order.
        """
        # In some circumstances you may have sequences of zero length. ``pack_padded_sequence``
        # requires all sequence lengths to be > 0, so remove sequences of zero length before
        # calling self._module, then fill with zeros.

        # First count how many sequences are empty.
        batch_size = mask.size(0)
        num_valid = torch.sum(mask[:, 0]).int().item()

        sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
        sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices =\
            sort_batch_by_length(inputs, sequence_lengths)
        # Now create a PackedSequence with only the non-empty, sorted sequences.
        packed_sequence_input = pack_padded_sequence(sorted_inputs[:num_valid, :, :],
                                                     sorted_sequence_lengths[:num_valid].data.tolist(),
                                                     batch_first=True)
        # Prepare the initial states.
        if not self.stateful:
            if hidden_state is None:
                initial_states = hidden_state
            elif isinstance(hidden_state, tuple):
                initial_states = [state.index_select(1, sorting_indices)[:, :num_valid, :]
                                  for state in hidden_state]
            else:
                initial_states = hidden_state.index_select(1, sorting_indices)[:, :num_valid, :]

        else:
            initial_states = self._get_initial_states(batch_size, num_valid, sorting_indices)
        # Actually call the module on the sorted PackedSequence.
        module_output, final_states = module(packed_sequence_input, initial_states)

        return module_output, final_states, restoration_indices
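# A minimal, self-contained sketch (illustrative names, not the method above) of the
# contract described in the docstring: sort by length, pack, run an RNN, then use the
# restoration indices to put the batch back in its original order.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

inputs = torch.randn(4, 7, 16)                     # (batch, seq_len, embed)
lengths = torch.tensor([3, 7, 2, 5])
lstm = nn.LSTM(16, 32, batch_first=True)

sorted_lengths, sorting_indices = lengths.sort(descending=True)
restoration_indices = sorting_indices.argsort()    # undoes the sort

packed = pack_padded_sequence(inputs[sorting_indices],
                              sorted_lengths.tolist(),
                              batch_first=True)
packed_out, (h_n, c_n) = lstm(packed)
padded_out, _ = pad_packed_sequence(packed_out, batch_first=True)

# restore the original batch order, as the caller is expected to do
padded_out = padded_out.index_select(0, restoration_indices)
h_n = h_n.index_select(1, restoration_indices)     # hidden state is (num_layers, batch, hidden)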
    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that 
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        ### YOUR CODE HERE (~ 8 Lines)
        ### TODO:
        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
        ###         that there is no initial hidden state or cell for the decoder.
        ###     2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
        ###     3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
        ###         - `init_decoder_hidden`:
        ###             `last_hidden` is a tensor of shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor of shape (b, 2*h).
        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###         - `init_decoder_cell`:
        ###             `last_cell` is a tensor of shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor of shape (b, 2*h).
        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###
        ### See the following docs, as you may need to use some of the following functions in your implementation:
        ###     Pack the padded sequence X before passing to the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
        ###     Pad the packed sequence, enc_hiddens, returned by the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Permute:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute
        X = self.model_embeddings.source(source_padded)
        X = pack_padded_sequence(input=X, lengths=source_lengths)

        enc_hiddens, (last_hidden, last_cell) = self.encoder(X)
        enc_hiddens = pad_packed_sequence(enc_hiddens)[0].permute(1, 0, 2)

        h = torch.cat([last_hidden[0], last_hidden[1]], dim=1)
        init_decoder_hidden = self.h_projection(h)

        c = torch.cat([last_cell[0], last_cell[1]], dim=1)
        init_decoder_cell = self.c_projection(c)

        dec_init_state = (init_decoder_hidden, init_decoder_cell)
        ### END YOUR CODE

        return enc_hiddens, dec_init_state
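# A small sketch (toy sizes) of step 3 above: the encoder's final forward/backward states
# of shape (2, b, h) are concatenated to (b, 2*h) and projected down to (b, h) to
# initialise the decoder. The projection layers here are stand-ins for h_projection /
# c_projection used above.
import torch
import torch.nn as nn

b, h = 4, 6
last_hidden = torch.randn(2, b, h)                 # dim 0: forward / backward direction
last_cell = torch.randn(2, b, h)

h_projection = nn.Linear(2 * h, h, bias=False)
c_projection = nn.Linear(2 * h, h, bias=False)

init_decoder_hidden = h_projection(torch.cat([last_hidden[0], last_hidden[1]], dim=1))
init_decoder_cell = c_projection(torch.cat([last_cell[0], last_cell[1]], dim=1))
dec_init_state = (init_decoder_hidden, init_decoder_cell)   # each (b, h)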
Exemplo n.º 36
0
    def forward(self, sentence, p_sentence, pos_tags, lengths, target_idx_in, region_marks,
                local_roles_voc, frames, local_roles_mask,
                sent_pred_lemmas_idx, dep_tags, dep_heads, targets, predicate_identification, all_l_ids,
                Predicate_link, Predicate_Labels_nd, Predicate_Labels,
                unlabeled_sentence_in=False, p_unlabeled_sentence_in=False, unlabeled_sen_lengths=False, test=False, cvt_train=False):

        """
        elmo_embedding_0 = self.elmo_embeddings_0(sentence).view(self.batch_size, len(sentence[0]), 1024)
        elmo_embedding_1 = self.elmo_embeddings_1(sentence).view(self.batch_size, len(sentence[0]), 1024)
        w = F.softmax(self.elmo_word, dim=0)
        elmo_emb = self.elmo_gamma_word * (w[0] * elmo_embedding_0 + w[1] * elmo_embedding_1)
        elmo_emb_word = self.elmo_mlp_word(elmo_emb)
        """

        region_marks = self.region_embeddings(region_marks).view(self.batch_size, len(sentence[0]), 16)
        fixed_embeds = self.word_fixed_embeddings(p_sentence)
        fixed_embeds = fixed_embeds.view(self.batch_size, len(sentence[0]), self.word_emb_dim)
        sent_pred_lemmas_embeds = self.p_lemma_embeddings(sent_pred_lemmas_idx)
        embeds_SRL = self.word_embeddings_SRL(sentence)
        embeds_SRL = embeds_SRL.view(self.batch_size, len(sentence[0]), self.word_emb_dim)
        pos_embeds = self.pos_embeddings(pos_tags)
        SRL_hidden_states = torch.cat((embeds_SRL,  fixed_embeds, region_marks), 2)
        SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states)


        # SRL layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(SRL_hidden_states, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort.cpu().numpy(), batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_4 = self.BiLSTM_SRL(embeds_sort, self.hidden_4)
        # it seems that hidden_states is already batch-first, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states = hidden_states[unsort_idx]
        hidden_states = self.hidden_state_dropout(hidden_states)


        # B * T * H
        hidden_states_3 = hidden_states
        hidden_states_word = self.dropout_1(F.relu(self.Non_Predicate_Proj(hidden_states_3)))
        predicate_embeds = hidden_states_3[np.arange(0, hidden_states_3.size()[0]), target_idx_in]
        added_embeds = torch.zeros(1, hidden_states_3.size()[0], hidden_states_3.size()[2]).to(
            device)
        predicate_embeds = added_embeds + predicate_embeds
        # B * T * H
        predicate_embeds = predicate_embeds.transpose(0, 1)
        hidden_states_predicate = self.dropout_2(F.relu(self.Predicate_Proj(predicate_embeds)))


        tag_space = self.rel_biaffine(hidden_states_word, hidden_states_predicate).view(self.batch_size*len(sentence[0]), self.tagset_size)
        SRLprobs = F.softmax(tag_space, dim=1)

        # +++++++++++++++++++++++
        wrong_l_nums = 0.0
        all_l_nums = 0.0

        right_noNull_predict = 10.0
        noNull_predict = 10.0
        noNUll_truth = 10.0

        loss_function = nn.CrossEntropyLoss(ignore_index=0)

        SRLloss = loss_function(tag_space, targets.view(-1))

        return SRLloss, SRLloss, SRLloss, SRLprobs, wrong_l_nums, all_l_nums, wrong_l_nums, all_l_nums,  \
               right_noNull_predict, noNull_predict, noNUll_truth,\
               right_noNull_predict, noNull_predict, noNUll_truth
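# A minimal sketch (illustrative tensors) of one equivalent way to build the predicate
# representation used above: pick one time step per sentence with fancy indexing, then
# broadcast it across all time steps so it can be scored against every word.
import torch

B, T, H = 3, 5, 8
hidden_states = torch.randn(B, T, H)
target_idx_in = torch.tensor([0, 2, 4])            # predicate position in each sentence

# (B, H): one row per sentence, taken at its predicate index
predicate_embeds = hidden_states[torch.arange(B), target_idx_in]

# broadcast to (B, T, H) so every word is paired with its sentence's predicate vector
predicate_embeds = predicate_embeds.unsqueeze(1).expand(B, T, H)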
Exemplo n.º 37
0
    def forward(self, cmaps_f, cmaps_b, cmarkers_f, cmarkers_b, wmaps, tmaps,
                wmap_lengths, cmap_lengths, pos_mask):
        """
        Forward propagation.

        :param cmaps_f: padded encoded forward character sequences, a tensor of dimensions (batch_size, char_pad_len)
        :param cmaps_b: padded encoded backward character sequences, a tensor of dimensions (batch_size, char_pad_len)
        :param cmarkers_f: padded forward character markers, a tensor of dimensions (batch_size, word_pad_len)
        :param cmarkers_b: padded backward character markers, a tensor of dimensions (batch_size, word_pad_len)
        :param wmaps: padded encoded word sequences, a tensor of dimensions (batch_size, word_pad_len)
        :param tmaps: padded tag sequences, a tensor of dimensions (batch_size, word_pad_len)
        :param wmap_lengths: word sequence lengths, a tensor of dimensions (batch_size)
        :param cmap_lengths: character sequence lengths, a tensor of dimensions (batch_size, word_pad_len)
        """
        self.batch_size = cmaps_f.size(0)
        self.word_pad_len = wmaps.size(1)

        # Sort by decreasing true char. sequence length
        cmap_lengths, char_sort_ind = cmap_lengths.sort(dim=0, descending=True)
        cmaps_f = cmaps_f[char_sort_ind]
        cmaps_b = cmaps_b[char_sort_ind]
        cmarkers_f = cmarkers_f[char_sort_ind]
        cmarkers_b = cmarkers_b[char_sort_ind]
        wmaps = wmaps[char_sort_ind]
        tmaps = tmaps[char_sort_ind]
        pos_mask = pos_mask[char_sort_ind]
        wmap_lengths = wmap_lengths[char_sort_ind]

        # Embedding look-up for characters
        cf = self.char_embeds(
            cmaps_f)  # (batch_size, char_pad_len, char_emb_dim)
        cb = self.char_embeds(cmaps_b)

        # Dropout
        cf = self.dropout(cf)  # (batch_size, char_pad_len, char_emb_dim)
        cb = self.dropout(cb)

        # Pack padded sequence
        cf = pack_padded_sequence(
            cf, cmap_lengths.tolist(), batch_first=True
        )  # packed sequence of char_emb_dim, with real sequence lengths
        cb = pack_padded_sequence(cb, cmap_lengths.tolist(), batch_first=True)

        # LSTM
        cf, _ = self.forw_char_lstm(
            cf)  # packed sequence of char_rnn_dim, with real sequence lengths
        cb, _ = self.back_char_lstm(cb)

        # Unpack packed sequence
        cf, _ = pad_packed_sequence(
            cf, batch_first=True
        )  # (batch_size, max_char_len_in_batch, char_rnn_dim)
        cb, _ = pad_packed_sequence(cb, batch_first=True)

        # Sanity check
        assert cf.size(1) == max(
            cmap_lengths.tolist()) == list(cmap_lengths)[0]

        # Select RNN outputs only at marker points (spaces in the character sequence)
        cmarkers_f = cmarkers_f.unsqueeze(2).expand(self.batch_size,
                                                    self.word_pad_len,
                                                    self.char_rnn_dim)
        cmarkers_b = cmarkers_b.unsqueeze(2).expand(self.batch_size,
                                                    self.word_pad_len,
                                                    self.char_rnn_dim)
        cf_selected = torch.gather(
            cf, 1, cmarkers_f)  # (batch_size, word_pad_len, char_rnn_dim)
        cb_selected = torch.gather(cb, 1, cmarkers_b)

        # Only for co-training, not useful for tagging after model is trained
        if self.training:
            lm_f = self.forw_lm_hw(self.dropout(
                cf_selected))  # (batch_size, word_pad_len, char_rnn_dim)
            lm_b = self.back_lm_hw(self.dropout(cb_selected))
            lm_f_scores = self.forw_lm_out(self.dropout(
                lm_f))  # (batch_size, word_pad_len, lm_vocab_size)
            lm_b_scores = self.back_lm_out(self.dropout(lm_b))

        # Sort by decreasing true word sequence length
        wmap_lengths, word_sort_ind = wmap_lengths.sort(dim=0, descending=True)
        wmaps = wmaps[word_sort_ind]
        tmaps = tmaps[word_sort_ind]
        pos_mask = pos_mask[word_sort_ind]

        cf_selected = cf_selected[word_sort_ind]  # for language model
        cb_selected = cb_selected[word_sort_ind]
        if self.training:
            lm_f_scores = lm_f_scores[word_sort_ind]
            lm_b_scores = lm_b_scores[word_sort_ind]

        # Embedding look-up for words
        w = self.word_embeds(wmaps)  # (batch_size, word_pad_len, word_emb_dim)
        w = self.dropout(w)

        # Sub-word information at each word
        subword = self.subword_hw(
            self.dropout(torch.cat(
                (cf_selected, cb_selected),
                dim=2)))  # (batch_size, word_pad_len, 2 * char_rnn_dim)
        subword = self.dropout(subword)

        # Concatenate word embeddings and sub-word features
        w = torch.cat(
            (w, subword), dim=2
        )  # (batch_size, word_pad_len, word_emb_dim + 2 * char_rnn_dim)

        # Concatenate pos tag and word embeddings
        if self.use_pos_mask:
            pos_mask = pos_mask.unsqueeze(2).to(self.device)
            w = torch.cat((w, pos_mask), dim=2)

        # Pack padded sequence
        w = pack_padded_sequence(
            w, list(wmap_lengths), batch_first=True
        )  # packed sequence of word_emb_dim + 2 * char_rnn_dim, with real sequence lengths

        # LSTM
        w, _ = self.word_blstm(
            w)  # packed sequence of word_rnn_dim, with real sequence lengths

        # Unpack packed sequence
        w, _ = pad_packed_sequence(
            w, batch_first=True
        )  # (batch_size, max_word_len_in_batch, word_rnn_dim)
        w = self.dropout(w)

        crf_scores = self.crf(
            w)  # (batch_size, max_word_len_in_batch, tagset_size, tagset_size)

        if self.training:
            return crf_scores, lm_f_scores, lm_b_scores, wmaps, tmaps, wmap_lengths, word_sort_ind, char_sort_ind
        else:
            return crf_scores, wmaps, tmaps, wmap_lengths, word_sort_ind, char_sort_ind  # sort inds to reorder, if req.
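# A small sketch (toy sizes) of the gather step above: character-LSTM outputs are read
# off only at word-boundary marker positions, giving one vector per word.
import torch

batch_size, char_pad_len, word_pad_len, char_rnn_dim = 2, 9, 4, 6
cf = torch.randn(batch_size, char_pad_len, char_rnn_dim)        # char-LSTM outputs
cmarkers_f = torch.tensor([[2, 5, 8, 0],                        # marker index after each word
                           [3, 7, 0, 0]])                       # padded with 0s

index = cmarkers_f.unsqueeze(2).expand(batch_size, word_pad_len, char_rnn_dim)
cf_selected = torch.gather(cf, 1, index)                        # (batch_size, word_pad_len, char_rnn_dim)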
Exemplo n.º 38
0
    def forward(self, pair, premise_len, hypothesis_len, mask_id, seg_id):
        batch_size = pair.shape[0]

        # feed the pair token ids into BertModel
        pair = self.bert(pair, token_type_ids=seg_id,
                         attention_mask=mask_id)[0]
        pair = self.dropout_emb(pair)
        premise = [
            torch.tensor(pair[i][1:2 + premise_len[i]])
            for i in range(batch_size)
        ]  # including the end [SEP]
        hypothesis = [
            torch.tensor(pair[i][2 + premise_len[i]:2 + premise_len[i] +
                                 hypothesis_len[i]]) for i in range(batch_size)
        ]

        premise = pad_sequence(premise, batch_first=True)
        hypothesis = pad_sequence(hypothesis, batch_first=True)

        # premise
        prem_max_len = premise.shape[1]
        premise_len += 1  # we add 1 for the ending [SEP]. This is only for the premise but not the hypothesis
        premise_len, p_idxes = torch.sort(premise_len, descending=True)
        _, p_idx_unsort = torch.sort(
            p_idxes)  # in order to restore the original order
        premise = premise[p_idxes]
        packed_premise = pack_padded_sequence(premise,
                                              premise_len,
                                              batch_first=True)
        # (max_len, batch_size, hidden_size)
        h_s, (_, _) = self.lstm_prem(packed_premise)
        h_s, _ = pad_packed_sequence(h_s)
        h_s = h_s[:, p_idx_unsort]  # restore the original order so premise and hypothesis stay aligned

        # hypothesis
        # hypothesis = hypothesis.to(self.device)
        hypothesis_max_len = hypothesis.shape[1]
        hypothesis_len, h_idxes = torch.sort(hypothesis_len, descending=True)
        _, h_idx_unsort = torch.sort(h_idxes)
        hypothesis = hypothesis[h_idxes]
        packed_hypothesis = pack_padded_sequence(hypothesis,
                                                 hypothesis_len,
                                                 batch_first=True)
        # (max_len, batch_size, hidden_size)
        h_t, (_, _) = self.lstm_hypo(packed_hypothesis)
        h_t, _ = pad_packed_sequence(h_t)
        h_t = h_t[:, h_idx_unsort]
        hypothesis_len = hypothesis_len[h_idx_unsort]  # restore the original order so premise and hypothesis stay aligned

        # matchLSTM. This is the core of this paper.
        batch_size = premise.shape[0]
        h_m_k = torch.zeros((batch_size, self.config.hidden_size),
                            device=self.device)
        c_m_k = torch.zeros((batch_size, self.config.hidden_size),
                            device=self.device)
        h_last = torch.zeros((batch_size, self.config.hidden_size),
                             device=self.device)

        for k in range(hypothesis_max_len):
            h_t_k = h_t[k]

            # Equation (6)
            # e_kj: (prem_max_len, batch_size)
            e_kj = torch.zeros((prem_max_len, batch_size), device=self.device)
            w_e_expand = self.w_e.expand(batch_size, self.config.hidden_size)
            for j in range(prem_max_len):
                # tanh_stm: (batch_size, hidden_size)
                tanh_s_t_m = torch.tanh(
                    self.w_s(h_s[j]) + self.w_t(h_t_k) + self.w_m(h_m_k))

                # dot product
                # https://github.com/pytorch/pytorch/issues/18027#issuecomment-473404765
                e_kj[j] = (w_e_expand * tanh_s_t_m).sum(-1)

            # Equation (3)
            # (prem_max_len, batch_size)
            alpha_kj = F.softmax(e_kj, dim=0)

            # Equation (2)
            # (batch_size, hidden_size)
            a_k = torch.bmm(torch.unsqueeze(alpha_kj.t(), 1),
                            h_s.permute(1, 0, 2))
            a_k = torch.squeeze(a_k, dim=1)

            # Equation (7)
            # (batch_size, 2 * hidden_size)
            m_k = torch.cat((a_k, h_t_k), 1)

            # Equation (8)
            # (batch_size, hidden_size)
            h_m_k, c_m_k = self.lstm_match(m_k, (h_m_k, c_m_k))

            # handle variable length sequences: hypothesis
            # (batch_size)
            for batch_idx, hl in enumerate(hypothesis_len):
                if k + 1 == hl:
                    h_last[batch_idx] = h_m_k[batch_idx]

        h_last = self.dropout_fc(h_last)

        return self.fc(h_last)
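# A minimal sketch (toy shapes, batched rather than looped over j) of one attention step
# in the loop above: score every premise position against the current hypothesis state,
# softmax over premise positions, and take the weighted sum as Equation (2)'s a_k.
import torch
import torch.nn.functional as F

prem_max_len, batch_size, hidden_size = 7, 3, 5
h_s = torch.randn(prem_max_len, batch_size, hidden_size)        # premise states
tanh_s_t_m = torch.tanh(torch.randn(prem_max_len, batch_size, hidden_size))  # stand-in for the w_s/w_t/w_m terms
w_e = torch.randn(hidden_size)

e_kj = (tanh_s_t_m * w_e).sum(-1)                 # (prem_max_len, batch_size) dot products
alpha_kj = F.softmax(e_kj, dim=0)                 # normalise over premise positions
a_k = torch.bmm(alpha_kj.t().unsqueeze(1),        # (batch_size, 1, prem_max_len)
                h_s.permute(1, 0, 2)).squeeze(1)  # (batch_size, hidden_size)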
def validate(val_loader, encoder, decoder, criterion):
    """
    Performs one epoch's validation.

    :param val_loader: DataLoader for validation data.
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :return: BLEU-4 score
    """
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # explicitly disable gradient calculation to avoid CUDA memory error
    # solves the issue #57
    with torch.no_grad():
        # Batches
        for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):

            # Move to device, if available
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            # Forward prop.
            if encoder is not None:
                imgs = encoder(imgs)
            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores_copy = scores.clone()
            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True)[0]
            targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)[0]

            # Calculate loss
            loss = criterion(scores, targets)

            # Add doubly stochastic attention regularization
            loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

            # Keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print('Validation: [{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader), batch_time=batch_time,
                                                                                loss=losses, top5=top5accs))

            # Store references (true captions), and hypothesis (prediction) for each image
            # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
            # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]

            # References
            allcaps = allcaps[sort_ind]  # because images were sorted in the decoder
            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()
                img_captions = list(
                    map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<pad>']}],
                        img_caps))  # remove <start> and pads
                references.append(img_captions)

            # Hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][:decode_lengths[j]])  # remove pads
            preds = temp_preds
            hypotheses.extend(preds)

            assert len(references) == len(hypotheses)

        # Calculate BLEU-4 scores
        bleu4 = corpus_bleu(references, hypotheses)

        print(
            '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n'.format(
                loss=losses,
                top5=top5accs,
                bleu=bleu4))

    return bleu4
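# A short sketch (toy data) of the "easy trick" used above: pack_padded_sequence flattens
# only the valid time steps, so the loss ignores padded positions. The .data attribute is
# the same tensor that indexing the returned PackedSequence with [0] gives.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

vocab_size = 10
scores = torch.randn(3, 6, vocab_size)             # (batch, max_decode_len, vocab)
targets = torch.randint(0, vocab_size, (3, 6))
decode_lengths = [6, 4, 2]

flat_scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data    # (12, vocab)
flat_targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data  # (12,)
loss = nn.CrossEntropyLoss()(flat_scores, flat_targets)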
def train(train_loader, encoder, decoder, criterion, encoder_optimizer,
          decoder_optimizer, epoch):
    """
    Performs one epoch's training.

    :param train_loader: DataLoader for training data
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :param encoder_optimizer: optimizer to update encoder's weights (if fine-tuning)
    :param decoder_optimizer: optimizer to update decoder's weights
    :param epoch: epoch number
    """

    decoder.train()  # train mode (dropout and batchnorm is used)
    encoder.train()

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss (per word decoded)
    top5accs = AverageMeter()  # top5 accuracy

    start = time.time()

    # Batches
    for i, (imgs, caps, caplens) in enumerate(train_loader):
        data_time.update(time.time() - start)

        # Move to GPU, if available
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)

        # Forward prop.
        imgs = encoder(imgs)
        scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(
            imgs, caps, caplens)

        # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
        targets = caps_sorted[:, 1:]

        # Remove timesteps that we didn't decode at, or are pads
        # pack_padded_sequence is an easy trick to do this
        scores = pack_padded_sequence(scores, decode_lengths,
                                      batch_first=True)[0]
        targets = pack_padded_sequence(targets,
                                       decode_lengths,
                                       batch_first=True)[0]

        # Calculate loss
        loss = criterion(scores, targets)

        # Add doubly stochastic attention regularization
        loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean()

        # Back prop.
        decoder_optimizer.zero_grad()
        if encoder_optimizer is not None:
            encoder_optimizer.zero_grad()
        loss.backward()

        # Clip gradients
        if grad_clip is not None:
            clip_gradient(decoder_optimizer, grad_clip)
            if encoder_optimizer is not None:
                clip_gradient(encoder_optimizer, grad_clip)

        # Update weights
        decoder_optimizer.step()
        if encoder_optimizer is not None:
            encoder_optimizer.step()

        # Keep track of metrics
        top5 = accuracy(scores, targets, 5)
        losses.update(loss.item(), sum(decode_lengths))
        top5accs.update(top5, sum(decode_lengths))
        batch_time.update(time.time() - start)

        start = time.time()

        # Print status
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch,
                      i,
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses,
                      top5=top5accs))
Exemplo n.º 41
0
    def forward(self, src_tokens, src_lengths):
        if self.left_pad:
            # convert left-padding to right-padding
            src_tokens = utils.convert_padding_direction(
                src_tokens, self.padding_idx, left_to_right=True
            )

        # If we're generating adversarial examples we need to keep track of
        # some internal variables
        self.tracker.reset()

        if self.word_dropout_module is not None:
            src_tokens = self.word_dropout_module(src_tokens)

        bsz, seqlen = src_tokens.size()

        # embed tokens
        x = self.embed_tokens(src_tokens)
        # Track token embeddings
        self.tracker.track(x, "token_embeddings", retain_grad=self.track_gradients)

        x = F.dropout(x, p=self.dropout_in, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        embedded_words = x

        # Allows compatibility with Caffe2 inputs for tracing (int32)
        # as well as the current format of Fairseq-Py inputs (int64)
        if src_lengths.dtype is torch.int64:
            src_lengths = src_lengths.int()

        # Generate packed seq to deal with varying source seq length
        # packed_input is of type PackedSequence, which consists of:
        # element [0]: a tensor, the packed data, and
        # element [1]: a list of integers, the batch size for each step
        packed_input = pack_padded_sequence(x, src_lengths)

        final_hiddens, final_cells = [], []
        for i, rnn_layer in enumerate(self.layers):
            if self.bidirectional and i == 0:
                h0 = x.new(2, bsz, self.hidden_dim // 2).zero_()
                c0 = x.new(2, bsz, self.hidden_dim // 2).zero_()
            else:
                h0 = x.new(1, bsz, self.hidden_dim).zero_()
                c0 = x.new(1, bsz, self.hidden_dim).zero_()

            # apply LSTM along entire sequence
            current_output, (h_last, c_last) = rnn_layer(packed_input, (h0, c0))

            # final state shapes: (bsz, hidden_dim)
            if self.bidirectional and i == 0:
                # concatenate last states for forward and backward LSTM
                h_last = torch.cat((h_last[0, :, :], h_last[1, :, :]), dim=1)
                c_last = torch.cat((c_last[0, :, :], c_last[1, :, :]), dim=1)
            else:
                h_last = h_last.squeeze(dim=0)
                c_last = c_last.squeeze(dim=0)

            final_hiddens.append(h_last)
            final_cells.append(c_last)

            if self.residual_level is not None and i >= self.residual_level:
                packed_input[0] = packed_input.clone()[0] + current_output[0]
            else:
                packed_input = current_output

        # Reshape to [num_layer, batch_size, hidden_dim]
        final_hiddens = torch.cat(final_hiddens, dim=0).view(
            self.num_layers, *final_hiddens[0].size()
        )
        final_cells = torch.cat(final_cells, dim=0).view(
            self.num_layers, *final_cells[0].size()
        )

        #  [max_seqlen, batch_size, hidden_dim]
        unpacked_output, _ = pad_packed_sequence(
            packed_input, padding_value=self.padding_value
        )

        return (
            unpacked_output,
            final_hiddens,
            final_cells,
            src_lengths,
            src_tokens,
            embedded_words,
        )
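# A hedged sketch (not the encoder above) of one padding-aware way to add a residual
# connection between stacked LSTM layers when the input is a PackedSequence: unpack both
# tensors, add them, and re-pack with the same lengths, since a PackedSequence is a
# namedtuple and cannot be mutated item-by-item.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

lstm_a = nn.LSTM(8, 8)          # layer i
lstm_b = nn.LSTM(8, 8)          # layer i + 1 (same size so the residual adds up)
x = torch.randn(5, 3, 8)        # (seq_len, batch, hidden)
lengths = [5, 4, 2]

packed = pack_padded_sequence(x, lengths)
out_a, _ = lstm_a(packed)

# residual: unpack both, add, re-pack with the same lengths
in_pad, _ = pad_packed_sequence(packed)
out_pad, _ = pad_packed_sequence(out_a)
residual = pack_padded_sequence(in_pad + out_pad, lengths)

out_b, _ = lstm_b(residual)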
    def forward(self, inputs, input_raw, hidden=None):
        """
        forward
        """
        if isinstance(inputs, tuple):
            inputs, lengths = inputs
        else:
            inputs, lengths = inputs, None

        if self.embedder is not None:
            rnn_inputs = self.embedder(inputs)
        else:
            rnn_inputs = inputs

        elmo_embed = self.elmo_embedder.sents2elmo(input_raw)
        elmo_length = [x.shape[0] for x in elmo_embed]
        batch_size_1 = len(elmo_length)
        max_l = max(elmo_length)
        size = (batch_size_1, max_l, 1024)
        tensor_1 = torch.zeros(size, dtype=torch.float)
        for i in range(batch_size_1):
            tensor_1[i][:elmo_length[i]] = torch.tensor(elmo_embed[i])

        elmo_embed = tensor_1.cuda()
        rnn_inputs = torch.cat([rnn_inputs, elmo_embed], dim=-1)

        batch_size = rnn_inputs.size(0)

        if lengths is not None:
            num_valid = lengths.gt(0).int().sum().item()
            sorted_lengths, indices = lengths.sort(descending=True)
            rnn_inputs = rnn_inputs.index_select(0, indices)

            rnn_inputs = pack_padded_sequence(
                rnn_inputs[:num_valid],
                sorted_lengths[:num_valid].tolist(),
                batch_first=True)

            if hidden is not None:
                hidden = hidden.index_select(1, indices)[:, :num_valid]

        outputs, last_hidden = self.rnn(rnn_inputs, hidden)

        if self.bidirectional:
            last_hidden = self._bridge_bidirectional_hidden(last_hidden)

        if lengths is not None:
            outputs, _ = pad_packed_sequence(outputs, batch_first=True)

            if num_valid < batch_size:
                zeros = outputs.new_zeros(batch_size - num_valid,
                                          outputs.size(1), self.hidden_size)
                outputs = torch.cat([outputs, zeros], dim=0)

                zeros = last_hidden.new_zeros(self.num_layers,
                                              batch_size - num_valid,
                                              self.hidden_size)
                last_hidden = torch.cat([last_hidden, zeros], dim=1)

            _, inv_indices = indices.sort()
            outputs = outputs.index_select(0, inv_indices)
            last_hidden = last_hidden.index_select(1, inv_indices)

        return outputs, last_hidden
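# A hedged sketch of what a _bridge_bidirectional_hidden helper typically does (the real
# implementation is not shown here): fold the direction axis of a bidirectional RNN's
# final state, (num_layers * 2, batch, hidden), into the feature axis, giving
# (num_layers, batch, hidden * 2).
import torch

num_layers, batch_size, hidden_size = 2, 3, 4
last_hidden = torch.randn(num_layers * 2, batch_size, hidden_size)

bridged = (last_hidden.view(num_layers, 2, batch_size, hidden_size)
           .permute(0, 2, 1, 3)
           .contiguous()
           .view(num_layers, batch_size, hidden_size * 2))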
def train_encoder():
    #     transform = torchvision.transforms.Compose([torchvision.transforms.RandomCrop(224,224,pad_if_needed=True)])
    #     transform = torchvision.transforms.Compose([torchvision.transforms.RandomCrop(512,512,pad_if_needed=True)])
    transform = transforms.Compose([
        transforms.Resize((512, 512), 2),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        transforms.ToPILImage()
    ])
    train_loader = data_loader.get_loader(
        './data/images/train/', './data/annotations/captions_train2014.json',
        traincaption_ids, vocab, transform, 64, True, 0)
    val_loader = data_loader.get_loader(
        './data/images/train/', './data/annotations/captions_train2014.json',
        valcaption_ids, vocab, transform, 64, True, 0)

    criterion = nn.CrossEntropyLoss()
    encoder = EncoderCNN(embed_size=256).cuda()
    decoder = DecoderRNN(embed_size=256,
                         hidden_size=512,
                         vocab_size=len(vocab),
                         num_layers=1).cuda()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())

    optimizer = torch.optim.Adam(params, lr=1e-3)
    loss_train = []
    loss_val = []
    outputs = None
    f = open('train.txt', 'a')
    f_val = open('val.txt', 'a')
    encoded_output = None
    for epoch in range(0, 1000):
        for i, (images, captions, lengths) in enumerate(train_loader):
            if i % 50 == 0:
                print('Training epoch {}, iteration {}'.format(epoch, i))
            images = images.cuda()
            captions = captions.cuda()

            encoded_output = encoder.forward(images)
            outputs = decoder(encoded_output, captions, lengths)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            loss = criterion(outputs, targets)

            if i % 100 == 0:
                loss_train.append(loss.item())

            decoder.zero_grad()
            encoder.zero_grad()

            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                f.write('Epoch: {}, Iter: {}, Train loss: {}\n'.format(
                    epoch, i, loss_train))
                f.flush()

            encoder.train()
            decoder.train()

            if i % 200 == 0:
                with torch.no_grad():
                    val_loss = 0
                    print('Validation for epoch {}'.format(epoch))
                    f_val.write('Epoch {}\n'.format(epoch))
                    for j, (images, captions,
                            lengths) in enumerate(val_loader):
                        images = images.cuda()
                        captions = captions.cuda()
                        encoded_output = encoder.forward(images)
                        outputs = decoder(encoded_output, captions, lengths)
                        targets = pack_padded_sequence(captions,
                                                       lengths,
                                                       batch_first=True)[0]
                        val_loss += criterion(outputs, targets).item()
                        sampled_ids = decoder.sample(
                            encoded_output).cpu().numpy()
                        if j == 3:
                            for item in range(0, 5):
                                t = np.random.randint(0, len(sampled_ids))
                                sampled_caption = []
                                actual_caption = []
                                for word_id_2 in sampled_ids[t]:
                                    word = vocab.idx2word[word_id_2]
                                    sampled_caption.append(word)
                                    if word == '<end>':
                                        break
                                for word_id_2 in captions[t].cpu().numpy():
                                    word = vocab.idx2word[word_id_2]
                                    actual_caption.append(word)
                                    if word == '<end>':
                                        break
                                sentence = ' '.join(sampled_caption)
                                actual_sentence = ' '.join(actual_caption)
                                f_val.write(
                                    'Generated Caption: {} \n Actual Caption: {}\n\n'
                                    .format(str(sampled_caption),
                                            str(actual_caption)))
                                f_val.flush()
                    val_loss /= (j + 1)  # average over the number of validation batches
                    loss_val.append(val_loss)
                    f_val.write('Loss for epoch {}: {}'.format(
                        epoch, loss_val))
                    f_val.flush()

            if i % 500 == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join('./models/decoder-{}-{}.ckpt'.format(
                        epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join('./models/encoder-{}-{}.ckpt'.format(
                        epoch + 1, i + 1)))
    f.close()
    f_val.close()
Exemplo n.º 44
0
    def forward(self, query, keys, keys_length, mask=None):
        """
        Parameters
        ----------
        query: 2D tensor, [B, H]
        keys: (masked_interests), 3D tensor, [b, T, H]
        keys_length: 1D tensor, [B]

        Returns
        -------
        outputs: 2D tensor, [B, H]
        """
        batch_size, dim = query.size()
        max_length = keys.size()[1]

        # mask out batch elements whose key sequence is empty
        zero_outputs = torch.zeros(batch_size, dim, device=query.device)
        mask = keys_length > 0
        # [B] -> [b]
        keys_length = keys_length[mask]
        if keys_length.shape[0] == 0:
            return zero_outputs

        # [B, H] -> [b, 1, H]
        query = torch.masked_select(query, mask.view(-1,
                                                     1)).view(-1,
                                                              dim).unsqueeze(1)

        if self.gru_type == 'GRU':
            packed_keys = pack_padded_sequence(keys,
                                               lengths=keys_length,
                                               batch_first=True,
                                               enforce_sorted=False)
            packed_interests, _ = self.interest_evolution(packed_keys)
            interests, _ = pad_packed_sequence(packed_interests,
                                               batch_first=True,
                                               padding_value=0.0,
                                               total_length=max_length)
            outputs = self.attention(query, interests,
                                     keys_length.unsqueeze(1))  # [b, 1, H]
            outputs = outputs.squeeze(1)  # [b, H]
        elif self.gru_type == 'AIGRU':
            att_scores = self.attention(query, keys,
                                        keys_length.unsqueeze(1))  # [b, 1, T]
            interests = keys * att_scores.transpose(1, 2)  # [b, T, H]
            packed_interests = pack_padded_sequence(interests,
                                                    lengths=keys_length,
                                                    batch_first=True,
                                                    enforce_sorted=False)
            _, outputs = self.interest_evolution(packed_interests)
            outputs = outputs.squeeze(0)  # [b, H]
        elif self.gru_type == 'AGRU' or self.gru_type == 'AUGRU':
            att_scores = self.attention(
                query, keys, keys_length.unsqueeze(1)).squeeze(1)  # [b, T]
            packed_interests = pack_padded_sequence(keys,
                                                    lengths=keys_length,
                                                    batch_first=True,
                                                    enforce_sorted=False)
            packed_scores = pack_padded_sequence(att_scores,
                                                 lengths=keys_length,
                                                 batch_first=True,
                                                 enforce_sorted=False)
            outputs = self.interest_evolution(packed_interests, packed_scores)
            outputs, _ = pad_packed_sequence(outputs,
                                             batch_first=True,
                                             padding_value=0.0,
                                             total_length=max_length)
            # pick last state
            outputs = InterestEvolving._get_last_state(outputs,
                                                       keys_length)  # [b, H]
        # [b, H] -> [B, H]
        zero_outputs[mask] = outputs
        return zero_outputs
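# A minimal sketch (toy sizes) of the packing pattern above: with enforce_sorted=False,
# pack_padded_sequence accepts lengths in any order and remembers how to unsort, and
# total_length pads the unpacked output back to the original max_length even when the
# longest kept sequence is shorter.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

max_length, hidden = 6, 4
keys = torch.randn(3, max_length, hidden)
keys_length = torch.tensor([2, 5, 3])              # not sorted

gru = nn.GRU(hidden, hidden, batch_first=True)
packed = pack_padded_sequence(keys, keys_length, batch_first=True, enforce_sorted=False)
packed_out, _ = gru(packed)
out, _ = pad_packed_sequence(packed_out, batch_first=True,
                             padding_value=0.0, total_length=max_length)
# out is (3, max_length, hidden) and already back in the original batch order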
Exemplo n.º 45
0
    def _get_instr_embedding(self, instr):
        if self.lang_model == 'gru':
            _, hidden = self.instr_rnn(self.word_embedding(instr))
            return hidden[-1]

        elif self.lang_model in ['bigru', 'attgru']:
            lengths = (instr != 0).sum(1).long()
            masks = (instr != 0).float()

            if lengths.shape[0] > 1:
                seq_lengths, perm_idx = lengths.sort(0, descending=True)
                iperm_idx = torch.LongTensor(perm_idx.shape).fill_(0)
                if instr.is_cuda: iperm_idx = iperm_idx.cuda()
                for i, v in enumerate(perm_idx):
                    iperm_idx[v.data] = i

                inputs = self.word_embedding(instr)
                inputs = inputs[perm_idx]

                inputs = pack_padded_sequence(inputs,
                                              seq_lengths.data.cpu().numpy(),
                                              batch_first=True)

                outputs, final_states = self.instr_rnn(inputs)
            else:
                instr = instr[:, 0:lengths[0]]
                outputs, final_states = self.instr_rnn(
                    self.word_embedding(instr))
                iperm_idx = None
            final_states = final_states.transpose(0, 1).contiguous()
            final_states = final_states.view(final_states.shape[0], -1)
            if iperm_idx is not None:
                outputs, _ = pad_packed_sequence(outputs, batch_first=True)
                outputs = outputs[iperm_idx]
                final_states = final_states[iperm_idx]

            if outputs.shape[1] < masks.shape[1]:
                masks = masks[:, :(outputs.shape[1] - masks.shape[1])]
                # the packing truncated the original length
                # so we need to change mask to fit it

            return outputs if self.lang_model == 'attgru' else final_states

        elif self.lang_model == 'conv':
            inputs = self.word_embedding(instr).unsqueeze(1)  # (B,1,T,D)
            inputs = [
                F.relu(conv(inputs)).squeeze(3) for conv in self.instr_convs
            ]
            inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs]

            return torch.cat(inputs, 1)

        elif self.lang_model == 'bow':
            device = torch.device("cuda" if instr.is_cuda else "cpu")
            input_dim = self.obs_space["instr"]
            input = torch.zeros((instr.size(0), input_dim), device=device)
            idx = torch.arange(instr.size(0), dtype=torch.int64)
            input[idx.unsqueeze(1), instr] = 1.
            return self.instr_bow(input)
        else:
            raise ValueError("Undefined instruction architecture: {}".format(
                self.use_instr))
        return efactor, images, env_sst, target

    def __len__(self):
        return len(self.efactors)

    def __getitem__(self, idx):
        return self.efactors[idx], self.images[idx], self.env_sst[
            idx], self.targets[idx]


if __name__ == '__main__':
    tc_data = TC_Data_varbatch(years=[2000])
    for minibatch in tc_data.get_batches():
        images, efactors, envsst, targets, batch_len = minibatch
        images = rnn_utils.pack_padded_sequence(images,
                                                batch_len,
                                                batch_first=True)
        efactors = rnn_utils.pack_padded_sequence(efactors,
                                                  batch_len,
                                                  batch_first=True)
        envsst = rnn_utils.pack_padded_sequence(envsst,
                                                batch_len,
                                                batch_first=True)
        targets = rnn_utils.pack_padded_sequence(targets,
                                                 batch_len,
                                                 batch_first=True)

        #      targets = targets[:, -1, :]
        print(envsst)
        print(efactors)
        print(targets)
Exemplo n.º 47
0
    def forward(self, x: Tuple[torch.Tensor, torch.Tensor]) -> PackedSequence:
        fmap, fmap_length = x
        fmap = fmap.permute(0, 2, 1) if self._permuting else fmap
        return pack_padded_sequence(fmap, fmap_length, batch_first=True, enforce_sorted=False)
Exemplo n.º 48
0
    def forward(self, input_sequence, length):

        batch_size = input_sequence.size(0)
        sorted_lengths, sorted_idx = torch.sort(length, descending=True)
        input_sequence = input_sequence[sorted_idx]

        # ENCODER
        input_embedding = self.embedding(input_sequence)

        packed_input = rnn_utils.pack_padded_sequence(
            input_embedding, sorted_lengths.data.tolist(), batch_first=True)

        _, hidden = self.encoder_rnn(packed_input)

        if self.bidirectional or self.num_layers > 1:
            # flatten hidden state
            if isinstance(hidden, tuple):
                hidden = (hidden[0].view(batch_size, self.hidden_size *
                                         self.hidden_factor), hidden[1])
            else:
                hidden = hidden.view(batch_size,
                                     self.hidden_size * self.hidden_factor)
        else:
            if isinstance(hidden, tuple):
                hidden = (hidden[0].squeeze(), hidden[1])
            else:
                hidden = hidden.squeeze()

        # REPARAMETERIZATION
        if isinstance(hidden, tuple):
            mean = self.hidden2mean(hidden[0])
            logv = self.hidden2logv(hidden[0])
        else:
            mean = self.hidden2mean(hidden)
            logv = self.hidden2logv(hidden)

        std = torch.exp(0.5 * logv)

        z = to_var(torch.randn([batch_size, self.latent_size]))
        z = z * std + mean

        # DECODER
        if isinstance(self.decoder_rnn, nn.LSTM):
            h_t = self.latent2hidden(z)
            c_t = torch.zeros_like(h_t)
            if torch.cuda.is_available():
                c_t = c_t.cuda()
            hidden = (h_t, c_t)
        else:
            hidden = self.latent2hidden(z)

        if self.bidirectional or self.num_layers > 1:
            # unflatten hidden state
            if isinstance(hidden, tuple):
                hidden = (hidden[0].view(self.hidden_factor, batch_size,
                                         self.hidden_size),
                          hidden[1].view(self.hidden_factor, batch_size,
                                         self.hidden_size))
            else:
                hidden = hidden.view(self.hidden_factor, batch_size,
                                     self.hidden_size)
        else:
            if isinstance(hidden, tuple):
                hidden = (hidden[0].unsqueeze(0), hidden[1].unsqueeze(0))
            else:
                hidden = hidden.unsqueeze(0)

        # decoder input
        if self.word_dropout_rate > 0:
            # randomly replace decoder input with <unk>
            prob = torch.rand(input_sequence.size())
            if torch.cuda.is_available():
                prob = prob.cuda()
            prob[(input_sequence.data - self.sos_idx) *
                 (input_sequence.data - self.pad_idx) == 0] = 1
            decoder_input_sequence = input_sequence.clone()
            decoder_input_sequence[
                prob < self.word_dropout_rate] = self.unk_idx
            input_embedding = self.embedding(decoder_input_sequence)
        input_embedding = self.embedding_dropout(input_embedding)
        packed_input = rnn_utils.pack_padded_sequence(
            input_embedding, sorted_lengths.data.tolist(), batch_first=True)

        # decoder forward pass
        outputs, _ = self.decoder_rnn(packed_input, hidden)

        # process outputs
        padded_outputs = rnn_utils.pad_packed_sequence(outputs,
                                                       batch_first=True)[0]
        padded_outputs = padded_outputs.contiguous()
        _, reversed_idx = torch.sort(sorted_idx)
        padded_outputs = padded_outputs[reversed_idx]
        b, s, _ = padded_outputs.size()

        # project outputs to vocab
        logp = nn.functional.log_softmax(self.outputs2vocab(
            padded_outputs.view(-1, padded_outputs.size(2))),
                                         dim=-1)
        logp = logp.view(b, s, self.embedding.num_embeddings)

        return logp, mean, logv, z
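# A compact sketch (toy sizes) of the reparameterization step above: sample z from
# N(mean, std^2) as z = mean + std * eps so the sampling stays differentiable with
# respect to mean and logv.
import torch

batch_size, latent_size = 4, 8
mean = torch.randn(batch_size, latent_size, requires_grad=True)
logv = torch.randn(batch_size, latent_size, requires_grad=True)

std = torch.exp(0.5 * logv)
eps = torch.randn(batch_size, latent_size)
z = mean + std * eps                               # gradients flow through mean and logv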
Exemplo n.º 49
0
    def forward(self, inputs, initial_state=None, **kwargs):
        is_packed = isinstance(inputs, PackedSequence)
        if self.stack_mode in ['bidirectional']:
            if not initial_state:
                hidden_states = [None] * len(self.lstm_layers) * len(
                    self.lstm_layers[0])
            elif initial_state[0].size()[0] != len(self.lstm_layers) * len(
                    self.lstm_layers[0]):
                raise ValueError(
                    "initial_state does not match the number of layers.")
            else:
                hidden_states = list(
                    zip(initial_state[0].split(1, 0),
                        initial_state[1].split(1, 0)))
        else:
            if not initial_state:
                hidden_states = [None] * len(self.lstm_layers)
            elif initial_state[0].size()[0] != len(self.lstm_layers):
                raise ValueError(
                    "initial_state does not match the number of layers.")
            else:
                hidden_states = list(
                    zip(initial_state[0].split(1, 0),
                        initial_state[1].split(1, 0)))
        # print(f"nndct_inputs:{inputs}")
        output_sequence = inputs

        if self.stack_mode in ['bidirectional']:
            final_h = []
            final_c = []

            for i in range(len(self.lstm_layers)):
                forward_layer = getattr(self, "forward_layer_{}".format(i))
                backward_layer = getattr(self, "backward_layer_{}".format(i))
                forward_output, final_forward_state = forward_layer(
                    output_sequence, hidden_states[i * 2])
                if self.batch_first is not True:
                    output_sequence.transpose_(0, 1)
                backward_output, final_backward_state = backward_layer(
                    output_sequence, hidden_states[i * 2 + 1])
                if is_packed:
                    forward_output, lengths = pad_packed_sequence(
                        forward_output, batch_first=self.batch_first)
                    backward_output, _ = pad_packed_sequence(
                        backward_output, batch_first=self.batch_first)

                # output_sequence = output_sequence.flip(1)
                # backward_output = backward_output.flip(1)
                output_sequence = torch.cat([forward_output, backward_output],
                                            -1)
                if is_packed:
                    output_sequence = pack_padded_sequence(
                        output_sequence, lengths, batch_first=self.batch_first)
                final_h.extend(
                    [final_forward_state[0], final_backward_state[0]])
                final_c.extend(
                    [final_forward_state[1], final_backward_state[1]])
            final_hidden_state = torch.cat(final_h, dim=0)
            final_cell_state = torch.cat(final_c, dim=0)
        else:
            final_states = []
            for i, state in enumerate(hidden_states):
                if self.stack_mode == 'alternating':
                    layer = getattr(
                        self, f"forward_layer_{i}") if i % 2 == 0 else getattr(
                            self, f"backward_layer_{i}")
                else:
                    layer = getattr(self, f"forward_layer_{i}")

                output_sequence, final_state = layer(output_sequence, state)

                # print(f"nndct_layer{i} output:{output_sequence}")
                final_states.append(final_state)

            final_hidden_state, final_cell_state = tuple(
                torch.cat(state_list, 0) for state_list in zip(*final_states))
        # print(f"nndct_final_output:{output_sequence}")
        return output_sequence, (final_hidden_state, final_cell_state)
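# An illustrative sketch (not the module above) of what one "bidirectional" stack level
# amounts to when built by hand: run one RNN left-to-right, another on the time-reversed
# input, flip the backward outputs back, and concatenate along the feature axis.
# Padding-aware reversal would additionally need per-sequence lengths.
import torch
import torch.nn as nn

batch, seq_len, hidden = 2, 5, 4
x = torch.randn(batch, seq_len, hidden)

forward_layer = nn.LSTM(hidden, hidden, batch_first=True)
backward_layer = nn.LSTM(hidden, hidden, batch_first=True)

fwd_out, _ = forward_layer(x)
bwd_out, _ = backward_layer(torch.flip(x, dims=[1]))
bwd_out = torch.flip(bwd_out, dims=[1])            # realign with the forward direction

output_sequence = torch.cat([fwd_out, bwd_out], dim=-1)   # (batch, seq_len, hidden * 2)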
Exemplo n.º 50
0
        # E_diff_cost = compute_diff_loss(img_embed_com_l2, img_embed_spe_l2)
        # E_reconst_cost = compute_reconst_loss(input_res, img_reconst) + compute_reconst_loss(input_res, img_text_reconst) + compute_reconst_loss(input_res, img_text_agg_reconst)
        # E_cost = 1.0 * E_sim_cost + 2.0 * E_cls_cost + 1.0 * E_diff_cost + 1.0 * E_reconst_cost
        # ##
        # optimizerE.zero_grad()
        # E_cost.backward(retain_graph=True)
        # optimizerE.step()

        ##################################
        # (2) Train Explanation network
        ##################################
        img_binary = torch.sigmoid(img_binary)
        lstm_outputs = netR(input_res, img_embed, img_binary, input_wordID,
                            input_cap_len)
        lstm_targets = pack_padded_sequence(target_wordID,
                                            input_cap_len,
                                            batch_first=True)[0]
        # LSTM loss
        lstm_cost = lstm_criterion(lstm_outputs, lstm_targets)
        R_cost = lstm_cost
        optimizerR.zero_grad()
        R_cost.backward()
        optimizerR.step()
        mean_R_loss = lstm_cost.item()
        # evaluate mode
        # netE.eval()
    # Generalized zero-shot learning
    print('[%d/%d] R_loss: %.4f' % (epoch, opt.nepoch, mean_R_loss))
    # Generate sentence
    # generated_exp = eval_explanation(netR, input_res, data.vocab) ## why it is an error??
    # print(generated_exp[0]['caption'])
Exemplo n.º 51
0
    def forward(self,
                input,
                targets,
                input_lens,
                target_lens,
                return_decoder_all_h=False,
                use_teacher_forcing=False,
                SOS_index=0):
        """
        input shape: (S, N)
        targets shape: (S, N)
        return_decoder_all_h: whether return every sequence value in decoder rnns
        """
        batch_size = input.size()[1]
        emb = embedded_dropout(self.input_embedding,
                               input,
                               dropout=self.dropoute if self.training else 0)
        emb = self.lockdrop(emb, self.dropouti)
        # emb shape: (S, N, emsize)
        encoder_hidden = self.init_hidden(input.size()[1])
        packed_emb = pack_padded_sequence(emb,
                                          input_lens,
                                          batch_first=False,
                                          enforce_sorted=False)
        encoder_outputs, encoder_hidden = self.encoder_rnns(
            packed_emb, encoder_hidden)
        encoder_outputs, _ = pad_packed_sequence(encoder_outputs)
        encoder_outputs = self.lockdrop(encoder_outputs, self.dropout)
        # encoder_outputs shape: (S, N, nhid)
        # encoder_hidden shape: (nlayers*directions, N, nhid)
        decoder_rnns_output_list = []
        decoder_rnns_h_list = []
        decoder_input = self.input_embedding.weight.new_full(
            [1, input.size()[1]], SOS_index, dtype=torch.long)
        # decoder_input shape: (1, N)
        decoder_hidden = encoder_hidden
        for seq_index in range(input.size()[0]):
            decoder_input = self.target_embedding(decoder_input)
            h_n_batchfirst = decoder_hidden.transpose(0, 1)
            h_n_batchfirst = h_n_batchfirst.reshape(batch_size, -1)
            # h_n_batchfirst shape: (N, nlayers*directions*nhid)
            attn_weights = F.softmax(
                self.attn(
                    torch.cat((decoder_input.view(
                        -1,
                        decoder_input.size()[2]), h_n_batchfirst),
                              dim=1))[:, :encoder_outputs.size()[0]],
                dim=1)
            # attn_weights shape: (N, S)
            attn_applied = torch.bmm(attn_weights.unsqueeze(1),
                                     encoder_outputs.transpose(0, 1))
            # attn_applied shape: N, 1, nhid
            attn_combine_output = F.relu(
                self.attn_combine(
                    torch.cat((decoder_input.view(-1,
                                                  decoder_input.size()[2]),
                               attn_applied.view(attn_applied.size()[0],
                                                 attn_applied.size()[2])),
                              dim=1)))
            # attn_combine_output shape: N, nhid
            decoder_rnns_output, decoder_hidden = self.decoder_rnns(
                attn_combine_output.unsqueeze(0), decoder_hidden)
            # decoder_rnns_output shape: (1, N, nhid),
            # decoder_hidden shape: (nlayers*directions, N, nhid)
            decoder_rnns_output = self.decoder(decoder_rnns_output)
            # decoder_rnns_output shape: (1, N, ntok)
            if use_teacher_forcing:
                decoder_input = targets[seq_index].view(-1, batch_size)
            else:
                topv, topi = decoder_rnns_output.topk(1, dim=2)
                decoder_input = topi.view(1, batch_size).detach()

            decoder_rnns_output_list.append(decoder_rnns_output)
            decoder_rnns_h_list.append(decoder_hidden)
        decoder_rnns_output_tensor = torch.cat(tuple(decoder_rnns_output_list),
                                               dim=0)
        # decoder_rnns_output_tensor shape: (S, N, ntok)
        if not return_decoder_all_h:
            return decoder_rnns_output_tensor, decoder_hidden
        else:
            return decoder_rnns_output_tensor, decoder_hidden, \
                   decoder_rnns_h_list
    def forward(self,
                sentence,
                p_sentence,
                pos_tags,
                lengths,
                target_idx_in,
                region_marks,
                local_roles_voc,
                frames,
                local_roles_mask,
                sent_pred_lemmas_idx,
                dep_tags,
                dep_heads,
                targets,
                specific_dep_tags,
                specific_dep_relations,
                test=False):

        # construct input for DEP
        embeds_DEP = self.word_embeddings_DEP(sentence)
        embeds_DEP = embeds_DEP.view(self.batch_size, len(sentence[0]),
                                     self.word_emb_dim)
        pos_embeds = self.pos_embeddings(pos_tags)
        pos_embeds_DEP = self.pos_embeddings_DEP(pos_tags)
        region_marks = region_marks.view(self.batch_size, len(sentence[0]), 1)
        fixed_embeds_DEP = self.word_fixed_embeddings_DEP(p_sentence)
        fixed_embeds_DEP = fixed_embeds_DEP.view(self.batch_size,
                                                 len(sentence[0]),
                                                 self.word_emb_dim)

        embeds_forDEP = torch.cat(
            (embeds_DEP, fixed_embeds_DEP, pos_embeds_DEP, region_marks), 2)
        embeds_forDEP = self.DEP_input_dropout(embeds_forDEP)

        #first layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            embeds_forDEP, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort,
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden = self.BiLSTM_0(embeds_sort, self.hidden)
        # hidden states are already batch-first, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states_0 = hidden_states[unsort_idx]

        # second_layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            hidden_states_0, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort,
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_2 = self.BiLSTM_1(embeds_sort,
                                                     self.hidden_2)
        # hidden states are already batch-first, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        #hidden_states = hidden_states.transpose(0, 1)
        hidden_states_1 = hidden_states[unsort_idx]

        Label_composer = hidden_states_1
        predicate_embeds = Label_composer[np.arange(0,
                                                    Label_composer.size()[0]),
                                          target_idx_in]
        # T * B * H
        added_embeds = torch.zeros(Label_composer.size()[1],
                                   Label_composer.size()[0],
                                   Label_composer.size()[2]).to(device)
        concat_embeds = (added_embeds + predicate_embeds).transpose(0, 1)
        Label_features = torch.cat((Label_composer, concat_embeds), 2)

        dep_tag_space = self.MLP(
            self.label_dropout(torch.tanh(self.hidden2tag(Label_features)))).view(
                len(sentence[0]) * self.batch_size, -1)

        dep_labels = torch.argmax(dep_tag_space, dim=1)

        if test:
            TagProbs_use = F.softmax(dep_tag_space,
                                     dim=1).view(self.batch_size,
                                                 len(sentence[0]), -1)
            TagProbs_noGrad = TagProbs_use.detach()
            h1 = F.relu(self.tag2hidden(TagProbs_noGrad))
        else:
            h1 = self.dep_embeddings(dep_tags).view(self.batch_size,
                                                    len(sentence[0]), -1)

        fixed_embeds = self.word_fixed_embeddings(p_sentence)
        fixed_embeds = fixed_embeds.view(self.batch_size, len(sentence[0]),
                                         self.word_emb_dim)
        sent_pred_lemmas_embeds = self.p_lemma_embeddings(sent_pred_lemmas_idx)
        embeds_SRL = self.word_embeddings_SRL(sentence)
        embeds_SRL = embeds_SRL.view(self.batch_size, len(sentence[0]),
                                     self.word_emb_dim)

        SRL_hidden_states = torch.cat(
            (embeds_SRL, fixed_embeds, sent_pred_lemmas_embeds, pos_embeds,
             region_marks, h1), 2)
        SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states)

        # SRL layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            SRL_hidden_states, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort.cpu().numpy(),
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_4 = self.BiLSTM_SRL(embeds_sort,
                                                       self.hidden_4)
        # hidden states are already batch-first, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states = hidden_states[unsort_idx]
        hidden_states = self.hidden_state_dropout(hidden_states)

        # B * H
        hidden_states_3 = hidden_states
        predicate_embeds = hidden_states_3[
            np.arange(0,
                      hidden_states_3.size()[0]), target_idx_in]
        # T * B * H
        added_embeds = torch.zeros(hidden_states_3.size()[1],
                                   hidden_states_3.size()[0],
                                   hidden_states_3.size()[2]).to(device)
        predicate_embeds = added_embeds + predicate_embeds
        # B * T * H
        predicate_embeds = predicate_embeds.transpose(0, 1)
        hidden_states = torch.cat((hidden_states_3, predicate_embeds), 2)
        # print(hidden_states)
        # non-linear map and rectify the roles' embeddings
        # roles = Variable(torch.from_numpy(np.arange(0, self.tagset_size)))

        # B * roles
        # log(local_roles_voc)
        # log(frames)

        # B * roles * h
        role_embeds = self.role_embeddings(local_roles_voc)
        frame_embeds = self.frame_embeddings(frames)

        role_embeds = torch.cat((role_embeds, frame_embeds), 2)
        mapped_roles = F.relu(self.role_map(role_embeds))
        mapped_roles = torch.transpose(mapped_roles, 1, 2)

        # b, times, roles
        tag_space = torch.matmul(hidden_states, mapped_roles)
        #tag_space = hidden_states.mm(mapped_roles)

        # b, roles
        #sub = torch.div(torch.add(local_roles_mask, -1.0), _BIG_NUMBER)
        sub = torch.add(local_roles_mask, -1.0) * _BIG_NUMBER
        sub = torch.FloatTensor(sub.cpu().numpy()).to(device)
        # b, roles, times
        tag_space = torch.transpose(tag_space, 0, 1)
        tag_space += sub
        # b, T, roles
        tag_space = torch.transpose(tag_space, 0, 1)
        tag_space = tag_space.view(len(sentence[0]) * self.batch_size, -1)

        SRLprobs = F.softmax(tag_space, dim=1)

        goldLabelInd = dep_tags.view(-1).cpu().data.numpy()
        rscores = dep_tag_space.view(self.batch_size * len(sentence[0]),
                                     -1).cpu().data.numpy()
        rexprs = dep_tag_space.view(self.batch_size * len(sentence[0]), -1)
        lerrs = []
        # for every word in the batch
        for i in range(len(rscores)):
            if goldLabelInd[i] == 0:
                continue
            wrongLabelInd = \
                max(((l, scr) for l, scr in enumerate(rscores[i]) if l != goldLabelInd[i]), key=itemgetter(1))[0]
            if rscores[i][goldLabelInd[i]] < rscores[i][wrongLabelInd] + 1:
                lerrs += [
                    rexprs[i][wrongLabelInd] - rexprs[i][goldLabelInd[i]]
                ]
        # +++++++++++++++++++++++
        wrong_l_nums = 0.0
        all_l_nums = 0.0

        right_noNull_predict = 0.0
        noNull_predict = 0.0
        noNUll_truth = 0.0

        for predict_l, gold_l in zip(dep_labels,
                                     dep_tags.cpu().view(-1).data.numpy()):
            if predict_l > 1:
                noNull_predict += 1
            if gold_l != 0:
                all_l_nums += 1
                if gold_l != 1:
                    noNUll_truth += 1
                    if gold_l == predict_l:
                        right_noNull_predict += 1
            if predict_l != gold_l and gold_l != 0:
                wrong_l_nums += 1

        targets = targets.view(-1)

        loss_function = nn.CrossEntropyLoss(ignore_index=0)

        SRLloss = loss_function(tag_space, targets)
        #DEPloss = loss_function(dep_tag_space, dep_tags.view(-1))

        if len(lerrs) > 0:
            DEPloss = sum(lerrs)
            loss = SRLloss + DEPloss
        else:
            # keep DEPloss defined so the return statement below never raises a NameError
            DEPloss = torch.zeros_like(SRLloss)
            loss = SRLloss
        return SRLloss, DEPloss, DEPloss, loss, SRLprobs, wrong_l_nums, all_l_nums, wrong_l_nums, all_l_nums,  \
               right_noNull_predict, noNull_predict, noNUll_truth,\
               right_noNull_predict, noNull_predict, noNUll_truth
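Each of the three BiLSTM blocks above repeats the same sort / pack / run / pad / unsort pattern around pack_padded_sequence. A minimal sketch of that pattern in isolation (the run_packed helper and all sizes below are illustrative, not part of the original model):

import torch
from torch.nn.utils import rnn

def run_packed(lstm, inputs, lengths):
    """Run an LSTM over a padded batch, handling the sorting pack_padded_sequence expects."""
    lengths = torch.as_tensor(lengths)
    sorted_lens, sort_idx = lengths.sort(descending=True)
    unsort_idx = sort_idx.argsort()
    packed = rnn.pack_padded_sequence(inputs[sort_idx], sorted_lens, batch_first=True)
    outputs, _ = lstm(packed)
    outputs, _ = rnn.pad_packed_sequence(outputs, batch_first=True)
    return outputs[unsort_idx]  # restore the original batch order

lstm = torch.nn.LSTM(8, 16, batch_first=True, bidirectional=True)
x = torch.randn(4, 6, 8)
out = run_packed(lstm, x, [3, 6, 2, 5])
print(out.shape)  # torch.Size([4, 6, 32])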
Example No. 53
0
    def forward(self, features, labels=None, valid_lengths=None):
        assert (valid_lengths is not None), 'Valid_lengths is required.'
        # features from mockingjay: (batch_size, layer, seq_len, feature)
        # features from baseline: (batch_size, seq_len, feature)
        # labels: (batch_size,), one utterance to one label
        # valid_lengths: (batch_size, )
        batch_size = features.size(0)
        layer_num = features.size(1) if len(features.shape) == 4 else None
        seq_len = features.size(2) if len(
            features.shape) == 4 else features.size(1)
        feature_dim = features.size(3) if len(
            features.shape) == 4 else features.size(2)

        select_hidden = self.config['select_hidden']
        if len(features.shape) == 4:
            # compute mean on mockingjay representations if given features from mockingjay
            if select_hidden == 'last':
                features = features[:, -1, :, :]
            elif select_hidden == 'first':
                features = features[:, 0, :, :]
            elif select_hidden == 'average':
                features = features.mean(
                    dim=1
                )  # now simply average the representations over all layers, (batch_size, seq_len, feature)
            elif select_hidden == 'weighted_sum':
                features = features.transpose(0, 1).reshape(layer_num, -1)
                features = torch.matmul(self.weight[:layer_num],
                                        features).reshape(
                                            batch_size, seq_len, feature_dim)
            elif select_hidden == 'weighted_sum_norm':
                weights = nn.functional.softmax(self.weight[:layer_num],
                                                dim=-1)
                features = features.transpose(0, 1).reshape(layer_num, -1)
                features = torch.matmul(weights, features).reshape(
                    batch_size, seq_len, feature_dim)
            else:
                raise NotImplementedError(
                    'Feature selection mode not supported!')

        sample_rate = self.config['sample_rate']
        features = features[:, torch.arange(0, seq_len, sample_rate), :]
        valid_lengths = valid_lengths // sample_rate  # keep lengths integral after subsampling

        for linear in self.pre_linears:
            features = linear(features)
            features = self.act_fn(features)
            features = self.dropout(features)

        packed = pack_padded_sequence(features,
                                      valid_lengths,
                                      batch_first=True,
                                      enforce_sorted=True)
        _, h_n = self.rnn(packed)
        hidden = h_n[-1, :, :]
        # because h_n directly contains the final states,
        # it is easier to use h_n as the extracted embedding

        for linear in self.post_linears:
            hidden = linear(hidden)
            hidden = self.act_fn(hidden)
            hidden = self.dropout(hidden)

        logits = self.out(hidden)

        mode = self.config['mode']
        if mode == 'classification':
            result = self.out_fn(logits)
            # result: (batch_size, class_num)
        elif mode == 'regression':
            result = logits.reshape(-1)
            # result: (batch_size, )

        if labels is not None:
            loss = self.criterion(result, labels)

            # statistic for accuracy
            if mode == 'classification':
                correct, valid = self.statistic(result, labels)
            elif mode == 'regression':
                # correct and valid have no meaning in regression mode;
                # they just let the outside wrapper keep functioning correctly
                correct, valid = torch.LongTensor([1]), torch.LongTensor([1])

            return loss, result.detach().cpu(), correct, valid

        return result
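As the comment in the middle notes, h_n already holds the final state of every utterance regardless of its length, which is why no masking of padded frames is needed after the RNN. A small self-contained sketch of that pooling step (layer sizes are invented):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

rnn_layer = torch.nn.GRU(input_size=40, hidden_size=32, batch_first=True)
features = torch.randn(3, 100, 40)              # (batch, seq_len, feature)
valid_lengths = torch.tensor([100, 60, 20])     # descending, as enforce_sorted=True expects

packed = pack_padded_sequence(features, valid_lengths, batch_first=True, enforce_sorted=True)
_, h_n = rnn_layer(packed)                      # h_n: (num_layers, batch, hidden)
utterance_embedding = h_n[-1]                   # last layer's final state, one vector per utterance
print(utterance_embedding.shape)                # torch.Size([3, 32])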
Example No. 54
0
    def beam_search(self, src_sent: List[str], beam_size: int = 20,
                    max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """
        Given a single source sentence, perform beam search

        Args:
            src_sent: a single tokenized source sentence
            beam_size: beam size
            max_decoding_time_step: maximum number of time steps to unroll the decoding RNN

        Returns:
            hypotheses: a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """

        self.eou = 2
        top_k = 20
        batch_size = 1
        low_ind = [0]
        high_ind = []

        src_ind = torch.cuda.LongTensor(self.low_src_vocab.words2indices(src_sent[1]))
        src_embed = self.low_src_embed(src_ind).unsqueeze(0)
        
        src_lengths = np.asarray([len(src_sent[1])])
        packed_input = pack_padded_sequence(src_embed, src_lengths, batch_first=True)
        src_output, src_last_hidden = self.encoder(packed_input)
        low_src_output, low_src_last_hidden = self.low_encoder(packed_input)
        src_hidden, _ = pad_packed_sequence(src_output, batch_first=True)
        low_src_hidden, _ = pad_packed_sequence(low_src_output, batch_first=True)
        decoder_hidden = self.init_hidden((low_src_last_hidden, src_last_hidden), batch_size, low_ind, high_ind)

        eos_filler = torch.zeros(beam_size).long().cuda().fill_(self.eou)
        decoder_input = self.tgt_embed(torch.cuda.LongTensor([1])).unsqueeze(1)
        length = src_hidden.size(1)
        src_lengths = torch.cuda.LongTensor(src_lengths)

        q_key = self.q_key(src_hidden)
        q_value = self.q_value(src_hidden)
        q_spec_key = torch.zeros(batch_size, length, self.key_size).cuda()
        q_spec_key[low_ind, :, :] = self.q_low_key(low_src_hidden[low_ind, :, :])
        q_spec_value = torch.zeros(batch_size, length, self.embed_size).cuda()
        q_spec_value[low_ind, :, :] = self.q_low_value(low_src_hidden[low_ind, :, :])
        q_mask = torch.arange(length).long().cuda().repeat(src_hidden.size(0), 1) < torch.cuda.LongTensor(
            src_lengths).repeat(length, 1).transpose(0, 1)
        
        context = self.attention(decoder_hidden, q_key, q_value, q_spec_key, q_spec_value, q_mask)
        decoder_output, decoder_hidden = self.decoder(torch.cat((decoder_input, context), dim=2), decoder_hidden)
        decoder_output = torch.cat((decoder_output, context), dim=2)
        decoder_output = self.word_dist(torch.tanh(self.out(decoder_output.squeeze(1))))
        decoder_output[:, 0] = -np.inf

        logprobs, argtop = torch.topk(F.log_softmax(decoder_output, dim=1), beam_size, dim=1)
        beam = torch.zeros(beam_size, max_decoding_time_step).long().cuda()
        beam[:, 0] = argtop
        beam_probs = logprobs.clone().squeeze(0)
        beam_eos = argtop.squeeze(0) == self.eou
        decoder_hidden = (decoder_hidden[0].expand(1, beam_size, self.hidden_size).contiguous(),
                          decoder_hidden[1].expand(1, beam_size, self.hidden_size).contiguous())
        decoder_input = self.tgt_embed(argtop.squeeze(0)).unsqueeze(1)

        src_hidden = src_hidden.expand(beam_size, length, self.hidden_size * 2)
        low_src_hidden = low_src_hidden.expand(beam_size, length, self.hidden_size * 2)
        q_key = self.q_key(src_hidden)
        q_value = self.q_value(src_hidden)
        q_spec_key = self.q_low_key(low_src_hidden)
        q_spec_value = self.q_low_value(low_src_hidden)
        q_mask = torch.arange(length).long().cuda().repeat(src_hidden.size(0), 1) < torch.cuda.LongTensor(
            src_lengths).repeat(length, 1).transpose(0, 1)

        for t in range(max_decoding_time_step - 1):
            context = self.attention(decoder_hidden, q_key, q_value, q_spec_key, q_spec_value, q_mask)
            decoder_output, decoder_hidden = self.decoder(torch.cat((decoder_input, context), dim=2).transpose(0, 1),
                                                          decoder_hidden)
            decoder_output = torch.cat((decoder_output.transpose(0, 1), context), dim=2)
            decoder_output = self.word_dist(torch.tanh(self.out(decoder_output)))

            logprobs, argtop = torch.topk(F.log_softmax(decoder_output.squeeze(1), dim=1), top_k, dim=1)
            best_probs, best_args = (beam_probs.expand(top_k, beam_size).transpose(0, 1) + logprobs).view(-1).topk(
                beam_size)

            last = best_args // top_k
            curr = best_args % top_k
            beam[:, :] = beam[last, :]
            beam_eos = beam_eos[last]
            beam_probs = beam_probs[last]
            beam[:, t + 1] = argtop[last, curr] * (~beam_eos).long() + eos_filler * beam_eos.long()
            mask = ~beam_eos
            beam_probs[mask] = (beam_probs[mask] * (t + 1) + best_probs[mask]) / (t + 2)
            decoder_hidden = (decoder_hidden[0][:, last, :], decoder_hidden[1][:, last, :])

            beam_eos = beam_eos | (beam[:, t + 1] == self.eou)
            decoder_input = self.tgt_embed(beam[:, t + 1]).unsqueeze(1)

            if beam_eos.all():
                break

        best, best_arg = beam_probs.max(0)
        translation = beam[best_arg].cpu().tolist()
        if self.eou in translation:
            translation = translation[:translation.index(self.eou)]
        translation = [self.tgt_vocab.id2word[w] for w in translation]
        return [Hypothesis(value=translation, score=best.item())]
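The bookkeeping inside the decoding loop flattens a (beam_size, top_k) score matrix, takes the overall top beam_size entries, and then recovers which beam each winner extends with // top_k and which candidate word it appends with % top_k. A tiny worked sketch of that index arithmetic:

import torch

beam_size, top_k = 3, 4
# scores[b, k] = accumulated log-prob of extending beam b with its k-th candidate
scores = torch.tensor([[-1.0, -2.0, -3.0, -4.0],
                       [-0.5, -2.5, -3.5, -4.5],
                       [-1.5, -1.6, -5.0, -6.0]])

best_probs, best_args = scores.view(-1).topk(beam_size)
last = best_args // top_k   # which beam each winner extends
curr = best_args % top_k    # which candidate word inside that beam
print(best_args.tolist())   # [4, 0, 8] -> positions in the flattened matrix
print(last.tolist())        # [1, 0, 2]
print(curr.tolist())        # [0, 0, 0]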
Example No. 55
0
    def featurize(self, batch, load_mask=True, load_frames=True):
        '''
        tensorize and pad batch input
        '''
        device = torch.device('cuda') if self.args.gpu else torch.device('cpu')
        feat = collections.defaultdict(list)

        for ex in batch:
            ###########
            # auxiliary
            ###########

            if not self.test_mode:
                # subgoal completion supervision
                if self.args.subgoal_aux_loss_wt > 0:
                    feat['subgoals_completed'].append(
                        np.array(ex['num']['low_to_high_idx']) /
                        self.max_subgoals)

                # progress monitor supervision
                if self.args.pm_aux_loss_wt > 0:
                    num_actions = len(
                        [a for sg in ex['num']['action_low'] for a in sg])
                    subgoal_progress = [(i + 1) / float(num_actions)
                                        for i in range(num_actions)]
                    feat['subgoal_progress'].append(subgoal_progress)

            #########
            # inputs
            #########

            # serialize segments
            self.serialize_lang_action(ex)

            # goal and instr language
            lang_goal, lang_instr = ex['num']['lang_goal'], ex['num'][
                'lang_instr']

            # zero inputs if specified
            lang_goal = self.zero_input(
                lang_goal) if self.args.zero_goal else lang_goal
            lang_instr = self.zero_input(
                lang_instr) if self.args.zero_instr else lang_instr

            # append goal + instr
            lang_goal_instr = lang_goal + lang_instr
            feat['lang_goal_instr'].append(lang_goal_instr)

            # load Resnet features from disk
            if load_frames and not self.test_mode:
                root = self.get_task_root(ex)
                im = torch.load(os.path.join(root, self.feat_pt))
                keep = [None] * len(ex['plan']['low_actions'])
                for i, d in enumerate(ex['images']):
                    # only add frames linked with low-level actions (i.e. skip filler frames like smooth rotations and dish washing)
                    if keep[d['low_idx']] is None:
                        keep[d['low_idx']] = im[i]
                keep.append(keep[-1])  # stop frame
                feat['frames'].append(torch.stack(keep, dim=0))

            #########
            # outputs
            #########

            if not self.test_mode:
                # low-level action
                feat['action_low'].append(
                    [a['action'] for a in ex['num']['action_low']])

                # low-level action mask
                if load_mask:
                    feat['action_low_mask'].append([
                        self.decompress_mask(a['mask'])
                        for a in ex['num']['action_low']
                        if a['mask'] is not None
                    ])

                # low-level valid interact
                feat['action_low_valid_interact'].append(
                    [a['valid_interact'] for a in ex['num']['action_low']])

        # tensorization and padding
        for k, v in feat.items():
            if k in {'lang_goal_instr'}:
                # language embedding and padding
                seqs = [torch.tensor(vv, device=device) for vv in v]
                pad_seq = pad_sequence(seqs,
                                       batch_first=True,
                                       padding_value=self.pad)
                seq_lengths = np.array(list(map(len, v)))
                embed_seq = self.emb_word(pad_seq)
                packed_input = pack_padded_sequence(embed_seq,
                                                    seq_lengths,
                                                    batch_first=True,
                                                    enforce_sorted=False)
                feat[k] = packed_input
            elif k in {'action_low_mask'}:
                # mask padding
                seqs = [
                    torch.tensor(vv, device=device, dtype=torch.float)
                    for vv in v
                ]
                feat[k] = seqs
            elif k in {'subgoal_progress', 'subgoals_completed'}:
                # auxiliary padding
                seqs = [
                    torch.tensor(vv, device=device, dtype=torch.float)
                    for vv in v
                ]
                pad_seq = pad_sequence(seqs,
                                       batch_first=True,
                                       padding_value=self.pad)
                feat[k] = pad_seq
            else:
                # default: tensorize and pad sequence
                seqs = [
                    torch.tensor(vv,
                                 device=device,
                                 dtype=torch.float if
                                 ('frames' in k) else torch.long) for vv in v
                ]
                pad_seq = pad_sequence(seqs,
                                       batch_first=True,
                                       padding_value=self.pad)
                feat[k] = pad_seq

        return feat
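The language branch of featurize pads the raw token IDs, embeds the padded batch, and only then packs it, letting enforce_sorted=False deal with arbitrary ordering inside the batch. The same pipeline in a minimal sketch (vocabulary and embedding sizes are made up):

import torch
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

pad_id = 0
emb_word = torch.nn.Embedding(num_embeddings=50, embedding_dim=16, padding_idx=pad_id)

token_ids = [torch.tensor([4, 8, 15]), torch.tensor([16, 23]), torch.tensor([42, 4, 8, 15, 16])]
pad_seq = pad_sequence(token_ids, batch_first=True, padding_value=pad_id)   # (3, 5)
seq_lengths = torch.tensor([len(t) for t in token_ids])

embed_seq = emb_word(pad_seq)                                               # (3, 5, 16)
packed_input = pack_padded_sequence(embed_seq, seq_lengths,
                                    batch_first=True, enforce_sorted=False)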
random.seed(44)

n_test = 1000
x_test = next(iter(get_loader(test_data, n_test)))
target_padded, sequence_lengths = pad_packed_sequence(x_test)
target_averages = []
for i, length in enumerate(sequence_lengths):
    arr = target_padded[:,i][:length-1].numpy()
    target_averages.append(arr.mean())
    
batch_sizes = x_test.batch_sizes

num_emph = 4
emph_index = sorted(random.choices(range(len(test_data)), k=num_emph))
emph_packed = pack_padded_sequence(target_padded[:, emph_index, :],
                                   sequence_lengths[emph_index],
                                   enforce_sorted=False)


rae, t_info_rae = get_trained_model(rae, model_name="ToyRAE", training_info=True)
vrae, t_info_vrae = get_trained_model(vrae, model_name="ToyVRAE", training_info=True)
iaf, t_info_iaf = get_trained_model(iaf, model_name="ToyVRAEIAF", training_info=True)

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(14, 6.5))

# %% Recurrent Autoencoder
plt.figure()
plt.plot(t_info_rae["training_loss"])
plt.plot(t_info_rae["validation_loss"])
plt.savefig(figure_directory / "rae_toy_loss.pdf")

#%%
Example No. 57
0
    def _add_embeddings_internal(self, sentences: Union[List[Sentence],
                                                        Sentence]):
        """Add embeddings to all sentences in the given list of sentences. If embeddings are already added, update
        only if embeddings are non-static."""

        # TODO: remove in future versions
        if not hasattr(self, "locked_dropout"):
            self.locked_dropout = None
        if not hasattr(self, "word_dropout"):
            self.word_dropout = None

        if type(sentences) is Sentence:
            sentences = [sentences]

        self.rnn.zero_grad()

        # embed words in the sentence
        self.embeddings.embed(sentences)

        lengths: List[int] = [len(sentence.tokens) for sentence in sentences]
        longest_token_sequence_in_batch: int = max(lengths)

        pre_allocated_zero_tensor = torch.zeros(
            self.embeddings.embedding_length * longest_token_sequence_in_batch,
            dtype=torch.float,
            device=flair.device,
        )

        all_embs: List[torch.Tensor] = list()
        for sentence in sentences:
            all_embs += [
                emb for token in sentence
                for emb in token.get_each_embedding()
            ]
            nb_padding_tokens = longest_token_sequence_in_batch - len(sentence)

            if nb_padding_tokens > 0:
                t = pre_allocated_zero_tensor[:self.embeddings.
                                              embedding_length *
                                              nb_padding_tokens]
                all_embs.append(t)

        sentence_tensor = torch.cat(all_embs).view([
            len(sentences),
            longest_token_sequence_in_batch,
            self.embeddings.embedding_length,
        ])

        # before-RNN dropout
        if self.dropout:
            sentence_tensor = self.dropout(sentence_tensor)
        if self.locked_dropout:
            sentence_tensor = self.locked_dropout(sentence_tensor)
        if self.word_dropout:
            sentence_tensor = self.word_dropout(sentence_tensor)

        # reproject if set
        if self.reproject_words:
            sentence_tensor = self.word_reprojection_map(sentence_tensor)

        # push through RNN
        packed = pack_padded_sequence(sentence_tensor,
                                      lengths,
                                      enforce_sorted=False,
                                      batch_first=True)  # type: ignore
        rnn_out, hidden = self.rnn(packed)
        outputs, output_lengths = pad_packed_sequence(rnn_out,
                                                      batch_first=True)

        # after-RNN dropout
        if self.dropout:
            outputs = self.dropout(outputs)
        if self.locked_dropout:
            outputs = self.locked_dropout(outputs)

        # extract embeddings from RNN
        for sentence_no, length in enumerate(lengths):
            last_rep = outputs[sentence_no, length - 1]

            embedding = last_rep
            if self.bidirectional:
                first_rep = outputs[sentence_no, 0]
                embedding = torch.cat([first_rep, last_rep], 0)

            if self.static_embeddings:
                embedding = embedding.detach()

            sentence = sentences[sentence_no]
            sentence.set_embedding(self.name, embedding)
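The extraction loop above concatenates, for each sentence, the output at its last real token (carrying the forward state) with the output at position 0 (carrying the backward state), yielding one fixed-size embedding per sentence. A self-contained sketch of that step with invented dimensions:

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

rnn_layer = torch.nn.GRU(input_size=8, hidden_size=6, batch_first=True, bidirectional=True)
sentences = torch.randn(3, 10, 8)
lengths = [10, 7, 4]

packed = pack_padded_sequence(sentences, lengths, batch_first=True, enforce_sorted=False)
outputs, _ = pad_packed_sequence(rnn_layer(packed)[0], batch_first=True)

embeddings = []
for i, length in enumerate(lengths):
    last_rep = outputs[i, length - 1]   # forward state at the last real token
    first_rep = outputs[i, 0]           # backward state at the first token
    embeddings.append(torch.cat([first_rep, last_rep], 0))
print(torch.stack(embeddings).shape)    # torch.Size([3, 24])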
for epoch in range(epochs):
    epoch_loss = 0
    for i, (images, inputs, targets) in enumerate(dataloader, 0):
        # print(f"Batch = {i}, Time: {time.time() - start}, Loss: {epoch_loss}")

        images = Variable(images).cuda()
        images = extractor.forward(images)

        k = images.shape[0]
        images = torch.stack([images] * captions_per_image).permute(
            1, 0, 2).contiguous().view(-1, images.shape[-1])
        inputs = inputs.view(-1, max_length, inputs.shape[-1])
        targets = targets.view(-1, max_length)

        inputs = pack_padded_sequence(inputs[:, :-1],
                                      [max_length] * captions_per_image * k,
                                      batch_first=True).cuda()
        targets = pack_padded_sequence(targets[:, 1:],
                                       [max_length] * captions_per_image * k,
                                       batch_first=True).cuda()[0]

        optimizer.zero_grad()
        outputs = generator.forward(images, inputs)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    end = time.time()
    print(f"Epoch: {epoch}, Time: {end - start}, Loss: {epoch_loss}")
Example No. 59
0
    print(f'{n} sizes: {w.size()}')
#print(lstm._flat_weights_names)
#print(lstm._original_flat_names)
optimizer = optim.SGD(lstm.parameters(), lr=0.0001, momentum=0.9)
optimizer.zero_grad()

print(lstm)
input_size = 10
hidden_size = 6
num_layers = 2
seq_length = 7
batch = 6

x = torch.randn(batch, seq_length, input_size) # .to('cuda')
lengths = [7, 5, 5, 2, 1, 1]
x = rnn.pack_padded_sequence(x, lengths, batch_first=True)

# x = torch.rand(10, 1, 256) #.to('cuda')
# y = torch.rand(batch, seq_length, hidden_size * 2) # .to('cuda')
# y = rnn.pack_padded_sequence(y, lengths, batch_first=True).data

with torch.no_grad():
    cpu_out, _ = lstm(x)


lstm.to('cuda')
x = x.to('cuda')

with torch.no_grad():
    gpu_out, _ = lstm(x)
Example No. 60
0
    def step(self,
             x: Dict[str, torch.Tensor],
             y: torch.Tensor,
             batch_idx: int,
             label="train",
             **kwargs):
        """
        Run for each train/val step.
        """
        # pack y sequence if different decoder lengths exist
        if (x["decoder_lengths"] < x["decoder_lengths"].max()).any():
            y = rnn.pack_padded_sequence(y,
                                         lengths=x["decoder_lengths"],
                                         batch_first=True,
                                         enforce_sorted=False)

        if label == "train" and len(self.hparams.monotone_constaints) > 0:
            # calculate gradient with respect to continuous decoder features
            x["decoder_cont"].requires_grad_(True)
            assert not torch._C._get_cudnn_enabled(), (
                "To use monotone constraints, wrap model and training in context "
                "`torch.backends.cudnn.flags(enable=False)`")
            out = self(x, **kwargs)
            out["prediction"] = self.transform_output(out)
            prediction = out["prediction"]

            gradient = torch.autograd.grad(
                outputs=prediction,
                inputs=x["decoder_cont"],
                grad_outputs=torch.ones_like(prediction),  # t
                create_graph=True,  # allows usage in graph
                allow_unused=True,
            )[0]

            # select relevant features
            indices = torch.tensor([
                self.hparams.x_reals.index(name)
                for name in self.hparams.monotone_constaints.keys()
            ])
            monotonicity = torch.tensor(
                [val for val in self.hparams.monotone_constaints.values()],
                dtype=gradient.dtype,
                device=gradient.device)
            # add additional loss if the gradient points in the wrong direction
            gradient = gradient[..., indices] * monotonicity[None, None]
            monotinicity_loss = gradient.clamp_max(0).mean()
            # multiply the monotonicity loss by a large number to ensure relevance and square it
            # for smoothness of the loss function
            monotinicity_loss = 10 * torch.pow(monotinicity_loss, 2)
            if isinstance(self.loss, MASE):
                loss = self.loss(prediction,
                                 y,
                                 encoder_target=x["encoder_target"],
                                 encoder_lengths=x["encoder_lengths"])
            else:
                loss = self.loss(prediction, y)

            loss = loss * (1 + monotinicity_loss)
        else:
            out = self(x, **kwargs)
            out["prediction"] = self.transform_output(out)

            # calculate loss
            prediction = out["prediction"]
            if isinstance(self.loss, MASE):
                loss = self.loss(prediction,
                                 y,
                                 encoder_target=x["encoder_target"],
                                 encoder_lengths=x["encoder_lengths"])
            else:
                loss = self.loss(prediction, y)

        # log
        self._log_metrics(x, y, out, label=label)
        if self.log_interval(label == "train") > 0:
            self._log_prediction(x, out, batch_idx, label=label)
        log = {"loss": loss, "n_samples": x["decoder_lengths"].size(0)}

        return log, out
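The monotone-constraint branch penalizes gradients of the prediction that point against the requested direction for the constrained decoder features. A tiny self-contained sketch of that gradient penalty on a toy model (the linear layer, the constrained feature index, and the stand-in loss are invented for illustration):

import torch

torch.manual_seed(0)
net = torch.nn.Linear(3, 1)
x = torch.randn(8, 3, requires_grad=True)

prediction = net(x)
gradient = torch.autograd.grad(outputs=prediction, inputs=x,
                               grad_outputs=torch.ones_like(prediction),
                               create_graph=True)[0]

# suppose feature 0 should influence the prediction monotonically upward (+1)
monotonicity = torch.tensor([1.0])
directed = gradient[..., [0]] * monotonicity
penalty = 10 * directed.clamp_max(0).mean().pow(2)  # only wrong-direction gradients contribute

base_loss = prediction.pow(2).mean()  # stand-in for the actual task loss
loss = base_loss * (1 + penalty)
loss.backward()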