Example #1
def base_plus_copy_indices(words, dynamic_vocabs, base_vocab, volatile=False):
    """Compute base + copy indices.
    
    Args:
        words (list[list[unicode]])
        dynamic_vocabs (list[HardCopyDynamicVocab])
        base_vocab (HardCopyVocab)
        volatile (bool)

    Returns:
        MultiVocabIndices
    """
    unk = base_vocab.UNK
    copy_seqs = []
    for seq, dyna_vocab in izip(words, dynamic_vocabs):
        word_to_copy = dyna_vocab.word_to_copy_token
        normal_copy_seq = []
        for w in seq:
            normal_copy_seq.append(word_to_copy.get(w, unk))
        copy_seqs.append(normal_copy_seq)

    # each SeqBatch.values has shape (batch_size, seq_length)
    base_indices = SequenceBatch.from_sequences(words, base_vocab, volatile=volatile)
    copy_indices = SequenceBatch.from_sequences(copy_seqs, base_vocab, volatile=volatile)

    assert_tensor_equal(base_indices.mask, copy_indices.mask)

    # has shape (batch_size, seq_length, 2)
    concat_values = torch.stack([base_indices.values, copy_indices.values], 2)

    return MultiVocabIndices(concat_values, base_indices.mask)
Example #2
    def preprocess(self, source_words, insert_words, insert_exact_words,
                   delete_words, delete_exact_words, edit_embed):
        """Preprocess.

        Args:
            source_words (list[list[unicode]]): a batch of source sequences
            insert_words (list[list[unicode]]): a batch of insert words
            insert_exact_words (list[list[unicode]]): a batch of insert words, used without noise
            delete_words (list[list[unicode]]): a batch of delete words
            delete_exact_words (list[list[unicode]]): a batch of delete words, used without noise
            edit_embed (np.ndarray | None): of shape (batch_size, edit_dim), or None.

        Returns:
            EncoderInput
        """
        return EncoderInput(
            SequenceBatch.from_sequences(source_words, self.word_vocab),
            SequenceBatch.from_sequences(insert_words,
                                         self.word_vocab,
                                         min_seq_length=1),
            SequenceBatch.from_sequences(insert_exact_words,
                                         self.word_vocab,
                                         min_seq_length=1),
            SequenceBatch.from_sequences(delete_words,
                                         self.word_vocab,
                                         min_seq_length=1),
            SequenceBatch.from_sequences(delete_exact_words,
                                         self.word_vocab,
                                         min_seq_length=1), edit_embed)
Example #3
    def __init__(self, target_words, word_vocab, keep_rate):
        """Create decoder input.

        Args:
            target_words (list[list[unicode]])
            word_vocab (WordVocab)
            keep_rate (float)
        """
        input_words = [[word_vocab.START] + tokens for tokens in target_words]
        target_words_shifted = [tokens + [word_vocab.STOP] for tokens in target_words]

        input_words = SequenceBatch.from_sequences(input_words, word_vocab)
        self.input_words = self._drop_seq_batch(input_words, word_vocab, keep_rate)
        self.target_words = SequenceBatch.from_sequences(target_words_shifted,
                                                         word_vocab)
Example #4
    def __init__(self, target_words, word_vocab):
        """Create TrainDecoderInput.
        
        Args:
            target_words (list[list[unicode]])
            word_vocab (WordVocab)
        """
        input_words = [[word_vocab.START] + tokens for tokens in target_words]  # prepend with <start> token
        target_words_shifted = [tokens + [word_vocab.STOP] for tokens in target_words]  # append with <stop> token

        self.input_words = SequenceBatch.from_sequences(input_words, word_vocab)
        self.target_words = SequenceBatch.from_sequences(target_words_shifted, word_vocab)
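
For concreteness, the shifting performed above can be illustrated on a tiny batch; the START/STOP spellings below are placeholders for illustration only, not necessarily what WordVocab uses:

START, STOP = u'<start>', u'<stop>'  # hypothetical token spellings, for illustration only

target_words = [[u'the', u'cat'], [u'hi']]
input_words = [[START] + tokens for tokens in target_words]          # decoder inputs
target_words_shifted = [tokens + [STOP] for tokens in target_words]  # prediction targets

assert input_words == [[u'<start>', u'the', u'cat'], [u'<start>', u'hi']]
assert target_words_shifted == [[u'the', u'cat', u'<stop>'], [u'hi', u'<stop>']]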
Example #5
    def _get_neighbor_indices(self, dom_elements, is_neighbor):
        """Compute neighbor indices.

        Args:
            dom_elements (list[list[DOMElement]]): a batch of DOM elements,
                which may include PAD elements
            is_neighbor (Callable: DOMElement x DOMElement --> bool): True if
                two DOM elements are neighbors of each other, otherwise False

        Returns:
            SequenceBatch: of shape (total_dom_elems, max_neighbors)
        """
        dom_element_ids = [id(e) for e in flatten(dom_elements)]
        vocab = SuperSimpleVocab(dom_element_ids)

        neighbors_batch = []
        for dom_batch in dom_elements:
            for dom_elem in dom_batch:
                # Optimization: no DOM PAD has neighbors
                if isinstance(dom_elem, DOMElementPAD):
                    neighbors = []
                else:
                    neighbors = []
                    for neighbor in dom_batch:
                        if is_neighbor(dom_elem, neighbor):
                            neighbors.append(id(neighbor))

                neighbors_batch.append(neighbors)

        neighbor_indices = SequenceBatch.from_sequences(neighbors_batch,
                                                        vocab,
                                                        min_seq_length=1)
        return neighbor_indices
Example #6
    def test_embed(self):
        sequences = [
            [],
            [1, 2, 3],
            [3, 3],
            [2]
        ]

        vocab = SimpleVocab([0, 1, 2, 3, 4])
        indices = SequenceBatch.from_sequences(sequences, vocab)

        embeds = GPUVariable(torch.FloatTensor([
            [0, 0],
            [2, 2],   # 1
            [3, 4],   # 2
            [-10, 1], # 3
            [11, -1]  # 4
        ]))

        embedded = SequenceBatch.embed(indices, embeds)

        correct = np.array([
            [[0, 0], [0, 0], [0, 0]],
            [[2, 2], [3, 4], [-10, 1]],
            [[-10, 1], [-10, 1], [0, 0]],
            [[3, 4], [0, 0], [0, 0]]
        ], dtype=np.float32)
        assert_tensor_equal(embedded.values, correct)
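
The expected tensor above is consistent with a plain row lookup into the embedding table followed by zeroing of padded positions (here the padding row embeds[0] is already all zeros, so the two readings coincide). A minimal hypothetical sketch of that behaviour, not the actual SequenceBatch.embed:

import torch

def lookup_and_mask(values, mask, embedding_table):
    # values: (batch_size, seq_len) LongTensor of vocab indices
    # mask:   (batch_size, seq_len) FloatTensor of 0/1 entries
    # embedding_table: (vocab_size, embed_dim) FloatTensor
    embedded = embedding_table[values]    # gather rows: (batch_size, seq_len, embed_dim)
    return embedded * mask.unsqueeze(-1)  # zero out padded positions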
Example #7
    def forward(self, utterances):
        """Embeds a batch of utterances.

        Args:
            utterances (list[list[unicode]]): list[unicode] is a list of tokens
                forming a sentence. list[list[unicode]] is a batch of sentences.

        Returns:
            Variable[FloatTensor]: batch x lstm_dim
                (concatenated first and last hidden states)
        """
        # Cut to max_words + look up indices
        utterances = [
            utterance[:self._max_words] + [EOS] for utterance in utterances
        ]
        token_indices = SequenceBatch.from_sequences(
            utterances, self._token_embedder.vocab)
        # batch x seq_len x token_embed_dim
        token_embeds = self._token_embedder.embed_seq_batch(token_indices)
        bi_hidden_states = self._bilstm(token_embeds.split())
        final_states = torch.cat(bi_hidden_states.final_states, 1)

        hidden_states = SequenceBatch.cat(bi_hidden_states.combined_states)
        return self._attention(hidden_states, final_states).context
Example #8
    def forward(self, utterance):
        """Embeds a batch of utterances.

        Args:
            utterance (list[list[unicode]]): list[unicode] is a list of tokens
                forming a sentence. list[list[unicode]] is a batch of sentences.

        Returns:
            Variable[FloatTensor]: batch x lstm_dim
                (concatenated first and last hidden states)
            list[SequenceBatchElement]: list of length batch, where each
                element's values is seq_len x embed_dim and mask is seq_len,
                representing the hidden states of each token.
        """
        # Make keys hashable
        utterance = [tuple(utt) for utt in utterance]

        uncached_utterances = self._cache.uncached_keys(utterance)

        # Cache the uncached utterances
        if len(uncached_utterances) > 0:
            token_indices = SequenceBatch.from_sequences(
                    uncached_utterances, self._token_embedder.vocab)
            # batch x seq_len x token_embed_dim
            token_embeds = self._token_embedder.embed_seq_batch(token_indices)

            bi_hidden_states = self._bilstm(token_embeds.split())
            final_states = torch.cat(bi_hidden_states.final_states, 1)

            # Store the combined states in batch x stuff order for caching.
            combined_states = bi_hidden_states.combined_states
            # batch x seq_len x embed_dim
            combined_values = torch.stack(
                    [state.values for state in combined_states], 1)
            # batch x seq_len
            combined_masks = torch.stack(
                    [state.mask for state in combined_states], 1)
            assert len(combined_values) == len(combined_masks)
            combined_states_by_batch = [SequenceBatchElement(
                value, mask) for value, mask in zip(
                    combined_values, combined_masks)]

            assert len(final_states) == len(combined_states_by_batch)
            self._cache.cache(
                list(uncached_utterances),
                list(zip(final_states, combined_states_by_batch)))

        final_states, combined_states = zip(*self._cache.get(utterance))
        return torch.stack(final_states, 0), combined_states
Example #9
def embed_tokens(token_embedder, max_words, node_texts):
    """
    Args:
        token_embedder (TokenEmbedder)
        max_words (int)
        node_texts (list[list[str]]): a batch of token lists, one per node
    """
    # Truncate to max_words - 1 tokens and append EOS, then look up indices
    texts = [text[:max_words - 1] + [EOS] for text in node_texts]
    token_indices = SequenceBatch.from_sequences(texts, token_embedder.vocab, min_seq_length=max_words)
    # batch x seq_len x token_embed_dim
    token_embeds = token_embedder.embed_seq_batch(token_indices)
    return token_embeds
Example #10
    def input_embeds_list(self):
        sequences = [
            [1, 2, 3],
            [8, 4, 2, 1, 1],
            [],
        ]

        # token 1 maps to embedding [1], 2 maps to [2] and so on...
        vocab = SimpleVocab([1, 2, 3, 4, 5, 6, 7, 8])
        array = np.expand_dims(np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.float32), 1)
        token_embedder = TokenEmbedder(Bunch(vocab=vocab, array=array))

        seq_embeds = token_embedder.embed_seq_batch(SequenceBatch.from_sequences(sequences, vocab))
        return seq_embeds.split()
Example #11
    def test_from_sequences(self, sequences, vocab):
        seq_batch = SequenceBatch.from_sequences(sequences, vocab)

        assert_tensor_equal(seq_batch.values,
                            np.array([
                                [1, 2, 2, 3],
                                [3, 0, 0, 0],
                                [0, 0, 0, 0],
                            ], dtype=np.int32))

        assert_tensor_equal(seq_batch.mask,
                            np.array([
                                [1, 1, 1, 1],
                                [1, 0, 0, 0],
                                [0, 0, 0, 0],
                            ], dtype=np.float32))
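
The expected values and mask above suggest right-padding with index 0 alongside a parallel 0/1 mask. A minimal hypothetical sketch of that behaviour in plain numpy, not the actual SequenceBatch.from_sequences:

import numpy as np

def pad_and_mask(sequences, word_to_index, min_seq_length=1, pad_index=0):
    # Hypothetical helper mirroring the padding/mask behaviour the tests check.
    max_len = max([len(seq) for seq in sequences] + [min_seq_length])
    values = np.full((len(sequences), max_len), pad_index, dtype=np.int32)
    mask = np.zeros((len(sequences), max_len), dtype=np.float32)
    for i, seq in enumerate(sequences):
        for j, token in enumerate(seq):
            values[i, j] = word_to_index[token]
            mask[i, j] = 1.0
    return values, mask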
Example #12
    def embed(self, sequences):
        for seq in sequences:
            if len(seq) == 0:
                raise ValueError("Cannot embed empty sequence.")

        token_indices = SequenceBatch.from_sequences(sequences, self.vocab, min_seq_length=1)
        token_embeds = self.token_embedder.embed_seq_batch(token_indices)  # SequenceBatch of size (batch_size, max_seq_length, word_dim)
        if self.pool == 'sum':
            pooled_token_embeds = SequenceBatch.reduce_sum(token_embeds)  # (batch_size, word_dim)
        elif self.pool == 'mean':
            pooled_token_embeds = SequenceBatch.reduce_mean(token_embeds)  # (batch_size, word_dim)
        elif self.pool == 'max':
            pooled_token_embeds = SequenceBatch.reduce_max(token_embeds)  # (batch_size, word_dim)
        else:
            raise ValueError(self.pool)

        seq_embeds = self.transform(pooled_token_embeds)  # (batch_size, embed_dim)
        assert seq_embeds.size()[1] == self.embed_dim

        return seq_embeds
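
A masked mean over the time dimension is one plausible reading of the reduce_mean pooling branch above; a minimal hypothetical sketch, not the library's actual SequenceBatch.reduce_mean:

import torch

def masked_mean(values, mask, eps=1e-8):
    # values: (batch_size, seq_len, word_dim), mask: (batch_size, seq_len)
    mask = mask.unsqueeze(-1)               # (batch_size, seq_len, 1)
    total = (values * mask).sum(dim=1)      # sum of unmasked embeddings
    count = mask.sum(dim=1).clamp(min=eps)  # number of real tokens, avoid div by zero
    return total / count                    # (batch_size, word_dim)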
Example #13
    def forward(self, utterances):
        """Embeds an utterances.

        Args:
            utterances (list[list[str]]): list[str] is a list of tokens
                forming a sentence. list[list[str]] is a batch of sentences.

        Returns:
            Tensor: batch x word_embed_dim (average of word vectors)
        """
        # Cut to max_words + look up indices
        utterances = [
            utterance[:self._max_words] + [EOS] for utterance in utterances
        ]
        token_indices = SequenceBatch.from_sequences(
            utterances, self._token_embedder.vocab)
        # batch x seq_len x token_embed_dim
        token_embeds = self._token_embedder.embed_seq_batch(token_indices)
        # batch x token_embed_dim
        averaged = SequenceBatch.reduce_mean(token_embeds)
        return averaged
Example #14
def base_plus_copy_indices(words, dynamic_vocabs, base_vocab, volatile=False):
    """Compute base + copy indices.
    
    Args:
        words (list[list[unicode]])
        dynamic_vocabs (list[HardCopyDynamicVocab])
        base_vocab (HardCopyVocab)
        volatile (bool)

    Returns:
        MultiVocabIndices
    """
    unk = base_vocab.UNK
    copy_seqs = []
    for seq, dyna_vocab in izip(words, dynamic_vocabs):
        word_to_copy = dyna_vocab.word_to_copy_token
        normal_copy_seq = []
        try:
            for w in seq:
                normal_copy_seq.append(word_to_copy.get(w, unk))
        except Exception as e:
            # Lookup failed (e.g. seq was nested and w was unhashable): flatten and retry
            print(e)
            seq = flat(seq)
            for w in seq:
                normal_copy_seq.append(word_to_copy.get(w, unk))

        copy_seqs.append(normal_copy_seq)

    # each SeqBatch.values has shape (batch_size, seq_length)
    base_indices = SequenceBatch.from_sequences(words,
                                                base_vocab,
                                                volatile=volatile)
    copy_indices = SequenceBatch.from_sequences(copy_seqs,
                                                base_vocab,
                                                volatile=volatile)

    try:
        assert_tensor_equal(base_indices.mask, copy_indices.mask)
    except Exception:
        # The masks differ in length: right-pad the shorter SequenceBatch's
        # values and mask with zeros so the two batches line up.
        diff = abs(base_indices.mask.size(1) - copy_indices.mask.size(1))

        def pad_right(seq_batch):
            values, mask = seq_batch.values, seq_batch.mask
            padded_values = torch.cat(
                (values, torch.zeros((values.size(0), diff),
                                     dtype=values.dtype,
                                     device=values.device)), dim=1)
            padded_mask = torch.cat(
                (mask, torch.zeros((mask.size(0), diff),
                                   dtype=mask.dtype,
                                   device=mask.device)), dim=1)
            return SequenceBatch(padded_values, padded_mask)

        if base_indices.mask.size(1) < copy_indices.mask.size(1):
            base_indices = pad_right(base_indices)
        else:
            copy_indices = pad_right(copy_indices)

    # has shape (batch_size, seq_length, 2)
    concat_values = torch.stack([base_indices.values, copy_indices.values], 2)

    return MultiVocabIndices(concat_values, base_indices.mask)
Example #15
    def preprocess(self, examples):
        return SequenceBatch.from_sequences(examples, self.word_vocab)
Example #16
    def embed(self, sequences):
        sequence_indices = SequenceBatch.from_sequences(sequences, self.vocab, min_seq_length=1)  # (batch_size, max_seq_length)
        sequence_embeds = self.token_embedder.embed_seq_batch(sequence_indices)  # (batch_size, max_seq_length, word_dim)
        sequence_embeds_list = sequence_embeds.split()
        hidden_states_list = self.source_encoder(sequence_embeds_list)
        return hidden_states_list[-1].values  # (batch_size, embed_dim)
Example #17
    def test_min_seq_length(self, vocab):
        seq_batch = SequenceBatch.from_sequences([[], [], []], vocab, min_seq_length=2)
        assert_tensor_equal(seq_batch.values, np.zeros((3, 2)))
        assert_tensor_equal(seq_batch.mask, np.zeros((3, 2)))
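
Read against the hypothetical pad_and_mask sketch under Example #11, min_seq_length simply forces a minimum number of columns even when every sequence is empty:

values, mask = pad_and_mask([[], [], []], word_to_index={}, min_seq_length=2)
# values and mask are both (3, 2) arrays of zeros, matching the assertions above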