Example no. 1
    def test(self):
        token_vocab = SimpleVocab(u'a b c d'.split())
        sequences = [
            ['a', 'b', 'c', 'd'],
            ['c', 'd'],
        ]

        correct_embeds = np.array([
            [1, 2, 0, 3, 4, 1, 5, 6, 0, 7, 8, 1],
            [5, 6, 0, 7, 8, 1, 0, 0, 0, 0, 0, 0],
        ], dtype=np.float32)

        with clean_session():
            token_embeds = tf.constant([
                [1, 2, 0],
                [3, 4, 1],
                [5, 6, 0],
                [7, 8, 1],
            ], dtype=tf.float32)
            model = ConcatSequenceEmbedder(token_embeds)
            test_embeds = model.compute(model.embeds, sequences, token_vocab)

        assert_array_almost_equal(correct_embeds, test_embeds, decimal=5)
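The expected values follow directly from the token_embeds rows: each token is looked up in the vocab, the corresponding rows are concatenated left-aligned, and the shorter sequence is zero-padded on the right up to the batch maximum length. A minimal NumPy sketch of that behavior (the helper concat_embed and the plain-dict vocab are illustrative stand-ins, not part of the library):

    import numpy as np

    def concat_embed(sequences, vocab, token_embeds, seq_length):
        # Concatenate embedding rows; zero-pad short sequences on the right.
        embed_dim = token_embeds.shape[1]
        out = np.zeros((len(sequences), seq_length * embed_dim), dtype=np.float32)
        for i, seq in enumerate(sequences):
            for j, token in enumerate(seq):
                out[i, j * embed_dim:(j + 1) * embed_dim] = token_embeds[vocab[token]]
        return out

    vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3}  # stand-in for SimpleVocab
    token_embeds = np.array([[1, 2, 0], [3, 4, 1], [5, 6, 0], [7, 8, 1]], np.float32)
    concat_embed([['a', 'b', 'c', 'd'], ['c', 'd']], vocab, token_embeds, 4)
    # -> matches correct_embeds above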
Example no. 2
    def test(self):
        token_vocab = SimpleVocab(u'a b c d'.split())
        sequences = [
            ['a', 'b', 'c', 'd'],
            ['c', 'd'],
        ]

        correct_embeds = np.array(
            [[3, 4, 1, 5, 6, 0, 7, 8, 1], [0, 0, 0, 5, 6, 0, 7, 8, 1]],
            dtype=np.float32)

        with clean_session():
            token_embeds = tf.constant([
                [1, 2, 0],
                [3, 4, 1],
                [5, 6, 0],
                [7, 8, 1],
            ], dtype=tf.float32)
            model = ConcatSequenceEmbedder(token_embeds,
                                           seq_length=3,
                                           align='right')
            test_embeds = model.compute(model.embeds, sequences, token_vocab)

            # check that static shape inference works
            assert model.embeds.get_shape().as_list() == [None, 3 * 3]

        assert_array_almost_equal(correct_embeds, test_embeds, decimal=5)
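Compared with Example no. 1, seq_length=3 with align='right' keeps only the last three tokens of ['a', 'b', 'c', 'd'] and left-pads ['c', 'd'], which is exactly what correct_embeds encodes. A sketch of the alignment step alone (plain Python; the helper name right_align is illustrative):

    def right_align(seq, seq_length, pad='<pad>'):
        # Keep the last seq_length tokens; left-pad shorter sequences.
        seq = seq[-seq_length:]
        return [pad] * (seq_length - len(seq)) + seq

    right_align(['a', 'b', 'c', 'd'], 3)  # ['b', 'c', 'd'] -> [3,4,1, 5,6,0, 7,8,1]
    right_align(['c', 'd'], 3)            # ['<pad>', 'c', 'd'] -> [0,0,0, 5,6,0, 7,8,1]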
Example no. 3
    def test_embed(self):
        sequences = [
            [],
            [1, 2, 3],
            [3, 3],
            [2]
        ]

        vocab = SimpleVocab([0, 1, 2, 3, 4])
        indices = SequenceBatch.from_sequences(sequences, vocab)

        embeds = GPUVariable(torch.FloatTensor([
            [0, 0],
            [2, 2],   # 1
            [3, 4],   # 2
            [-10, 1], # 3
            [11, -1]  # 4
        ]))

        embedded = SequenceBatch.embed(indices, embeds)

        correct = np.array([
            [[0, 0], [0, 0], [0, 0]],
            [[2, 2], [3, 4], [-10, 1]],
            [[-10, 1], [-10, 1], [0, 0]],
            [[3, 4], [0, 0], [0, 0]]
        ], dtype=np.float32)
        assert_tensor_equal(embedded.values, correct)
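The lookup is an ordinary embedding gather: assuming from_sequences pads every sequence with index 0 (whose embedding row is [0, 0]) up to the batch maximum length of 3, the expected tensor can be reproduced directly:

    import torch

    padded = torch.tensor([[0, 0, 0], [1, 2, 3], [3, 3, 0], [2, 0, 0]])
    embeds = torch.tensor([[0., 0.], [2., 2.], [3., 4.], [-10., 1.], [11., -1.]])
    embeds[padded]  # shape (4, 3, 2), equal to `correct` above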
Example no. 4
 def inputs(self):
     token_vocab = SimpleVocab(['<pad>'] + u'a b c d'.split())
     sequences = [
         ['a', 'c'],
         ['b', 'c', 'c'],
         ['d', 'c', 'a'],
     ]
     return self.as_args_kwargs(sequences, token_vocab)
Example no. 5
    def test_lstm(self):
        """Test whether the mask works properly for LSTM embedder."""
        token_vocab = SimpleVocab(u'a b c d'.split())
        sequences = [
            ['a', 'b', 'c', 'd'],
            ['c', 'd'],
            ['a', 'b', 'c', 'd'],
        ]
        sequences_alt = [
            ['a', 'b', 'c', 'd', 'a', 'b', 'd', 'c'],
            ['b', 'a', 'd'],
            ['c', 'd'],
        ]

        with clean_session():
            token_embeds = tf.constant([
                [1, 2, 0],
                [3, 4, 1],
                [5, 6, 0],
                [7, 8, 1],
            ], dtype=tf.float32)

            model = LSTMSequenceEmbedder(token_embeds,
                                         seq_length=4,
                                         hidden_size=7)
            test_embeds, test_hidden_states = model.compute(
                [model.embeds, model.hidden_states.values], sequences,
                token_vocab)
            assert test_embeds.shape == (3, 7)
            assert test_hidden_states.shape == (3, 4, 7)
            # Padded positions should have the same hidden states
            assert_array_almost_equal(test_hidden_states[1, 1, :],
                                      test_hidden_states[1, 2, :],
                                      decimal=5)
            assert_array_almost_equal(test_hidden_states[1, 1, :],
                                      test_hidden_states[1, 3, :],
                                      decimal=5)

            # Try again but with different paddings
            # Should get the same result for ['c', 'd']
            big_model = LSTMSequenceEmbedder(token_embeds,
                                             seq_length=8,
                                             hidden_size=7)
            big_model.weights = model.weights  # match weights

            test_embeds_alt, test_hidden_states_alt = big_model.compute(
                [big_model.embeds, big_model.hidden_states.values],
                sequences_alt, token_vocab)
            assert test_embeds_alt.shape == (3, 7)
            assert test_hidden_states_alt.shape == (3, 8, 7)

        assert_array_almost_equal(test_embeds[1, :],
                                  test_embeds_alt[2, :],
                                  decimal=5)
        assert_array_almost_equal(test_hidden_states[1, :2, :],
                                  test_hidden_states_alt[2, :2, :],
                                  decimal=5)
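The hidden-state assertions hold because ['c', 'd'] occupies only the first two time steps; a masked recurrence typically freezes the state at padded steps, which is the behavior being tested. A sketch of that standard pattern (not necessarily this library's exact implementation):

    def masked_step(cell, x_t, h_prev, mask_t):
        # mask_t is 1 for real tokens and 0 for padding; padded steps
        # carry the previous hidden state through unchanged.
        h_new = cell(x_t, h_prev)
        return mask_t * h_new + (1 - mask_t) * h_prev

Example no. 6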
 def base_pred_embeddings(self):
     array = np.array([
         [0, 0, 0, 0],
         [1, 2, 3, 4],
         [0, 2, 0, 8],
     ], dtype=np.float32)
     vocab = SimpleVocab(u'<unk> b0 b1'.split())
     return SimpleEmbeddings(array, vocab)
Example no. 7
 def model(self):
     array = np.array([
         [1, 2, 3],
         [2, 4, 6],
         [3, 5, 7],
     ], dtype=np.float32)
     vocab = SimpleVocab(u'a b c'.split())
     embeddings = SimpleEmbeddings(array, vocab)
     return TokenEmbedder(embeddings, 'token_embeds')
Example no. 8
    def test_lstm(self):
        """Test whether the mask works properly for bidi LSTM embedder."""
        token_vocab = SimpleVocab('a b c d'.split())
        sequences = [
            ['a', 'b', 'c', 'd'],
            ['c', 'd'],
            ['a', 'b', 'c', 'd'],
        ]
        sequences_alt = [
            ['a', 'b', 'c', 'd', 'a', 'b', 'd', 'c'],
            ['b', 'a', 'd'],
            ['c', 'd'],
        ]

        with clean_session():
            token_embeds = tf.constant([
                [1, 2, 0],
                [3, 4, 1],
                [5, 6, 0],
                [7, 8, 1],
            ], dtype=tf.float32)

            model = BidiLSTMSequenceEmbedder(token_embeds, seq_length=4, hidden_size=7)
            test_embeds, test_hidden_states = model.compute(
                    [model.embeds, model.hidden_states.values],
                    sequences, token_vocab)
            assert test_embeds.shape == (3, 14)
            assert test_hidden_states.shape == (3, 4, 14)
            assert_array_almost_equal(test_embeds[1, :7], test_hidden_states[1, 1, :7], decimal=5)
            assert_array_almost_equal(test_embeds[1, 7:], test_hidden_states[1, 0, 7:], decimal=5)
            # Padded positions should have the same forward hidden states
            assert_array_almost_equal(test_hidden_states[1, 1, :7], test_hidden_states[1, 2, :7], decimal=5)
            assert_array_almost_equal(test_hidden_states[1, 1, :7], test_hidden_states[1, 3, :7], decimal=5)
            # Padded positions should have zero backward hidden states
            assert_array_almost_equal(np.zeros((7,)), test_hidden_states[1, 2, 7:], decimal=5)
            assert_array_almost_equal(np.zeros((7,)), test_hidden_states[1, 3, 7:], decimal=5)
            # Non-padded positions should have nonzero states (with very high probability)
            assert np.linalg.norm(test_hidden_states[1, 0, :7]) > 1e-5
            assert np.linalg.norm(test_hidden_states[1, 1, :7]) > 1e-5
            assert np.linalg.norm(test_hidden_states[1, 0, 7:]) > 1e-5
            assert np.linalg.norm(test_hidden_states[1, 1, 7:]) > 1e-5

            # Try again but with different paddings
            # Should get the same result for ['c', 'd']
            big_model = BidiLSTMSequenceEmbedder(token_embeds, seq_length=8, hidden_size=7)
            big_model.weights = model.weights  # match weights

            test_embeds_alt, test_hidden_states_alt = big_model.compute(
                    [big_model.embeds, big_model.hidden_states.values],
                    sequences_alt, token_vocab)
            assert test_embeds_alt.shape == (3, 14)
            assert test_hidden_states_alt.shape == (3, 8, 14)

        assert_array_almost_equal(test_embeds[1, :], test_embeds_alt[2, :], decimal=5)
        assert_array_almost_equal(test_hidden_states[1, :2, :],
                                  test_hidden_states_alt[2, :2, :], decimal=5)
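Example no. 9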
    def __init__(self, embed_dim):
        OBJECT = 'object'
        LIST = 'list'

        tokens = [
            OBJECT,
            LIST,
            'r',
            'y',
            'g',
            'o',
            'p',
            'b',
            'e',  # 7 colors
            'color-na',  # if an Alchemy beaker is empty or has multiple colors
            # TODO(kelvin): change the behavior of RLongAlchemyObject.color to return `color-na`
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,  # 0 index is used to represent things that are not visible
            -1,
            'X1/1',
            '0',
            '1',
            '2',
            '3',
            '4',  # Shapes!
        ]
        vocab = SimpleVocab(tokens)
        vocab.OBJECT = OBJECT
        vocab.LIST = LIST

        array = emulate_distribution((len(vocab), embed_dim),
                                     GloveEmbeddings(5000).array,
                                     seed=3)
        super(RLongPrimitiveEmbeddings, self).__init__(array, vocab)
Example no. 10
    def test_no_sequences(self):
        vocab = SimpleVocab('a b c'.split())
        sequences = []

        with clean_session():
            model = FeedSequenceBatch()
            indices = tf.identity(model.values)
            mask = tf.identity(model.mask)
            indices_val, mask_val = model.compute([indices, mask], sequences, vocab)
            assert indices_val.shape == mask_val.shape == (0, 0)
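Example no. 11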
    def __init__(self, tokens, embeds):
        """

        Args:
            tokens (list[unicode])
            embeds (np.array)
        """
        self.vocab = SimpleVocab(tokens)
        self._embeds = tf.constant(embeds, dtype=tf.float32)
        self._embed_dim = embeds.shape[1]
Example no. 12
    def __init__(self, embed_dim):
        OBJECT = 'object'
        LIST = 'list'

        tokens = [
            OBJECT, LIST,
            'r', 'y', 'g', 'o', 'p', 'b', 'e',  # 7 colors
            'color-na',  # if an Alchemy beaker is empty or has multiple colors
            # TODO(kelvin): change the behavior of RLongAlchemyObject.color to return `color-na`
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  # 0 index is used to represent things that are not visible
            -1,
            'X1/1',
            '0', '1', '2', '3', '4',  # Shapes!
        ]
        vocab = SimpleVocab(tokens)
        vocab.OBJECT = OBJECT
        vocab.LIST = LIST

        array = emulate_distribution((len(vocab), embed_dim), GloveEmbeddings(5000).array, seed=3)
        super(RLongPrimitiveEmbeddings, self).__init__(array, vocab)
Example no. 13
    def embeddings(self):
        array = np.array([
            [0, 1, 2],
            [3, 4, 5],
            [6, 7, 8],
            [9, 10, 11],
            [12, 13, 14],
            [15, 16, 17],
        ], dtype=np.float32)

        vocab = SimpleVocab(['<pad>', 'a', 'b', 'c', 'd', 'e'])
        return SimpleEmbeddings(array, vocab)
Example no. 14
    def test_multi_vocab_indices(self):
        vocabs = [
            [SimpleVocab('a b c d e'.split()), SimpleVocab('x y z'.split())],
            [SimpleVocab('e d c b a'.split()), SimpleVocab('y z x'.split())],
        ]

        sequences = [
            'a b a e'.split(),
            'y y y x z'.split(),
        ]

        indices = SequenceBatch.multi_vocab_indices(sequences, vocabs)

        assert_tensor_equal(indices.values, [
            [[0, 4], [1, 3], [0, 4], [4, 0], [0, 0]],
            [[1, 0], [1, 0], [1, 0], [0, 2], [2, 1]],
        ])

        assert_tensor_equal(indices.mask, [
            [1, 1, 1, 1, 0],
            [1, 1, 1, 1, 1],
        ])
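Reading the expected values: each token of sequence i is indexed under vocabs[k][i] for every k, and sequences are padded to the batch maximum length (the 0 in the mask marks the pad position). A quick check of the first row with plain dicts:

    v0 = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}  # 'a b c d e'
    v1 = {'e': 0, 'd': 1, 'c': 2, 'b': 3, 'a': 4}  # 'e d c b a'
    [[v0[t], v1[t]] for t in 'a b a e'.split()]
    # -> [[0, 4], [1, 3], [0, 4], [4, 0]]; the trailing [0, 0] entry is padding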
Example no. 15
    def input_embeds_list(self):
        sequences = [
            [1, 2, 3],
            [8, 4, 2, 1, 1],
            [],
        ]

        # token 1 maps to embedding [1], 2 maps to [2] and so on...
        vocab = SimpleVocab([1, 2, 3, 4, 5, 6, 7, 8])
        array = np.expand_dims(np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.float32), 1)
        token_embedder = TokenEmbedder(Bunch(vocab=vocab, array=array))

        seq_embeds = token_embedder.embed_seq_batch(SequenceBatch.from_sequences(sequences, vocab))
        return seq_embeds.split()
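Example no. 16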
 def __init__(self, embed_dim, all_types):
     vocab = SimpleVocab(all_types)
     array = emulate_distribution((len(vocab), embed_dim),
                                  GloveEmbeddings(5000).array,
                                  seed=1)
     super(TypeEmbeddings, self).__init__(array, vocab)
Example no. 17
 def embedder(self, request):
     vocab = SimpleVocab(['<unk>', '<start>', '<stop>'] + ['a', 'b', 'c'])
     arr = np.eye(len(vocab), dtype=np.float32)
     word_embeddings = Bunch(vocab=vocab, array=arr)
     return TokenEmbedder(word_embeddings, trainable=request.param)
Example no. 18
def vocab():
    return SimpleVocab(['a', 'b', 'c'])
Example no. 19
 def test_save_load(self, vocab, tmpdir):
     path = str(tmpdir.join('vocab.txt'))
     vocab.save(path)
     new_vocab = SimpleVocab.load(path)
     assert vocab == new_vocab
Example no. 20
 def __init__(self, embed_dim):
     bool_vocab = SimpleVocab([True, False])
     embed_matrix = np.random.uniform(
             -np.sqrt(3. / embed_dim), np.sqrt(3. / embed_dim),
             size=(len(bool_vocab), embed_dim)).astype(np.float32)
     super(BoolEmbeddings, self).__init__(embed_matrix, bool_vocab)
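The bound sqrt(3 / embed_dim) makes each coordinate of the uniform distribution have variance (sqrt(3 / embed_dim))^2 / 3 = 1 / embed_dim, so every embedding has an expected squared norm of 1. A quick numerical check:

    import numpy as np

    d = 64
    a = np.sqrt(3. / d)
    m = np.random.uniform(-a, a, size=(100000, d))
    print(np.mean(np.sum(m ** 2, axis=1)))  # ~= 1.0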
Example no. 21
 def vocab(self):
     return SimpleVocab(['<unk>', 'a', 'b', 'c', '<start>', '<stop>'])
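Example no. 22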
 def utterances(self):
     tokens = sorted(list(self._utterance_set))
     return SimpleVocab(tokens)
Example no. 23
 def test_save_load(self, vocab, tmpdir):
     path = str(tmpdir.join('vocab.txt'))
     vocab.save(path)
     new_vocab = SimpleVocab.load(path)
     assert vocab == new_vocab