Example #1
    def test_multilabel_field_empty_field_works(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
        vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

        f = MultiLabelField([], label_namespace="test_empty_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
Example #2
    def test_multilabel_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("rel0", namespace="rel_labels")
        vocab.add_token_to_namespace("rel1", namespace="rel_labels")
        vocab.add_token_to_namespace("rel2", namespace="rel_labels")

        f = MultiLabelField(["rel1", "rel0"], label_namespace="rel_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 1, 0]))
Example #3
    def test_invalid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
        original_vocab.add_token_to_namespace("a", namespace="tokens1")
        original_vocab.add_token_to_namespace("b", namespace="tokens1")
        original_vocab.add_token_to_namespace("p", namespace="tokens2")
        original_vocab.save_to_files(vocab_dir)
        text_field1 = TextField([Token(t) for t in ["a" "c"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                                {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

        # Following 3 should give an error: tokens1 is non-padded in original_vocab but padded in instances
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": []})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": []})
            extended_vocab.extend_from_instances(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            extended_vocab._extend(non_padded_namespaces=[],
                                   tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

        # Following 3 should not give an error: overlapping namespaces have the same padding setting
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": ["tokens1"]})
        Vocabulary.from_params(params, instances)
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1"]})
        extended_vocab.extend_from_instances(params, instances)
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

        # Following 3 should give an error: tokens2 is non-padded in instances but padded in original_vocab
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": ["tokens1", "tokens2"]})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
            extended_vocab.extend_from_instances(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                                   tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
Example #4
    def test_forward_works_with_projection_layer(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('the')
        vocab.add_token_to_namespace('a')
        embedding_layer = get_pretrained_embedding_layer(
            'tests/fixtures/glove.6B.300d.sample.txt.gz',
            vocab,
            projection_dim=20)
        input_tensor = Variable(torch.LongTensor([[3, 2, 1, 0]]))
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 4, 20)

        input_tensor = Variable(torch.LongTensor([[[3, 2, 1, 0]]]))
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 1, 4, 20)
Example #5
    def test_add_word_to_index_gives_consistent_results(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1

        # Now add it again, and make sure nothing changes.
        vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1
Example #7
    def test_pad_produces_one_hot_targets(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("B", namespace='*tags')
        vocab.add_token_to_namespace("I", namespace='*tags')
        vocab.add_token_to_namespace("O", namespace='*tags')

        tags = ["B", "I", "O", "O", "O"]
        tag_field = TagField(tags, self.text, tag_namespace="*tags")
        tag_field.index(vocab)
        padding_lengths = tag_field.get_padding_lengths()
        array = tag_field.as_array(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            array,
            numpy.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 1], [0, 0, 1]]))
Example #8
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        b_index = vocab.add_token_to_namespace("B", namespace='*tags')
        i_index = vocab.add_token_to_namespace("I", namespace='*tags')
        o_index = vocab.add_token_to_namespace("O", namespace='*tags')

        tags = ["B", "I", "O", "O", "O"]
        tag_field = TagField(tags, self.text, tag_namespace="*tags")
        tag_field.index(vocab)

        # pylint: disable=protected-access
        assert tag_field._indexed_tags == [
            b_index, i_index, o_index, o_index, o_index
        ]
        assert tag_field._num_tags == 3
Example #9
    def test_invalid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
        original_vocab.add_tokens_to_namespace(["a", "b"], namespace="tokens1")
        original_vocab.add_token_to_namespace("p", namespace="tokens2")
        original_vocab.save_to_files(vocab_dir)
        text_field1 = TextField(
            [Token(t) for t in ["a", "c"]], {"tokens1": SingleIdTokenIndexer("tokens1")}
        )
        text_field2 = TextField(
            [Token(t) for t in ["p", "q", "r"]], {"tokens2": SingleIdTokenIndexer("tokens2")}
        )
        instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

        # The following should give an error: tokens1 is non-padded in original_vocab but padded in instances
        params = Params(
            {
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": [],
                "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
            }
        )
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances)

        # The following should not give an error: overlapping namespaces have the same padding setting
        params = Params(
            {
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": ["tokens1"],
                "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
            }
        )
        Vocabulary.from_params(params, instances=instances)

        # The following should give an error: tokens2 is non-padded in instances but padded in original_vocab
        params = Params(
            {
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": ["tokens1", "tokens2"],
                "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
            }
        )
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances)
Example #10
    def test_pad_produces_one_hot_targets(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("B", namespace='*tags')
        vocab.add_token_to_namespace("I", namespace='*tags')
        vocab.add_token_to_namespace("O", namespace='*tags')

        text = TextField(["here", "are", "some", "words", "."],
                         [token_indexers["single id"]("words")])
        tags = ["B", "I", "O", "O", "O"]
        tag_field = TagField(tags, text, tag_namespace="*tags")
        tag_field.index(vocab)
        padding_lengths = tag_field.get_padding_lengths()
        array = tag_field.pad(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            array,
            numpy.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 1], [0, 0, 1]]))
Example #11
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        b_index = vocab.add_token_to_namespace("B", namespace='*tags')
        i_index = vocab.add_token_to_namespace("I", namespace='*tags')
        o_index = vocab.add_token_to_namespace("O", namespace='*tags')

        text = TextField(["here", "are", "some", "words", "."],
                         [token_indexers["single id"]("words")])
        tags = ["B", "I", "O", "O", "O"]
        tag_field = TagField(tags, text, tag_namespace="*tags")
        tag_field.index(vocab)

        # pylint: disable=protected-access
        assert tag_field._indexed_tags == [
            b_index, i_index, o_index, o_index, o_index
        ]
        assert tag_field._num_tags == 3
Example #12
def _get_vocab(words_by_freq, max_v_sizes, word_freq_thresh):
    """Build vocabulary by selecting the most frequent tokens"""
    vocab = Vocabulary(counter=None, max_vocab_size=max_v_sizes)

    words_by_freq = dict(words_by_freq)
    try:
        words_by_freq.pop("<unk>")  # remove special token, TODO
    except KeyError:
        pass

    for special in SPECIALS:
        vocab.add_token_to_namespace(special, "tokens")

    for word, freq in list(words_by_freq.items())[: max_v_sizes["word"]]:
        if freq >= word_freq_thresh:
            vocab.add_token_to_namespace(word, "tokens")

    return vocab
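
A minimal usage sketch for _get_vocab; SPECIALS, the frequency dict, and the thresholds below are hypothetical stand-ins, not values from the original code:

SPECIALS = ["<pad>", "<unk>"]  # hypothetical; the real SPECIALS is defined elsewhere
words_by_freq = {"<unk>": 999, "the": 120, "cat": 7, "sat": 3}  # illustrative counts
vocab = _get_vocab(words_by_freq, max_v_sizes={"word": 2}, word_freq_thresh=5)
# "<unk>" is popped first; of the remaining words, at most max_v_sizes["word"] are
# considered, and only those with freq >= word_freq_thresh are kept. Here "the" and
# "cat" join the specials in the "tokens" namespace.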
Example #13
    def test_multilabel_field_empty_field_works(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
        vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

        f = MultiLabelField([], label_namespace="test_empty_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
        g = f.empty_field()
        g.index(vocab)
        tensor = g.as_tensor(g.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

        h = MultiLabelField(
            [0, 0, 1], label_namespace="test_empty_labels", num_labels=3, skip_indexing=True
        )
        tensor = h.empty_field().as_tensor(None).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0, 0]))
Example #14
def main():
    infilename = 'test/fixtures/bioul_to_span.json'
    with open(infilename) as f:
        d = json.load(f)

    docs = d['tag']
    vocab = Vocabulary()
    vocab.add_token_to_namespace('O', namespace='span_labels')  # reserved label for no-entity
    for doc in docs:
        for label in doc:
            if label != 'O':
                # Drop the first two characters (the BIOUL prefix, e.g. "B-");
                # they are not useful for span labels.
                span_label = label[2:]
                # TODO: is this the right namespace?
                vocab.add_token_to_namespace(span_label, namespace='span_labels')

    # This function expects the vocab to already be initialized with span labels.
    batched_bioul_to_span_tesnors(docs, vocab)
Example #15
    def test_namespaces(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word", namespace='1')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
        assert vocab.get_token_index("word", namespace='1') == word_index
        assert vocab.get_token_from_index(word_index, namespace='1') == "word"
        assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

        # Now add it again, in a different namespace and a different word, and make sure it's like
        # new.
        word2_index = vocab.add_token_to_namespace("word2", namespace='2')
        word_index = vocab.add_token_to_namespace("word", namespace='2')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert vocab.get_token_index("word", namespace='2') == word_index
        assert vocab.get_token_index("word2", namespace='2') == word2_index
        assert vocab.get_token_from_index(word_index, namespace='2') == "word"
        assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
        assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
Example #17
def test():
    from pprint import pprint

    params = Params(
        {'token_embedder': {
            'num_embeddings': 4,
            'embedding_dim': 3
        }})
    vocab = Vocabulary()
    while vocab.get_vocab_size() < 4:
        vocab.add_token_to_namespace('a' + str(vocab.get_vocab_size()))
    model = BaselineModel(params=params, vocab=vocab)
    premise = {'tokens': torch.randint(low=0, high=4, size=(5, 6))}
    hypothesis = {'tokens': torch.randint(low=0, high=4, size=(5, 7))}
    label = torch.randint(low=0, high=3, size=(5, ))
    output = model(premise=premise, hypothesis=hypothesis, label=label)
    pprint(output)
    pprint(model.get_metrics())
Example #18
    def test_from_params(self):
        # Save a vocab to check we can load it from_params.
        vocab_dir = self.TEST_DIR / 'vocab_save'
        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")
        vocab.save_to_files(vocab_dir)

        params = Params({"directory_path": vocab_dir})
        vocab2 = Vocabulary.from_params(params)
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

        # Test case where we build a vocab from a dataset.
        vocab2 = Vocabulary.from_params(Params({}), self.dataset)
        assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                                  1: '@@UNKNOWN@@',
                                                                  2: 'a', 3: 'c', 4: 'b'}
        # Test from_params raises when we have neither a dataset nor a vocab_directory.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({}))

        # Test from_params raises when there are any other dict keys
        # present apart from 'directory_path' and we aren't calling from_dataset.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({"directory_path": vocab_dir, "min_count": {'tokens': 2}}))
Example #20
def get_vocab(
    word2freq: Dict[str, int], char2freq: Dict[str, int], max_v_sizes: Dict[str, int]
) -> Vocabulary:
    """Build vocabulary by selecting the most frequent tokens

    Parameters
    ----------
    word2freq : Dict[str, int]
        Dict mapping words to frequencies.
    char2freq : Dict[str, int]
        Dict mapping chars to frequencies.
    max_v_sizes : Dict[str, int]
        Dict used to set max vocab size for each token namespace.

    Returns
    -------
    allennlp.data.Vocabulary
        vocab containing word and char namespaces.

    """
    vocab = Vocabulary(counter=None, max_vocab_size=max_v_sizes)
    for special in SPECIALS:
        vocab.add_token_to_namespace(special, "tokens")

    words_by_freq = [(word, freq) for word, freq in word2freq.items()]
    words_by_freq.sort(key=lambda x: x[1], reverse=True)
    for word, _ in words_by_freq[: max_v_sizes["word"]]:
        vocab.add_token_to_namespace(word, "tokens")

    chars_by_freq = [(char, freq) for char, freq in char2freq.items()]
    chars_by_freq.sort(key=lambda x: x[1], reverse=True)
    for char, _ in chars_by_freq[: max_v_sizes["char"]]:
        vocab.add_token_to_namespace(char, "chars")

    return vocab
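
Similarly, a hedged usage sketch for get_vocab; SPECIALS and the frequency dicts are illustrative assumptions:

SPECIALS = ["<pad>", "<unk>"]  # hypothetical stand-in
word2freq = {"the": 10, "cat": 4, "sat": 1}
char2freq = {"t": 15, "e": 12, "h": 10, "c": 4}
vocab = get_vocab(word2freq, char2freq, max_v_sizes={"word": 2, "char": 3})
# The two most frequent words ("the", "cat") land in "tokens"; the three most
# frequent chars ("t", "e", "h") land in "chars".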
Example #21
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField(["A", "sentence"], {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField(["A", "sentence"], {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        field2 = TextField(["A", "sentence"],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters")})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
Example #22
    def test_from_params_extend_config(self):

        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.save_to_files(vocab_dir)

        text_field = TextField([Token(t) for t in ["a", "b"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        instances = Batch([Instance({"text": text_field})])

        # If you ask to extend the vocab from `directory`, instances must be passed
        # to the Vocabulary constructor, or else there is nothing to extend to.
        params = Params({"type": "extend", "directory": vocab_dir})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params)

        # If you ask to extend the vocab, the `directory` key must be present in params,
        # or else there is nothing to extend from.
        params = Params({"type": "extend"})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances)
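
For contrast, a hedged sketch of the valid call, mirroring the parameter shape the other extension tests in this listing use (both a directory and instances supplied); the size assertion assumes the vocab built above:

        params = Params({"type": "extend", "directory": vocab_dir,
                         "non_padded_namespaces": ["tokens"]})
        extended_vocab = Vocabulary.from_params(params, instances=instances)
        assert extended_vocab.get_vocab_size("tokens") == 2  # "a" from files, "b" from instances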
Example #23
    def test_from_params_extend_config(self):

        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.save_to_files(vocab_dir)

        text_field = TextField([Token(t) for t in ["a", "b"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        instances = Batch([Instance({"text": text_field})])

        # If you ask to extend the vocab from `directory_path`, instances must be passed
        # to the Vocabulary constructor, or else there is nothing to extend to.
        params = Params({"directory_path": vocab_dir, "extend": True})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params)

        # If you ask to extend the vocab, the `directory_path` key must be present in params,
        # or else there is nothing to extend from.
        params = Params({"extend": True})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
Example #24
def test_unlabeled():
    from pprint import pprint

    params = Params({
        'token_embedder': {
            'num_embeddings': 4,
            'embedding_dim': 300
        },
        'code_dist_type': 'gaussian'
    })
    vocab = Vocabulary()
    while vocab.get_vocab_size() < 4:
        vocab.add_token_to_namespace('a' + str(vocab.get_vocab_size()))
    model = DeconvSNLIModel(params=params, vocab=vocab)
    premise = {'tokens': torch.randint(low=0, high=4, size=(5, 29))}
    hypothesis = {'tokens': torch.randint(low=0, high=4, size=(5, 29))}
    output = model(premise=premise, hypothesis=hypothesis, label=None)
    pprint(output)
    pprint(model.get_metrics())
Example #25
    def test_vocab_can_print(self):
        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")
        vocab.add_token_to_namespace("b3", namespace="b")
        print(vocab)
Example #27
    def _build_vocabulary(self, summaries: List[List[str]]):
        vocab = Vocabulary()
        vocab.add_token_to_namespace(START_SYMBOL)
        vocab.add_token_to_namespace(END_SYMBOL)
        for summary in summaries:
            for sentence in summary:
                for token in sentence.split():
                    vocab.add_token_to_namespace(token)
        return vocab
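
A hedged usage sketch for _build_vocabulary; `builder` is a hypothetical instance of the enclosing class, and the summaries are invented for illustration:

summaries = [["the cat sat", "it purred"], ["dogs bark loudly"]]
vocab = builder._build_vocabulary(summaries)  # `builder`: hypothetical owner instance
# START_SYMBOL, END_SYMBOL, and every whitespace-separated token receive ids in the
# default "tokens" namespace, after the reserved padding (0) and unknown (1) entries.
assert vocab.get_token_index("cat") > 1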
Example #28
    def test_saving_and_loading(self):
        # pylint: disable=protected-access
        vocab_dir = self.TEST_DIR / u'vocab_save'

        vocab = Vocabulary(non_padded_namespaces=[u"a", u"c"])
        vocab.add_token_to_namespace(
            u"a0", namespace=u"a")  # non-padded, should start at 0
        vocab.add_token_to_namespace(u"a1", namespace=u"a")
        vocab.add_token_to_namespace(u"a2", namespace=u"a")
        vocab.add_token_to_namespace(
            u"b2", namespace=u"b")  # padded, should start at 2
        vocab.add_token_to_namespace(u"b3", namespace=u"b")

        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)

        assert vocab2._non_padded_namespaces == set([u"a", u"c"])

        # Check namespace a.
        assert vocab2.get_vocab_size(namespace=u'a') == 3
        assert vocab2.get_token_from_index(0, namespace=u'a') == u'a0'
        assert vocab2.get_token_from_index(1, namespace=u'a') == u'a1'
        assert vocab2.get_token_from_index(2, namespace=u'a') == u'a2'
        assert vocab2.get_token_index(u'a0', namespace=u'a') == 0
        assert vocab2.get_token_index(u'a1', namespace=u'a') == 1
        assert vocab2.get_token_index(u'a2', namespace=u'a') == 2

        # Check namespace b.
        assert vocab2.get_vocab_size(
            namespace=u'b') == 4  # (unk + padding + two tokens)
        assert vocab2.get_token_from_index(
            0, namespace=u'b') == vocab._padding_token
        assert vocab2.get_token_from_index(1,
                                           namespace=u'b') == vocab._oov_token
        assert vocab2.get_token_from_index(2, namespace=u'b') == u'b2'
        assert vocab2.get_token_from_index(3, namespace=u'b') == u'b3'
        assert vocab2.get_token_index(vocab._padding_token,
                                      namespace=u'b') == 0
        assert vocab2.get_token_index(vocab._oov_token, namespace=u'b') == 1
        assert vocab2.get_token_index(u'b2', namespace=u'b') == 2
        assert vocab2.get_token_index(u'b3', namespace=u'b') == 3

        # Check the dictionaries containing the reverse mapping are identical.
        assert vocab.get_index_to_token_vocabulary(
            u"a") == vocab2.get_index_to_token_vocabulary(u"a")
        assert vocab.get_index_to_token_vocabulary(
            u"b") == vocab2.get_index_to_token_vocabulary(u"b")
Example #29
    def test_label_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("entailment", namespace="labels")
        vocab.add_token_to_namespace("contradiction", namespace="labels")
        vocab.add_token_to_namespace("neutral", namespace="labels")

        label = LabelField("entailment")
        label.index(vocab)
        tensor = label.as_tensor(label.get_padding_lengths())
        assert tensor.item() == 0
Example #30
    def test_label_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("entailment", namespace="labels")
        vocab.add_token_to_namespace("contradiction", namespace="labels")
        vocab.add_token_to_namespace("neutral", namespace="labels")

        label = LabelField("entailment")
        label.index(vocab)
        tensor = label.as_tensor(label.get_padding_lengths()).data.cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0]))
Example #31
    def test_label_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("entailment", namespace="*labels")
        vocab.add_token_to_namespace("contradiction", namespace="*labels")
        vocab.add_token_to_namespace("neutral", namespace="*labels")

        label = LabelField("entailment")
        label.index(vocab)
        array = label.pad(label.get_padding_lengths())
        numpy.testing.assert_array_almost_equal(array[0],
                                                numpy.array([1, 0, 0]))
Example #32
    def test_token_to_indices_produces_correct_characters(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace='characters')
        vocab.add_token_to_namespace("s", namespace='characters')
        vocab.add_token_to_namespace("e", namespace='characters')
        vocab.add_token_to_namespace("n", namespace='characters')
        vocab.add_token_to_namespace("t", namespace='characters')
        vocab.add_token_to_namespace("c", namespace='characters')

        indexer = TokenCharactersIndexer("characters")
        indices = indexer.token_to_indices("sentential", vocab)
        assert indices == [3, 4, 5, 6, 4, 5, 6, 1, 1, 1]
Example #33
    def test_multilabel_field_returns_correct_empty_sequence(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
        vocab.add_token_to_namespace("label2", namespace="test_empty_labels")
        f = MultiLabelField([], label_namespace="test_empty_labels")
        f.empty_field()

        vocab = Vocabulary()
        vocab.add_token_to_namespace("rel0", namespace="rel_labels")
        vocab.add_token_to_namespace("rel1", namespace="rel_labels")
        vocab.add_token_to_namespace("rel2", namespace="rel_labels")

        f = MultiLabelField(["rel1", "rel0"], label_namespace="rel_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        f.empty_field()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([1, 1, 0]))
Example #34
    def test_saving_and_loading(self):
        # pylint: disable=protected-access
        vocab_dir = self.TEST_DIR / 'vocab_save'

        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")

        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)

        assert vocab2._non_padded_namespaces == {"a", "c"}

        # Check namespace a.
        assert vocab2.get_vocab_size(namespace='a') == 3
        assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
        assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
        assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
        assert vocab2.get_token_index('a0', namespace='a') == 0
        assert vocab2.get_token_index('a1', namespace='a') == 1
        assert vocab2.get_token_index('a2', namespace='a') == 2

        # Check namespace b.
        assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
        assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
        assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
        assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
        assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
        assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
        assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
        assert vocab2.get_token_index('b2', namespace='b') == 2
        assert vocab2.get_token_index('b3', namespace='b') == 3

        # Check the dictionaries containing the reverse mapping are identical.
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
Example #35
class TestTextField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("sentence", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='characters')
        self.vocab.add_token_to_namespace("s", namespace='characters')
        self.vocab.add_token_to_namespace("e", namespace='characters')
        self.vocab.add_token_to_namespace("n", namespace='characters')
        self.vocab.add_token_to_namespace("t", namespace='characters')
        self.vocab.add_token_to_namespace("c", namespace='characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        field = TextField(
            ["This", "is", "a", "sentence", "."],
            token_indexers=[token_indexers["single id"]("words")])
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField(
            ["This", "is", "a", "sentence", "."],
            token_indexers=[token_indexers["characters"]("characters")])
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers=[
                              token_indexers["single id"]("words"),
                              token_indexers["characters"]("characters")
                          ])
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}

    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence",
                                                      namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace(
            "A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField(
            ["A", "sentence"],
            [token_indexers["single id"](token_namespace="words")])
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens == [[capital_a_index, sentence_index]]

        field1 = TextField(
            ["A", "sentence"],
            [token_indexers["characters"](character_namespace="characters")])
        field1.index(vocab)
        assert field1._indexed_tokens == [[[capital_a_char_index],
                                           [s_index, e_index, n_index, t_index,
                                            e_index, n_index, c_index, e_index]]]
        field2 = TextField(
            ["A", "sentence"],
            token_indexers=[
                token_indexers["single id"](token_namespace="words"),
                token_indexers["characters"](character_namespace="characters")
            ])
        field2.index(vocab)
        assert field2._indexed_tokens == [[capital_a_index, sentence_index],
                                          [[capital_a_char_index],
                                           [s_index, e_index, n_index, t_index,
                                            e_index, n_index, c_index, e_index]]]
        # pylint: enable=protected-access

    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):

        field = TextField(
            ["This", "is", "a", "sentence", "."],
            token_indexers=[token_indexers["single id"]("words")])
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        field = TextField(
            ["This", "is", "a", "sentence", "."],
            token_indexers=[token_indexers["single id"]("words")])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5}

        field = TextField(
            ["This", "is", "a", "sentence", "."],
            token_indexers=[token_indexers["characters"]("characters")])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers=[
                              token_indexers["characters"]("characters"),
                              token_indexers["single id"]("words")
                          ])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

    def test_pad_handles_words(self):
        field = TextField(
            ["This", "is", "a", "sentence", "."],
            token_indexers=[token_indexers["single id"]("words")])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        arrays = field.pad(padding_lengths)
        numpy.testing.assert_array_almost_equal(arrays[0],
                                                numpy.array([1, 1, 1, 2, 1]))

    def test_pad_handles_longer_lengths(self):
        field = TextField(
            ["This", "is", "a", "sentence", "."],
            token_indexers=[token_indexers["single id"]("words")])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 10
        arrays = field.pad(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            arrays[0], numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_pad_handles_characters(self):
        field = TextField(
            ["This", "is", "a", "sentence", "."],
            token_indexers=[token_indexers["characters"]("characters")])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        arrays = field.pad(padding_lengths)
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(arrays[0],
                                                expected_character_array)

    def test_pad_handles_words_and_characters_with_longer_lengths(self):
        field = TextField(["a", "sentence", "."],
                          token_indexers=[
                              token_indexers["single id"]("words"),
                              token_indexers["characters"]("characters")
                          ])
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 5
        padding_lengths["num_token_characters"] = 10
        arrays = field.pad(padding_lengths)

        numpy.testing.assert_array_almost_equal(arrays[0],
                                                numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            arrays[1],
            numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
Example #36
    def test_from_params_valid_vocab_extension_thoroughly(self):
        """
        Tests for Valid Vocab Extension thoroughly: Vocab extension is valid
        when overlapping namespaces have same padding behaviour (padded/non-padded)
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        TypicalExtention example: (of tokens1 namespace)
        -> original_vocab index2token
           apple          #0->apple
           bat            #1->bat
           cat            #2->cat
        -> Token to be extended with: cat, an, apple, banana, atom, bat
        -> extended_vocab: index2token
           apple           #0->apple
           bat             #1->bat
           cat             #2->cat
           an              #3->an
           atom            #4->atom
           banana          #5->banana
        """

        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(
            non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple",
                                              namespace="tokens0")  # index:2
        original_vocab.add_token_to_namespace("bat",
                                              namespace="tokens0")  # index:3
        original_vocab.add_token_to_namespace("cat",
                                              namespace="tokens0")  # index:4

        original_vocab.add_token_to_namespace("apple",
                                              namespace="tokens1")  # index:0
        original_vocab.add_token_to_namespace("bat",
                                              namespace="tokens1")  # index:1
        original_vocab.add_token_to_namespace("cat",
                                              namespace="tokens1")  # index:2

        original_vocab.add_token_to_namespace("a",
                                              namespace="tokens2")  # index:0
        original_vocab.add_token_to_namespace("b",
                                              namespace="tokens2")  # index:1
        original_vocab.add_token_to_namespace("c",
                                              namespace="tokens2")  # index:2

        original_vocab.add_token_to_namespace("p",
                                              namespace="tokens3")  # index:0
        original_vocab.add_token_to_namespace("q",
                                              namespace="tokens3")  # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField(
            [
                Token(t)
                for t in ["cat", "an", "apple", "banana", "atom", "bat"]
            ],
            {"tokens0": SingleIdTokenIndexer("tokens0")},
        )
        text_field1 = TextField(
            [
                Token(t)
                for t in ["cat", "an", "apple", "banana", "atom", "bat"]
            ],
            {"tokens1": SingleIdTokenIndexer("tokens1")},
        )
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([
            Instance({
                "text0": text_field0,
                "text1": text_field1,
                "text4": text_field4,
                "text5": text_field5,
            })
        ])

        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1", "tokens5"],
        })
        extended_vocab = Vocabulary.from_params(params, instances=instances)

        # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 appear
        # only in the vocab, tokens4 and tokens5 only in the instances.
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that the _non_padded_namespaces set is consistent after extension.
        assert extended_vocab._non_padded_namespaces == {
            "tokens1", "tokens3", "tokens5"
        }

        # # original_vocab["tokens1"] has 3 tokens, instances of "tokens1" ns has 5 tokens. 2 overlapping
        assert extended_vocab.get_vocab_size("tokens1") == 6
        assert extended_vocab.get_vocab_size(
            "tokens0") == 8  # padding + unk + 6 distinct tokens, since tokens0 is padded

        # Namespaces tokens2 and tokens3 were only in original_vocab,
        # and their token counts should be unchanged in extended_vocab.
        assert extended_vocab.get_vocab_size(
            "tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size(
            "tokens3") == original_vocab.get_vocab_size("tokens3")

        # Namespaces tokens4 and tokens5 were only in instances,
        # and their token counts in extended_vocab come from instances alone.
        assert extended_vocab.get_vocab_size(
            "tokens4") == 6  # l,m,n,o + oov + padding
        assert extended_vocab.get_vocab_size("tokens5") == 3  # x,y,z

        # Word2index mapping of all words in all namespaces of original_vocab
        # should be maintained in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(
                    token, namespace)
                assert vocab_index == extended_vocab_index
        # And same for Index2Word mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(
                    index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(
                    index, namespace)
                assert vocab_token == extended_vocab_token
Example #37
class TestTransformerToolkit(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()
        self.vocab = Vocabulary()
        # populate vocab.
        self.vocab.add_token_to_namespace("word")
        self.vocab.add_token_to_namespace("the")
        self.vocab.add_token_to_namespace("an")

    def test_create_embedder_using_toolkit(self):

        embedding_file = str(self.FIXTURES_ROOT /
                             "embeddings/glove.6B.300d.sample.txt.gz")

        class TinyTransformer(TokenEmbedder):
            def __init__(self, vocab, embedding_dim, hidden_size,
                         intermediate_size):
                super().__init__()
                self.embeddings = Embedding(
                    pretrained_file=embedding_file,
                    embedding_dim=embedding_dim,
                    projection_dim=hidden_size,
                    vocab=vocab,
                )

                self.transformer = TransformerStack(
                    num_hidden_layers=4,
                    hidden_size=hidden_size,
                    intermediate_size=intermediate_size,
                )

            @overrides
            def forward(self, token_ids: torch.LongTensor):
                x = self.embeddings(token_ids)
                x = self.transformer(x)
                return x

        tiny = TinyTransformer(self.vocab,
                               embedding_dim=300,
                               hidden_size=80,
                               intermediate_size=40)
        tiny.forward(torch.LongTensor([[0, 1, 2]]))

    def test_use_first_four_layers_of_pretrained(self):
        pretrained = "bert-base-cased"

        class SmallTransformer(TokenEmbedder):
            def __init__(self):
                super().__init__()
                self.embeddings = TransformerEmbeddings.from_pretrained_module(
                    pretrained, relevant_module="bert.embeddings")
                self.transformer = TransformerStack.from_pretrained_module(
                    pretrained,
                    num_hidden_layers=4,
                    relevant_module="bert.encoder",
                    strict=False,
                )

            @overrides
            def forward(self, token_ids: torch.LongTensor):
                x = self.embeddings(token_ids)
                x = self.transformer(x)
                return x

        small = SmallTransformer()
        assert len(small.transformer.layers) == 4
        small(torch.LongTensor([[0, 1, 2]]))

    def test_use_selected_layers_of_bert_for_different_purposes(self):
        class MediumTransformer(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.embeddings = TransformerEmbeddings.from_pretrained_module(
                    "bert-base-cased", relevant_module="bert.embeddings")
                self.separate_transformer = TransformerStack.from_pretrained_module(
                    "bert-base-cased",
                    relevant_module="bert.encoder",
                    num_hidden_layers=8,
                    strict=False,
                )
                self.combined_transformer = TransformerStack.from_pretrained_module(
                    "bert-base-cased",
                    relevant_module="bert.encoder",
                    num_hidden_layers=4,
                    mapping={
                        f"layer.{l}": f"layers.{i}"
                        for (i, l) in enumerate(range(8, 12))
                    },
                    strict=False,
                )

            @overrides
            def forward(
                self,
                left_token_ids: torch.LongTensor,
                right_token_ids: torch.LongTensor,
            ):

                left = self.embeddings(left_token_ids)
                left = self.separate_transformer(left)

                right = self.embeddings(right_token_ids)
                right = self.separate_transformer(right)

                # Combine the sequences in some meaningful way; here, we just add them.
                # combined = combine_masked_sequences(left, left_mask, right, right_mask)
                combined = left + right

                return self.combined_transformer(combined)

        medium = MediumTransformer()
        assert len(medium.separate_transformer.layers) == 8
        assert len(medium.combined_transformer.layers) == 4

        pretrained = cached_transformers.get("bert-base-cased", make_copy=False)
        pretrained_layers = dict(pretrained.encoder.layer.named_modules())

        separate_layers = dict(
            medium.separate_transformer.layers.named_modules())
        assert_allclose(
            separate_layers["0"].intermediate.dense.weight.data,
            pretrained_layers["0"].intermediate.dense.weight.data,
        )

        combined_layers = dict(
            medium.combined_transformer.layers.named_modules())
        assert_allclose(
            combined_layers["0"].intermediate.dense.weight.data,
            pretrained_layers["8"].intermediate.dense.weight.data,
        )
        assert_allclose(
            combined_layers["1"].intermediate.dense.weight.data,
            pretrained_layers["9"].intermediate.dense.weight.data,
        )
        assert_allclose(
            combined_layers["2"].intermediate.dense.weight.data,
            pretrained_layers["10"].intermediate.dense.weight.data,
        )
        assert_allclose(
            combined_layers["3"].intermediate.dense.weight.data,
            pretrained_layers["11"].intermediate.dense.weight.data,
        )
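        # These four checks follow directly from the constructor's mapping:
        # combined layer i holds the weights of pretrained layer i + 8.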

    def test_combination_of_two_different_berts(self):
        # Regular BERT, but with ALBERT's special compressed embedding scheme

        class AlmostRegularTransformer(TokenEmbedder):
            def __init__(self):
                super().__init__()
                self.embeddings = AutoModel.from_pretrained(
                    "albert-base-v2").embeddings
                self.transformer = TransformerStack.from_pretrained_module(
                    "bert-base-cased", relevant_module="bert.encoder")
                # We want to tune only the embeddings, because that's our experiment,
                # so freeze every parameter of the encoder.
                self.transformer.requires_grad_(False)

            @overrides
            def forward(self, token_ids: torch.LongTensor,
                        mask: torch.BoolTensor):
                x = self.embeddings(token_ids, mask)
                x = self.transformer(x)
                return x

        almost = AlmostRegularTransformer()
        assert len(almost.transformer.layers) == 12
        assert isinstance(almost.embeddings, AlbertEmbeddings)
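        # Illustrative check (a sketch, not part of the original test): the
        # encoder is frozen while the embeddings remain trainable.
        assert all(not p.requires_grad for p in almost.transformer.parameters())
        assert all(p.requires_grad for p in almost.embeddings.parameters())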

    @pytest.mark.parametrize("model_name", ["bert-base-cased", "roberta-base"])
    def test_end_to_end(self, model_name: str):
        data = [
            ("I'm against picketing", "but I don't know how to show it."),
            ("I saw a human pyramid once.", "It was very unnecessary."),
        ]
        tokenizer = cached_transformers.get_tokenizer(model_name)
        batch = tokenizer.batch_encode_plus(data,
                                            padding=True,
                                            return_tensors="pt")

        with torch.no_grad():
            huggingface_model = cached_transformers.get(
                model_name, make_copy=False).eval()
            huggingface_output = huggingface_model(**batch)

            embeddings = TransformerEmbeddings.from_pretrained_module(
                model_name).eval()
            transformer_stack = TransformerStack.from_pretrained_module(
                model_name).eval()
            pooler = TransformerPooler.from_pretrained_module(
                model_name).eval()
            batch["attention_mask"] = batch["attention_mask"].to(torch.bool)
            output = embeddings(**batch)
            output = transformer_stack(output, batch["attention_mask"])

            assert_allclose(
                output.final_hidden_states,
                huggingface_output.last_hidden_state,
                rtol=0.0001,
                atol=1e-4,
            )

            output = pooler(output.final_hidden_states)
            assert_allclose(output,
                            huggingface_output.pooler_output,
                            rtol=0.0001,
                            atol=1e-4)
Exemplo n.º 38
0
    def test_valid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / "vocab_save"
        # Test: padded/non-padded common namespaces are extended appropriately
        non_padded_namespaces_list = [[], ["tokens"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(
                non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_tokens_to_namespace(["d", "a", "b"],
                                                   namespace="tokens")
            text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                                   {"tokens": SingleIdTokenIndexer("tokens")})
            vocab_dir = self.TEST_DIR / "vocab_save"
            shutil.rmtree(vocab_dir, ignore_errors=True)
            original_vocab.save_to_files(vocab_dir)
            instances = Batch([Instance({"text": text_field})])
            params = Params({
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": non_padded_namespaces,
            })
            extended_vocab = Vocabulary.from_params(params,
                                                    instances=instances)

            extra_count = 2 if extended_vocab.is_padded("tokens") else 0
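            # indices 0 and 1 are reserved for padding and OOV in padded
            # namespaces, which shifts every real token's index by two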
            assert extended_vocab.get_token_index("d",
                                                  "tokens") == 0 + extra_count
            assert extended_vocab.get_token_index("a",
                                                  "tokens") == 1 + extra_count
            assert extended_vocab.get_token_index("b",
                                                  "tokens") == 2 + extra_count

            assert extended_vocab.get_token_index(
                "c", "tokens")  # should be present
            assert extended_vocab.get_token_index(
                "e", "tokens")  # should be present

            assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

        # Test: padded/non-padded non-common namespaces are extended appropriately
        non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(
                non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace(
                "a", namespace="tokens1")  # index 2 when padded, else 0
            text_field = TextField(
                [Token(t) for t in ["b"]],
                {"tokens2": SingleIdTokenIndexer("tokens2")})
            instances = Batch([Instance({"text": text_field})])
            vocab_dir = self.TEST_DIR / "vocab_save"
            shutil.rmtree(vocab_dir, ignore_errors=True)
            original_vocab.save_to_files(vocab_dir)

            params = Params({
                "type": "extend",
                "directory": vocab_dir,
                "non_padded_namespaces": non_padded_namespaces,
            })
            extended_vocab = Vocabulary.from_params(params,
                                                    instances=instances)

            # Should have two namespaces
            assert len(extended_vocab._token_to_index) == 2

            extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
            assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count

            extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
            assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
Exemplo n.º 39
0
class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", "words")
        self.vocab.add_token_to_namespace(".", "words")
        super().setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": LabelField(1)})
        instance2 = Instance({"words": TextField(["hello"], [])})
        with pytest.raises(ConfigurationError):
            _ = Dataset([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = self.get_dataset()
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {
            "text1": {
                "num_tokens": 5
            },
            "text2": {
                "num_tokens": 6
            }
        }

    def test_as_arrays(self):
        dataset = self.get_dataset()
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        arrays = dataset.as_arrays(padding_lengths)

        text1 = arrays["text1"][0]
        text2 = arrays["text2"][0]
        numpy.testing.assert_array_almost_equal(
            text1, numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(
            text2, numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]]))
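        # For reference: in the vocabulary built in setUp, index 0 is padding and
        # index 1 is OOV, while "this"/"is"/"a"/"sentence"/"." map to 2..6. The
        # second row of text1 starts with 1 because "here" is out of vocabulary,
        # as is "different" (the 1 in text2's first row); text2's second row is
        # padded with trailing zeros.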

    def get_dataset(self):
        # Use SingleIdTokenIndexer directly so the snippet is self-contained.
        field1 = TextField(["this", "is", "a", "sentence", "."],
                           [SingleIdTokenIndexer("words")])
        field2 = TextField(["this", "is", "a", "different", "sentence", "."],
                           [SingleIdTokenIndexer("words")])
        field3 = TextField(["here", "is", "a", "sentence", "."],
                           [SingleIdTokenIndexer("words")])
        field4 = TextField(["this", "is", "short"],
                           [SingleIdTokenIndexer("words")])
        instances = [
            Instance({
                "text1": field1,
                "text2": field2
            }),
            Instance({
                "text1": field3,
                "text2": field4
            })
        ]

        return Dataset(instances)
Exemplo n.º 40
0
    def test_from_params_valid_vocab_extension_thoroughly(self):
        '''
        Tests valid vocab extension thoroughly: vocab extension is valid when
        overlapping namespaces have the same padding behaviour (padded/non-padded).
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        Typical extension example (for the tokens1 namespace):
        -> original_vocab index-to-token mapping:
           0 -> apple
           1 -> bat
           2 -> cat
        -> tokens to extend with: cat, an, apple, banana, atom, bat
        -> extended_vocab index-to-token mapping:
           0 -> apple
           1 -> bat
           2 -> cat
           3 -> an
           4 -> atom
           5 -> banana
        '''

        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple", namespace="tokens0") # index:2
        original_vocab.add_token_to_namespace("bat", namespace="tokens0")   # index:3
        original_vocab.add_token_to_namespace("cat", namespace="tokens0")   # index:4

        original_vocab.add_token_to_namespace("apple", namespace="tokens1") # index:0
        original_vocab.add_token_to_namespace("bat", namespace="tokens1")   # index:1
        original_vocab.add_token_to_namespace("cat", namespace="tokens1")   # index:2

        original_vocab.add_token_to_namespace("a", namespace="tokens2") # index:0
        original_vocab.add_token_to_namespace("b", namespace="tokens2") # index:1
        original_vocab.add_token_to_namespace("c", namespace="tokens2") # index:2

        original_vocab.add_token_to_namespace("p", namespace="tokens3") # index:0
        original_vocab.add_token_to_namespace("q", namespace="tokens3") # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens0": SingleIdTokenIndexer("tokens0")})
        text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                     "text4": text_field4, "text5": text_field5})])

        params = Params({"directory_path": vocab_dir,
                         "extend": True,
                         "non_padded_namespaces": ["tokens1", "tokens5"]})
        extended_vocab = Vocabulary.from_params(params, instances)

        # namespaces tokens0 and tokens1 are common to both; tokens2 and tokens3
        # appear only in the vocab, tokens4 and tokens5 only in the instances.
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that the _non_padded_namespaces set is consistent after extension
        assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

        # original_vocab["tokens1"] has 3 tokens; the instances contribute 6 tokens
        # in the "tokens1" namespace, 3 of which overlap, so 3 new tokens are added
        assert extended_vocab.get_vocab_size("tokens1") == 6
        assert extended_vocab.get_vocab_size("tokens0") == 8 # 2 extra for padding and OOV because padded

        # namespaces tokens2 and tokens3 were only in original_vocab,
        # and their token counts should be unchanged in extended_vocab
        assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

        # namespaces tokens4 and tokens5 were only in instances,
        # and their token counts come entirely from the instances
        assert extended_vocab.get_vocab_size("tokens4") == 6 # l, m, n, o + padding + oov
        assert extended_vocab.get_vocab_size("tokens5") == 3 # x, y, z

        # The token-to-index mapping of every word in every namespace of
        # original_vocab should be preserved in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(token, namespace)
                assert vocab_index == extended_vocab_index
        # And the same for the index-to-token mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
                assert vocab_token == extended_vocab_token
Exemplo n.º 41
0
    def test_valid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / 'vocab_save'
        extension_ways = ["from_params", "extend_from_instances"]
        # Test: padded/non-padded common namespaces are extended appropriately
        non_padded_namespaces_list = [[], ["tokens"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace("d", namespace="tokens")
            original_vocab.add_token_to_namespace("a", namespace="tokens")
            original_vocab.add_token_to_namespace("b", namespace="tokens")
            text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                                   {"tokens": SingleIdTokenIndexer("tokens")})
            instances = Batch([Instance({"text": text_field})])
            for way in extension_ways:
                if way == "extend_from_instances":
                    extended_vocab = copy.copy(original_vocab)
                    params = Params({"non_padded_namespaces": non_padded_namespaces})
                    extended_vocab.extend_from_instances(params, instances)
                else:
                    shutil.rmtree(vocab_dir, ignore_errors=True)
                    original_vocab.save_to_files(vocab_dir)
                    params = Params({"directory_path": vocab_dir, "extend": True,
                                     "non_padded_namespaces": non_padded_namespaces})
                    extended_vocab = Vocabulary.from_params(params, instances)

                extra_count = 2 if extended_vocab.is_padded("tokens") else 0
                assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
                assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
                assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count

                assert extended_vocab.get_token_index("c", "tokens") # should be present
                assert extended_vocab.get_token_index("e", "tokens") # should be present

                assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

        # Test: padded/non-padded non-common namespaces are extended appropriately
        non_padded_namespaces_list = [[],
                                      ["tokens1"],
                                      ["tokens1", "tokens2"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace("a", namespace="tokens1") # index2
            text_field = TextField([Token(t) for t in ["b"]],
                                   {"tokens2": SingleIdTokenIndexer("tokens2")})
            instances = Batch([Instance({"text": text_field})])

            for way in extension_ways:
                if way == "extend_from_instances":
                    extended_vocab = copy.copy(original_vocab)
                    params = Params({"non_padded_namespaces": non_padded_namespaces})
                    extended_vocab.extend_from_instances(params, instances)
                else:
                    shutil.rmtree(vocab_dir, ignore_errors=True)
                    original_vocab.save_to_files(vocab_dir)
                    params = Params({"directory_path": vocab_dir, "extend": True,
                                     "non_padded_namespaces": non_padded_namespaces})
                    extended_vocab = Vocabulary.from_params(params, instances)

                # Should have two namespaces
                assert len(extended_vocab._token_to_index) == 2

                extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
                assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count

                extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
                assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count