Example #1
    def test_from_params(self):
        # Save a vocab to check we can load it from_params.
        vocab_dir = self.TEST_DIR / 'vocab_save'
        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")
        vocab.save_to_files(vocab_dir)

        params = Params({"directory_path": vocab_dir})
        vocab2 = Vocabulary.from_params(params)
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

        # Test case where we build a vocab from a dataset.
        vocab2 = Vocabulary.from_params(Params({}), self.dataset)
        assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                                  1: '@@UNKNOWN@@',
                                                                  2: 'a', 3: 'c', 4: 'b'}
        # Test from_params raises when we have neither a dataset nor a vocab directory.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({}))

        # Test from_params raises when there are any other dict keys
        # present apart from 'directory_path' and we aren't calling from_dataset.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({"directory_path": vocab_dir, "min_count": {'tokens': 2}}))
Example #2
    def test_invalid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
        original_vocab.add_token_to_namespace("a", namespace="tokens1")
        original_vocab.add_token_to_namespace("b", namespace="tokens1")
        original_vocab.add_token_to_namespace("p", namespace="tokens2")
        original_vocab.save_to_files(vocab_dir)
        text_field1 = TextField([Token(t) for t in ["a", "c"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                                {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

        # The following three cases should raise an error: tokens1 is non-padded
        # in original_vocab but padded in the requested extension.
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": []})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": []})
            extended_vocab.extend_from_instances(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            extended_vocab._extend(non_padded_namespaces=[],
                                   tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

        # The following three cases should not raise an error: overlapping namespaces
        # have the same padding setting.
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": ["tokens1"]})
        Vocabulary.from_params(params, instances)
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1"]})
        extended_vocab.extend_from_instances(params, instances)
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

        # The following three cases should raise an error: tokens2 is padded in
        # original_vocab but non-padded in the requested extension.
        params = Params({"directory_path": vocab_dir, "extend": True,
                         "non_padded_namespaces": ["tokens1", "tokens2"]})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
            extended_vocab.extend_from_instances(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                                   tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
Example #3
    def test_from_params_extend_config(self):

        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.save_to_files(vocab_dir)

        text_field = TextField([Token(t) for t in ["a", "b"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        instances = Batch([Instance({"text": text_field})])

        # If you ask to extend the vocab from `directory_path`, instances must also
        # be passed to from_params, or else there is nothing to extend with.
        params = Params({"directory_path": vocab_dir, "extend": True})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params)

        # If you ask to extend the vocab, the `directory_path` key must be present
        # in params, or else there is nothing to extend from.
        params = Params({"extend": True})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
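
A minimal, self-contained sketch (pre-1.0 AllenNLP API; the directory path and field names are illustrative) of the combination Example #3 requires: to extend, supply both a saved directory_path and instances to extend from.

from allennlp.common.params import Params
from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

original = Vocabulary()
original.add_token_to_namespace("a", namespace="tokens")
original.save_to_files("/tmp/vocab_demo")

text_field = TextField([Token("b")], {"tokens": SingleIdTokenIndexer("tokens")})
instances = Batch([Instance({"text": text_field})])

params = Params({"directory_path": "/tmp/vocab_demo", "extend": True})
extended = Vocabulary.from_params(params, instances)  # both sources present: no error
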
Example #4
    def test_saving_and_loading(self):
        # pylint: disable=protected-access
        vocab_dir = self.TEST_DIR / 'vocab_save'

        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")

        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)

        assert vocab2._non_padded_namespaces == {"a", "c"}

        # Check namespace a.
        assert vocab2.get_vocab_size(namespace='a') == 3
        assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
        assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
        assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
        assert vocab2.get_token_index('a0', namespace='a') == 0
        assert vocab2.get_token_index('a1', namespace='a') == 1
        assert vocab2.get_token_index('a2', namespace='a') == 2

        # Check namespace b.
        assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
        assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
        assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
        assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
        assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
        assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
        assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
        assert vocab2.get_token_index('b2', namespace='b') == 2
        assert vocab2.get_token_index('b3', namespace='b') == 3

        # Check the dictionaries containing the reverse mapping are identical.
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
Example #5
    def test_from_params_valid_vocab_extension_thoroughly(self):
        '''
        Tests valid vocab extension thoroughly: vocab extension is valid
        when overlapping namespaces have the same padding behaviour (padded/non-padded).
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        Typical extension example (tokens1 namespace):
        -> original_vocab index2token
           apple          #0->apple
           bat            #1->bat
           cat            #2->cat
        -> Tokens to extend with: cat, an, apple, banana, atom, bat
        -> extended_vocab: index2token
           apple           #0->apple
           bat             #1->bat
           cat             #2->cat
           an              #3->an
           atom            #4->atom
           banana          #5->banana
        '''

        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple", namespace="tokens0") # index:2
        original_vocab.add_token_to_namespace("bat", namespace="tokens0")   # index:3
        original_vocab.add_token_to_namespace("cat", namespace="tokens0")   # index:4

        original_vocab.add_token_to_namespace("apple", namespace="tokens1") # index:0
        original_vocab.add_token_to_namespace("bat", namespace="tokens1")   # index:1
        original_vocab.add_token_to_namespace("cat", namespace="tokens1")   # index:2

        original_vocab.add_token_to_namespace("a", namespace="tokens2") # index:0
        original_vocab.add_token_to_namespace("b", namespace="tokens2") # index:1
        original_vocab.add_token_to_namespace("c", namespace="tokens2") # index:2

        original_vocab.add_token_to_namespace("p", namespace="tokens3") # index:0
        original_vocab.add_token_to_namespace("q", namespace="tokens3") # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens0": SingleIdTokenIndexer("tokens0")})
        text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                     "text4": text_field4, "text5": text_field5})])

        params = Params({"directory_path": vocab_dir,
                         "extend": True,
                         "non_padded_namespaces": ["tokens1", "tokens5"]})
        extended_vocab = Vocabulary.from_params(params, instances)

        # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 appear
        # only in original_vocab, while tokens4 and tokens5 appear only in instances.
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that _non_padded_namespaces is consistent after extension
        assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

        # # original_vocab["tokens1"] has 3 tokens, instances of "tokens1" ns has 5 tokens. 2 overlapping
        assert extended_vocab.get_vocab_size("tokens1") == 6
        assert extended_vocab.get_vocab_size("tokens0") == 8 # 2 extra overlapping because padded

        # Namespaces tokens2 and tokens3 were only in original_vocab,
        # so their token counts should be unchanged in extended_vocab.
        assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

        # Namespaces tokens4 and tokens5 were only in instances, so their sizes
        # come entirely from the instance tokens (plus padding/OOV where padded).
        assert extended_vocab.get_vocab_size("tokens4") == 6 # l,m,n,o + oov + padding
        assert extended_vocab.get_vocab_size("tokens5") == 3 # x,y,z

        # Word2index mapping of all words in all namespaces of original_vocab
        # should be maintained in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(token, namespace)
                assert vocab_index == extended_vocab_index
        # And same for Index2Word mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
                assert vocab_token == extended_vocab_token
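
A minimal sketch of the indexing behaviour the docstring above relies on (pre-1.0 AllenNLP API; namespace names are illustrative): padded namespaces reserve index 0 for padding and 1 for OOV, while non-padded namespaces start at 0.

from allennlp.data import Vocabulary

v = Vocabulary(non_padded_namespaces=["labels"])
v.add_token_to_namespace("apple", namespace="tokens")     # padded namespace -> index 2
v.add_token_to_namespace("positive", namespace="labels")  # non-padded namespace -> index 0
assert v.get_token_index("apple", "tokens") == 2
assert v.get_token_index("positive", "labels") == 0
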
Example #6
    def test_valid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / 'vocab_save'
        extension_ways = ["from_params", "extend_from_instances"]
        # Test: padded/non-padded common namespaces are extended appropriately
        non_padded_namespaces_list = [[], ["tokens"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace("d", namespace="tokens")
            original_vocab.add_token_to_namespace("a", namespace="tokens")
            original_vocab.add_token_to_namespace("b", namespace="tokens")
            text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                                   {"tokens": SingleIdTokenIndexer("tokens")})
            instances = Batch([Instance({"text": text_field})])
            for way in extension_ways:
                if way == "extend_from_instances":
                    extended_vocab = copy.copy(original_vocab)
                    params = Params({"non_padded_namespaces": non_padded_namespaces})
                    extended_vocab.extend_from_instances(params, instances)
                else:
                    shutil.rmtree(vocab_dir, ignore_errors=True)
                    original_vocab.save_to_files(vocab_dir)
                    params = Params({"directory_path": vocab_dir, "extend": True,
                                     "non_padded_namespaces": non_padded_namespaces})
                    extended_vocab = Vocabulary.from_params(params, instances)

                extra_count = 2 if extended_vocab.is_padded("tokens") else 0
                assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
                assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
                assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count

                assert extended_vocab.get_token_index("c", "tokens") # should be present
                assert extended_vocab.get_token_index("e", "tokens") # should be present

                assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

        # Test: padded/non-padded non-common namespaces are extended appropriately
        non_padded_namespaces_list = [[],
                                      ["tokens1"],
                                      ["tokens1", "tokens2"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace("a", namespace="tokens1") # index2
            text_field = TextField([Token(t) for t in ["b"]],
                                   {"tokens2": SingleIdTokenIndexer("tokens2")})
            instances = Batch([Instance({"text": text_field})])

            for way in extension_ways:
                if way == "extend_from_instances":
                    extended_vocab = copy.copy(original_vocab)
                    params = Params({"non_padded_namespaces": non_padded_namespaces})
                    extended_vocab.extend_from_instances(params, instances)
                else:
                    shutil.rmtree(vocab_dir, ignore_errors=True)
                    original_vocab.save_to_files(vocab_dir)
                    params = Params({"directory_path": vocab_dir, "extend": True,
                                     "non_padded_namespaces": non_padded_namespaces})
                    extended_vocab = Vocabulary.from_params(params, instances)

                # Should have two namespaces
                assert len(extended_vocab._token_to_index) == 2

                extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
                assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count

                extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
                assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
Example #7
    def test_from_params_valid_vocab_extension_thoroughly(self):
        """
        Tests valid vocab extension thoroughly: vocab extension is valid
        when overlapping namespaces have the same padding behaviour (padded/non-padded).
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        Typical extension example (tokens1 namespace):
        -> original_vocab index2token
           apple          #0->apple
           bat            #1->bat
           cat            #2->cat
        -> Tokens to extend with: cat, an, apple, banana, atom, bat
        -> extended_vocab: index2token
           apple           #0->apple
           bat             #1->bat
           cat             #2->cat
           an              #3->an
           atom            #4->atom
           banana          #5->banana
        """

        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(
            non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple",
                                              namespace="tokens0")  # index:2
        original_vocab.add_token_to_namespace("bat",
                                              namespace="tokens0")  # index:3
        original_vocab.add_token_to_namespace("cat",
                                              namespace="tokens0")  # index:4

        original_vocab.add_token_to_namespace("apple",
                                              namespace="tokens1")  # index:0
        original_vocab.add_token_to_namespace("bat",
                                              namespace="tokens1")  # index:1
        original_vocab.add_token_to_namespace("cat",
                                              namespace="tokens1")  # index:2

        original_vocab.add_token_to_namespace("a",
                                              namespace="tokens2")  # index:0
        original_vocab.add_token_to_namespace("b",
                                              namespace="tokens2")  # index:1
        original_vocab.add_token_to_namespace("c",
                                              namespace="tokens2")  # index:2

        original_vocab.add_token_to_namespace("p",
                                              namespace="tokens3")  # index:0
        original_vocab.add_token_to_namespace("q",
                                              namespace="tokens3")  # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField([
            Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]
        ], {"tokens0": SingleIdTokenIndexer("tokens0")})
        text_field1 = TextField([
            Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]
        ], {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([
            Instance({
                "text0": text_field0,
                "text1": text_field1,
                "text4": text_field4,
                "text5": text_field5
            })
        ])

        params = Params({
            "directory_path": vocab_dir,
            "extend": True,
            "non_padded_namespaces": ["tokens1", "tokens5"]
        })
        extended_vocab = Vocabulary.from_params(params, instances)

        # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 appear
        # only in original_vocab, while tokens4 and tokens5 appear only in instances.
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that _non_padded_namespaces is consistent after extension
        assert extended_vocab._non_padded_namespaces == {
            "tokens1", "tokens3", "tokens5"
        }

        # # original_vocab["tokens1"] has 3 tokens, instances of "tokens1" ns has 5 tokens. 2 overlapping
        assert extended_vocab.get_vocab_size("tokens1") == 6
        # 2 more than tokens1 because padded namespaces also hold padding and OOV.
        assert extended_vocab.get_vocab_size("tokens0") == 8

        # Namespaces tokens2 and tokens3 were only in original_vocab,
        # so their token counts should be unchanged in extended_vocab.
        assert extended_vocab.get_vocab_size(
            "tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size(
            "tokens3") == original_vocab.get_vocab_size("tokens3")

        # Namespaces tokens4 and tokens5 were only in instances, so their sizes
        # come entirely from the instance tokens (plus padding/OOV where padded).
        assert extended_vocab.get_vocab_size(
            "tokens4") == 6  # l,m,n,o + oov + padding
        assert extended_vocab.get_vocab_size("tokens5") == 3  # x,y,z

        # Word2index mapping of all words in all namespaces of original_vocab
        # should be maintained in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(
                    token, namespace)
                assert vocab_index == extended_vocab_index
        # And same for Index2Word mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(
                    index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(
                    index, namespace)
                assert vocab_token == extended_vocab_token
Example #8
    def test_valid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / 'vocab_save'
        extension_ways = ["from_params", "extend_from_instances"]
        # Test: padded/non-padded common namespaces are extended appropriately
        non_padded_namespaces_list = [[], ["tokens"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(
                non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_tokens_to_namespace(["d", "a", "b"],
                                                   namespace="tokens")
            text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                                   {"tokens": SingleIdTokenIndexer("tokens")})
            instances = Batch([Instance({"text": text_field})])
            for way in extension_ways:
                if way == "extend_from_instances":
                    extended_vocab = copy.copy(original_vocab)
                    params = Params(
                        {"non_padded_namespaces": non_padded_namespaces})
                    extended_vocab.extend_from_instances(params, instances)
                else:
                    shutil.rmtree(vocab_dir, ignore_errors=True)
                    original_vocab.save_to_files(vocab_dir)
                    params = Params({
                        "directory_path": vocab_dir,
                        "extend": True,
                        "non_padded_namespaces": non_padded_namespaces
                    })
                    extended_vocab = Vocabulary.from_params(params, instances)

                extra_count = 2 if extended_vocab.is_padded("tokens") else 0
                assert extended_vocab.get_token_index(
                    "d", "tokens") == 0 + extra_count
                assert extended_vocab.get_token_index(
                    "a", "tokens") == 1 + extra_count
                assert extended_vocab.get_token_index(
                    "b", "tokens") == 2 + extra_count

                assert extended_vocab.get_token_index(
                    "c", "tokens")  # should be present
                assert extended_vocab.get_token_index(
                    "e", "tokens")  # should be present

                assert extended_vocab.get_vocab_size(
                    "tokens") == 5 + extra_count

        # Test: padded/non-padded non-common namespaces are extended appropriately
        non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(
                non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace(
                "a", namespace="tokens1")  # index2
            text_field = TextField(
                [Token(t) for t in ["b"]],
                {"tokens2": SingleIdTokenIndexer("tokens2")})
            instances = Batch([Instance({"text": text_field})])

            for way in extension_ways:
                if way == "extend_from_instances":
                    extended_vocab = copy.copy(original_vocab)
                    params = Params(
                        {"non_padded_namespaces": non_padded_namespaces})
                    extended_vocab.extend_from_instances(params, instances)
                else:
                    shutil.rmtree(vocab_dir, ignore_errors=True)
                    original_vocab.save_to_files(vocab_dir)
                    params = Params({
                        "directory_path": vocab_dir,
                        "extend": True,
                        "non_padded_namespaces": non_padded_namespaces
                    })
                    extended_vocab = Vocabulary.from_params(params, instances)

                # Should have two namespaces
                assert len(extended_vocab._token_to_index) == 2

                extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
                assert extended_vocab.get_vocab_size(
                    "tokens1") == 1 + extra_count

                extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
                assert extended_vocab.get_vocab_size(
                    "tokens2") == 1 + extra_count
Example #9
    def test_invalid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
        original_vocab.add_tokens_to_namespace(["a", "b"], namespace="tokens1")
        original_vocab.add_token_to_namespace("p", namespace="tokens2")
        original_vocab.save_to_files(vocab_dir)
        text_field1 = TextField([Token(t) for t in ["a", "c"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                                {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch(
            [Instance({
                "text1": text_field1,
                "text2": text_field2
            })])

        # The following three cases should raise an error: tokens1 is non-padded
        # in original_vocab but padded in the requested extension.
        params = Params({
            "directory_path": vocab_dir,
            "extend": True,
            "non_padded_namespaces": []
        })
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": []})
            extended_vocab.extend_from_instances(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            extended_vocab._extend(non_padded_namespaces=[],
                                   tokens_to_add={
                                       "tokens1": ["a"],
                                       "tokens2": ["p"]
                                   })

        # The following three cases should not raise an error: overlapping namespaces
        # have the same padding setting.
        params = Params({
            "directory_path": vocab_dir,
            "extend": True,
            "non_padded_namespaces": ["tokens1"]
        })
        Vocabulary.from_params(params, instances)
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1"]})
        extended_vocab.extend_from_instances(params, instances)
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1"],
                               tokens_to_add={
                                   "tokens1": ["a"],
                                   "tokens2": ["p"]
                               })

        # The following three cases should raise an error: tokens2 is padded in
        # original_vocab but non-padded in the requested extension.
        params = Params({
            "directory_path": vocab_dir,
            "extend": True,
            "non_padded_namespaces": ["tokens1", "tokens2"]
        })
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
            extended_vocab.extend_from_instances(params, instances)
        with pytest.raises(ConfigurationError):
            extended_vocab = copy.copy(original_vocab)
            extended_vocab._extend(
                non_padded_namespaces=["tokens1", "tokens2"],
                tokens_to_add={
                    "tokens1": ["a"],
                    "tokens2": ["p"]
                })
Example #10
#  

loader = IrTupleDatasetReader(lazy=True, lowercase=args.lowercase)


def getInstances():
    for file in args.dataset_files:
        instances = loader.read(file)
        for i in instances:
            yield Instance({"text":i["target_tokens"]})

namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
for instance in Tqdm.tqdm(getInstances()):
    instance.count_vocab_items(namespace_token_counts)

#with open(args.out_dir,"w",encoding="utf8") as out:
#    for n in namespace_token_counts:
#        #out.write("--"+n+"\n")
#        for w,i in namespace_token_counts[n].items():
#            out.write(w+"\t"+str(i)+"\n")

vocab = Vocabulary(namespace_token_counts, min_count={"tokens":100})
vocab.save_to_files(args.out_dir)

#vocab = Vocabulary(namespace_token_counts, min_count={"tokens":50})
#vocab.save_to_files(args.out_dir2)

#vocab = Vocabulary(namespace_token_counts, min_count={"tokens":10})
#vocab.save_to_files(args.out_dir3)
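
A minimal, self-contained sketch (field and namespace names are illustrative) of what count_vocab_items() accumulates before the Vocabulary above is built from the counts:

from collections import defaultdict
from typing import Dict

from allennlp.data import Instance, Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
text_field = TextField([Token(t) for t in ["a", "a", "b"]],
                       {"tokens": SingleIdTokenIndexer("tokens")})
Instance({"text": text_field}).count_vocab_items(counts)
print(dict(counts["tokens"]))  # expected: {'a': 2, 'b': 1}
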

Example #11
# coding=utf-8
# @Author: 莫冉
# @Date: 2020-08-06

from allennlp.data.vocabulary import Vocabulary

vocab_file = "../data/base_bert/vocab.txt"
save_path = "../../../vocab_path"

vocab = Vocabulary(padding_token="[PAD]", oov_token="[UNK]")

vocab.set_from_file(vocab_file, is_padded=True, oov_token="[UNK]")

vocab.save_to_files(save_path)

print(vocab.get_token_index(vocab._oov_token))
Example #12
        self.cuda_device = cuda_device

    def _extract_data(self, batch) -> np.ndarray:
        out_dict = self.model(**batch)
        return expit(tonp(out_dict["class_logits"]))

    def predict(self, ds: Iterable[Instance]) -> np.ndarray:
        pred_generator = self.iterator(ds, num_epochs=1, shuffle=False)
        self.model.eval()
        pred_generator_tqdm = tqdm(pred_generator,
                                   total=self.iterator.get_num_batches(ds))
        preds = []
        with torch.no_grad():
            for batch in pred_generator_tqdm:
                batch = nn_util.move_to_device(batch, self.cuda_device)
                preds.append(self._extract_data(batch))
        return np.concatenate(preds, axis=0)

# Save Model & Results
with open("BERT_model.th", 'wb') as f:
    torch.save(model.state_dict(), f)

vocab.save_to_files("BERT_vocabulary")

with open('BERT_metrics.json', "w") as file:
    json.dump(metrics, file, indent=4)

with open('test_predictions_list.pkl', 'wb') as f:
    pickle.dump(test_prediction_list, f)

Example #13
#
# load data & create vocab
# -------------------------------
#

loader = IrTupleDatasetReader(lazy=True,
                              source_tokenizer=BlingFireTokenizer(),
                              target_tokenizer=BlingFireTokenizer(),
                              lowercase=args.lowercase)


def getInstances():
    for file in args.dataset_files:
        instances = loader.read(file)
        for i in instances:
            yield Instance({"text": i["target_tokens"]})


namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
for instance in Tqdm.tqdm(getInstances()):
    instance.count_vocab_items(namespace_token_counts)

for count in [5, 10, 25, 50, 100]:
    vocab = Vocabulary(namespace_token_counts, min_count={"tokens": count})
    vocab.save_to_files(args.out_dir + str(count))

vocab = Vocabulary(namespace_token_counts, min_count={"tokens": 1})
vocab.save_to_files(args.out_dir + "full")
Example #14
df = pd.DataFrame()
df['y_true'] = y_true
df['y_pred'] = y_pred
df.to_csv(y_true_pred_val_path, index=False)

# %%
test_list = read_json(cached_path(TEST_PATH))
claim_predictor = ClaimCrfPredictor(model, dataset_reader=reader)
y_pred, y_true = [], []
for tst in test_dataset:
    pred = claim_predictor.predict_instance(tst)
    logits = torch.FloatTensor(pred['logits'])
    predicted_labels = pred['labels']
    y_pred.extend(predicted_labels)
    y_true.extend(tst['labels'])
y_true = np.array(y_true).astype(int)
y_pred = np.array(y_pred).astype(int)
print('Test score:',
      precision_recall_fscore_support(y_true, y_pred, average='binary'))

# Save model
with open(f"./finetune_model.th", "wb") as f:
    torch.save(model.state_dict(), f)
vocab.save_to_files(f"./finetune_vocab")

# Save y_true and y_pred
df = pd.DataFrame()
df['y_true'] = y_true
df['y_pred'] = y_pred
df.to_csv(y_true_pred_test_path, index=False)
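
A minimal sketch of reloading what this script saves; the model class and its constructor arguments are not shown above, so that part is left as hypothetical comments.

import torch

from allennlp.data import Vocabulary

vocab = Vocabulary.from_files("./finetune_vocab")
# model = MyModel(vocab, ...)                    # hypothetical: rebuild the same model
# with open("./finetune_model.th", "rb") as f:
#     model.load_state_dict(torch.load(f))
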
Example #15
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 text_encoder: Seq2SeqEncoder,
                 variational_autoencoder: FeedForward = None,
                 sentiment_classifier: FeedForward = None,
                 topic_dim: int = 20,
                 freeze_feature_extraction: bool = False,
                 classification_mode: bool = False,
                 pretrained_file: str = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(TopicRNN, self).__init__(vocab, regularizer)

        self.metrics = {
            'cross_entropy': Average(),
            'negative_kl_divergence': Average(),
            'stopword_loss': Average()
        }

        self.classification_mode = classification_mode
        if classification_mode:
            self.metrics['sentiment'] = CategoricalAccuracy()

        if pretrained_file:
            archive = load_archive(pretrained_file)
            pretrained_model = archive.model
            self._init_from_archive(pretrained_model)
        else:
            # Model parameter definition.
            #
            # Defaults reflect Dieng et al.'s decisions when training their semi-unsupervised
            # IMDB sentiment classifier.
            self.text_field_embedder = text_field_embedder
            self.vocab_size = self.vocab.get_vocab_size("tokens")
            self.text_encoder = text_encoder
            self.topic_dim = topic_dim
            self.vocabulary_projection_layer = TimeDistributed(
                Linear(text_encoder.get_output_dim(), self.vocab_size))

            # Parameter gamma from the paper; projects hidden states into binary logits for whether a
            # word is a stopword.
            self.stopword_projection_layer = TimeDistributed(
                Linear(text_encoder.get_output_dim(), 2))

            self.tokens_to_index = vocab.get_token_to_index_vocabulary()

            # This step should only ever be performed ONCE.
            # When running allennlp train, the vocabulary will be constructed before the model instantiation, but
            # we can't create the stopless namespace until we get here.
            # Check if there already exists a stopless namespace: if so refrain from altering it.
            if "stopless" not in vocab._token_to_index.keys():
                assert self.tokens_to_index[DEFAULT_PADDING_TOKEN] == 0 and \
                       self.tokens_to_index[DEFAULT_OOV_TOKEN] == 1
                for token, _ in self.tokens_to_index.items():
                    if token not in STOP_WORDS:
                        vocab.add_token_to_namespace(token, "stopless")

                # Since a vocabulary with the stopless namespace hasn't been saved, save one for convenience.
                vocab.save_to_files("vocabulary")

            # Compute stop indices in the normal vocab space to prevent stop words
            # from contributing to the topic additions.
            self.stop_indices = torch.LongTensor(
                [vocab.get_token_index(stop) for stop in STOP_WORDS])

            # Learnable topics.
            # TODO: How should these be initialized?
            self.beta = nn.Parameter(torch.rand(topic_dim, self.vocab_size))

            # mu: The mean of the variational distribution.
            self.mu_linear = nn.Linear(topic_dim, topic_dim)

            # sigma: The root standard deviation of the variational distribution.
            self.sigma_linear = nn.Linear(topic_dim, topic_dim)

            # noise: used when sampling.
            self.noise = MultivariateNormal(torch.zeros(topic_dim),
                                            torch.eye(topic_dim))

            stopless_dim = vocab.get_vocab_size("stopless")
            self.variational_autoencoder = variational_autoencoder or FeedForward(
                # Takes as input the word frequencies in the stopless dimension and projects
                # the word frequencies into a latent topic representation.
                #
                # Each latent representation will help tune the variational dist.'s parameters.
                stopless_dim,
                3,
                [500, 500, topic_dim],
                torch.nn.ReLU(),
            )

            # The shape for the feature vector for sentiment classification.
            # (RNN Hidden Size + Inference Network output dimension).
            sentiment_input_size = text_encoder.get_output_dim() + topic_dim
            self.sentiment_classifier = sentiment_classifier or FeedForward(
                # As done by the paper; a simple single layer with 50 hidden units
                # and sigmoid activation for sentiment classification.
                sentiment_input_size,
                2,
                [50, 2],
                torch.nn.Sigmoid(),
            )

        if freeze_feature_extraction:
            # Freeze the RNN and VAE pipeline so that only the classifier is trained.
            for name, param in self.named_parameters():
                if "sentiment_classifier" not in name:
                    param.requires_grad = False

        self.sentiment_criterion = nn.CrossEntropyLoss()

        self.num_samples = 50

        initializer(self)
Example #16
    def from_params(params: Params,
                    serialization_dir: str,
                    recover: bool = False) -> 'TrainerPieces':
        # all_datasets = datasets_from_params(params)
        corpus = Corpus.from_params(params.pop('corpus'))
        # datasets_for_vocab_creation = set(params.pop(
        #     "datasets_for_vocab_creation", all_datasets))

        # for dataset in datasets_for_vocab_creation:
        #     if dataset not in all_datasets:
        #         raise ConfigurationError(
        #             f"invalid 'dataset_for_vocab_creation' {dataset}")

        # logger.info("From dataset instances, %s will be considered for vocabulary creation.",
        #             ", ".join(datasets_for_vocab_creation))

        seed = params.pop_int("seed", 5678)
        vocab_params = params.pop("vocabulary", {})
        vocab_type = vocab_params.get("type", "default")
        if vocab_type == 'default' and os.path.exists(
                os.path.join(serialization_dir, "vocabulary")):
            vocab = Vocabulary.from_files(
                os.path.join(serialization_dir, "vocabulary"))
        elif vocab_type == 'empty':
            vocab = Vocabulary()
        else:
            seed_environment(seed)
            vocab = Vocabulary.from_params(vocab_params, corpus.train)

        # Need to reset the seed. Otherwise loading existing vocab and creating
        # vocab from scratch will lead to different behavior.
        seed_environment(seed)
        # contextualizer_params = params.pop('contextualizer')
        # contextualizer = Seq2SeqDecoder.from_params(
        #     vocab=vocab, params=contextualizer_params)

        model = Model.from_params(vocab=vocab, params=params.pop('model'))

        # If vocab extension is ON for training, embedding extension should also be
        # done. If vocab and embeddings are already in sync, it would be a no-op.
        model.extend_embedder_vocab()

        # Initializing the model can have the side effect of expanding the vocabulary.
        vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

        iterator = DataIterator.from_params(params.pop("iterator"))
        iterator.index_with(model.vocab)
        validation_iterator_params = params.pop("validation_iterator", None)
        if validation_iterator_params:
            validation_iterator = DataIterator.from_params(
                validation_iterator_params)
            validation_iterator.index_with(model.vocab)
        else:
            validation_iterator = None

        # train_data = all_datasets['train']
        # validation_data = all_datasets.get('validation')
        # test_data = all_datasets.get('test')

        trainer_params = params.pop("trainer")
        no_grad_regexes = trainer_params.pop("no_grad", ())
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad_regexes):
                parameter.requires_grad_(False)

        frozen_parameter_names, tunable_parameter_names = get_frozen_and_tunable_parameter_names(
            model)
        logger.info("Following parameters are Frozen  (without gradient):")
        for name in frozen_parameter_names:
            logger.info(name)
        logger.info("Following parameters are Tunable (with gradient):")
        for name in tunable_parameter_names:
            logger.info(name)

        batch_weight_key = params.pop('batch_weight_key', '')

        return TrainerPieces(model, iterator, corpus, validation_iterator,
                             batch_weight_key, trainer_params)