Example #1
def initialize_vocabulary(directory, name, datasets=None, series_ids=None,
                          max_size=None):
    """This function is supposed to initialize vocabulary when called from the
    configuration file. It first checks whether the vocabulary is already
    loaded on the provided path and if not, it tries to generate it from
    the provided dataset.

    Arguments:
        directory: Directory where the vocabulary should be stored.
        name: Name of the vocabulary which is also the name of the file
              it is stored it.
        datasets: A a list of datasets from which the vocabulary can be
                  created.
        series_ids: A list of ids of series of the datasets that should be used
                    for producing the vocabulary.
    """
    file_name = os.path.join(directory, name + ".pickle")
    if os.path.exists(file_name):
        return Vocabulary.from_pickled(file_name)
    else:
        if datasets is None or series_ids is None or max_size is None:
            raise Exception("Vocabulary does not exist in \"{}\","+
                            "neither dataset and series_id were provided.")
        vocabulary = Vocabulary.from_datasets(datasets, series_ids, max_size)

        if not os.path.exists(directory):
            os.makedirs(directory)

        vocabulary.save_to_file(file_name)
        return vocabulary
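
A minimal usage sketch of the function above; the directory, vocabulary name, dataset object, and series id are hypothetical placeholders, not taken from the original code:

# Hypothetical call site, e.g. from a configuration factory.
# `train_data` stands in for a dataset object exposing a "source" series.
vocabulary = initialize_vocabulary(
    directory="experiments/vocab",
    name="source_vocab",
    datasets=[train_data],
    series_ids=["source"],
    max_size=50000)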
Example #2
    def test_save_and_load(self):
        """Try to save and load encoder."""
        vocabulary = Vocabulary()
        vocabulary.add_word("a")
        vocabulary.add_word("b")

        checkpoint_file = tempfile.NamedTemporaryFile(delete=False)
        checkpoint_file.close()

        encoder = SentenceEncoder(
            "enc", vocabulary, "data_id", 10, 20, 30,
            save_checkpoint=checkpoint_file.name,
            load_checkpoint=checkpoint_file.name)

        encoders_variables = tf.get_collection(
            tf.GraphKeys.VARIABLES, scope="enc")

        sess_1 = tf.Session()
        sess_1.run(tf.initialize_all_variables())
        encoder.save(sess_1)

        sess_2 = tf.Session()
        sess_2.run(tf.initialize_all_variables())
        encoder.load(sess_2)

        values_in_sess_1 = sess_1.run(encoders_variables)
        values_in_sess_2 = sess_2.run(encoders_variables)

        self.assertTrue(
            all(np.all(v1 == v2) for v1, v2 in
                zip(values_in_sess_1, values_in_sess_2)))

        os.remove(checkpoint_file.name)
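
The two-session pattern is what makes this test meaningful: sess_1 writes its randomly initialized weights to the checkpoint, sess_2 starts from a different random initialization, and encoder.load(sess_2) must overwrite those values so that the variables from both sessions compare equal.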
Example #3
class Word2Vec:
    def __init__(self, path: str, encoding: str = "utf-8") -> None:
        """Load the word2vec file."""
        check_argument_types()

        # Create the vocabulary object, load the words and vectors from the
        # file

        self.vocab = Vocabulary()
        embedding_vectors = []  # type: List[np.ndarray]

        with open(path, encoding=encoding) as f_data:

            header = next(f_data)
            emb_size = int(header.split()[1])

            # Add zero embeddings for padding, start, and end token
            embedding_vectors.append(np.zeros(emb_size))
            embedding_vectors.append(np.zeros(emb_size))
            embedding_vectors.append(np.zeros(emb_size))
            # Add placeholder for embedding of the unknown symbol
            embedding_vectors.append(None)

            for line in f_data:
                fields = line.split()
                word = fields[0]
                vector = np.fromiter((float(x) for x in fields[1:]),
                                     dtype=np.float64)

                assert vector.shape[0] == emb_size

                # Embedding of unknown token should be at index 3 to match the
                # vocabulary implementation
                if is_special_token(word):
                    embedding_vectors[SPECIAL_TOKENS.index(word)] = vector
                else:
                    self.vocab.add_word(word)
                    embedding_vectors.append(vector)

        assert embedding_vectors[3] is not None
        assert emb_size is not None

        self.embedding_matrix = np.stack(embedding_vectors)

    @property
    def vocabulary(self) -> Vocabulary:
        """Get a vocabulary object generated from this word2vec instance."""
        return self.vocab

    @property
    def embeddings(self) -> np.ndarray:
        """Get the embedding matrix."""
        return self.embedding_matrix
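
A short usage sketch for the class above, assuming a plain-text word2vec file at the hypothetical path "embeddings.txt" (a "<vocab_size> <emb_size>" header line followed by one "<word> <v_1> ... <v_n>" line per word):

# "embeddings.txt" is a placeholder path to a textual word2vec file.
w2v = Word2Vec("embeddings.txt")
vocab = w2v.vocabulary    # Vocabulary built from the file's words
matrix = w2v.embeddings   # ndarray, rows = special tokens + words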
Example #4
    def test_save_and_load(self):
        """Try to save and load encoder."""
        vocabulary = Vocabulary()
        vocabulary.add_word("a")
        vocabulary.add_word("b")

        checkpoint_file = tempfile.NamedTemporaryFile(delete=False)
        checkpoint_file.close()

        encoder = SentenceEncoder(name="enc",
                                  vocabulary=vocabulary,
                                  data_id="data_id",
                                  embedding_size=10,
                                  rnn_size=20,
                                  max_input_len=30,
                                  save_checkpoint=checkpoint_file.name,
                                  load_checkpoint=checkpoint_file.name)

        encoder.input_sequence.register_input()

        # NOTE: This assert needs to be here otherwise the model has
        # no parameters since the sentence encoder is initialized lazily
        self.assertIsInstance(encoder.temporal_states, tf.Tensor)

        encoders_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope="enc")

        sess_1 = tf.Session()
        sess_1.run(tf.global_variables_initializer())
        encoder.save(sess_1)

        sess_2 = tf.Session()
        sess_2.run(tf.global_variables_initializer())
        encoder.load(sess_2)

        values_in_sess_1 = sess_1.run(encoders_variables)
        values_in_sess_2 = sess_2.run(encoders_variables)

        self.assertTrue(
            all(
                np.all(v1 == v2)
                for v1, v2 in zip(values_in_sess_1, values_in_sess_2)))

        os.remove(checkpoint_file.name)
Example #5
    def setUp(self):
        self.decoder_params = dict(
            encoders=[],
            vocabulary=Vocabulary(["a", "b", "c"]),
            data_id="foo",
            name="test-decoder",
            max_output_len=5,
            dropout_keep_prob=1.0,
            embedding_size=10,
            rnn_size=10)
Example #6
    def test_init(self):
        decoder = Decoder(encoders=[],
                          vocabulary=Vocabulary(),
                          data_id="foo",
                          name="test-decoder",
                          max_output_len=5,
                          dropout_keep_prob=1.0,
                          embedding_size=10,
                          rnn_size=10)
        self.assertIsNotNone(decoder)
Example #7
    def test_reuse(self):
        vocabulary = Vocabulary()
        vocabulary.add_word("a")
        vocabulary.add_word("b")

        seq1 = EmbeddedSequence(name="seq1",
                                vocabulary=vocabulary,
                                data_id="id",
                                embedding_size=10)
        seq1.register_input()

        seq2 = EmbeddedSequence(name="seq2",
                                vocabulary=vocabulary,
                                embedding_size=10,
                                data_id="id")
        seq2.register_input()

        seq3 = EmbeddedSequence(name="seq3",
                                vocabulary=vocabulary,
                                data_id="id",
                                embedding_size=10,
                                reuse=seq1)
        seq3.register_input()

        # Accessing the embedding matrices forces their lazy construction
        self.assertIsNotNone(seq1.embedding_matrix)
        self.assertIsNotNone(seq2.embedding_matrix)
        self.assertIsNotNone(seq3.embedding_matrix)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        params = sess.run((seq1.embedding_matrix, seq2.embedding_matrix,
                           seq3.embedding_matrix))

        # seq1 and seq2 were built without sharing, so their embedding
        # matrices differ after random initialization
        with self.assertRaises(AssertionError):
            assert_array_equal(params[0], params[1])

        # seq3 was built with reuse=seq1, so it shares seq1's embedding matrix
        assert_array_equal(params[0], params[2])
Example #8
    def test_min_freq(self):

        vocabulary = Vocabulary()
        vocabulary.correct_counts = True

        for sentence in TOKENIZED_CORPUS:
            vocabulary.add_tokenized_text(sentence)

        vocabulary.truncate_by_min_freq(2)

        self.assertTrue("walrus" in vocabulary)
        self.assertFalse("colorless" in vocabulary)
Example #9
    def test_count_fail(self):

        vocabulary = Vocabulary()

        for sentence in TOKENIZED_CORPUS:
            vocabulary.add_tokenized_text(sentence)

        with self.assertRaises(ValueError):
            vocabulary.truncate_by_min_freq(2)
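
Read together, these two tests pin down a contract of truncate_by_min_freq: it raises ValueError unless the vocabulary was built with exact counting enabled. A minimal sketch of the happy path, using only the API exercised above:

# Assumes the same Vocabulary API as in the tests above.
vocabulary = Vocabulary()
vocabulary.correct_counts = True     # opt in to exact frequency counting
vocabulary.add_tokenized_text(["walrus", "walrus", "colorless"])
vocabulary.truncate_by_min_freq(2)   # keeps "walrus", drops "colorless"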
Example #10
    @classmethod
    def setUpClass(cls):
        tf.reset_default_graph()

        cls.corpus = [
            "the colorless ideas slept furiously",
            "pooh slept all night",
            "working class hero is something to be",
            "I am the working class walrus",
            "walrus for president"
        ]

        cls.graph = tf.Graph()

        with cls.graph.as_default():
            cls.tokenized_corpus = [s.split(" ") for s in cls.corpus]
            words = [w for sent in cls.tokenized_corpus for w in sent]
            cls.vocabulary = Vocabulary(list(set(words)))
Example #11
    def __init__(self, path: str, encoding: str = "utf-8") -> None:
        """Load the word2vec file.

        Args:
            path: Path to word embeddings stored in the word2vec format.
            encoding: File encoding.
        """
        check_argument_types()

        # Create the vocabulary object, load the words and vectors from the
        # file

        words = []  # type: List[str]
        embedding_vectors = []  # type: List[np.ndarray]

        with open(path, encoding=encoding) as f_data:

            header = next(f_data)
            emb_size = int(header.split()[1])

            # Add zero embeddings for padding, start, end, and unk token
            for _ in SPECIAL_TOKENS:
                embedding_vectors.append(np.zeros(emb_size))

            for line in f_data:
                fields = line.split()
                word = fields[0]
                vector = np.fromiter((float(x) for x in fields[1:]),
                                     dtype=np.float64)

                assert vector.shape[0] == emb_size

                # Embedding of unknown token should be at index 3 to match the
                # vocabulary implementation
                if word in SPECIAL_TOKENS:
                    embedding_vectors[SPECIAL_TOKENS.index(word)] = vector
                else:
                    words.append(word)
                    embedding_vectors.append(vector)

        self.vocab = Vocabulary(words)

        assert emb_size is not None

        self.embedding_matrix = np.stack(embedding_vectors)
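
Unlike the variant in Example #3, this one zero-initializes all special-token embeddings up front and collects the plain words into a list, building the Vocabulary in a single call at the end; it therefore needs no placeholder or index-3 assertion for the unknown-token embedding.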
Example #12
    @classmethod
    def setUpClass(cls):
        corpus = [
            "the colorless ideas slept furiously", "pooh slept all night",
            "working class hero is something to be",
            "I am the working class walrus", "walrus for president"
        ]

        tokenized_corpus = [[a + "_" for a in s.split()] for s in corpus]
        vocab_from_corpus = {w for sent in tokenized_corpus for w in sent}

        # Create list of characters required to process the CORPUS with
        # wordpieces
        corpus_chars = {x for c in set("".join(corpus)) for x in [c, c + "_"]}
        escape_chars = "\\_u0987654321;"
        c_caron = "\\269;"
        a_acute = "225"

        words = corpus_chars | set(escape_chars) | vocab_from_corpus
        vocabulary = Vocabulary(list(words) + [c_caron, a_acute])

        cls.preprocessor = WordpiecePreprocessor(vocabulary)
        cls.postprocessor = WordpiecePostprocessor
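
The trailing "_" appended to every corpus token reflects the wordpiece convention this preprocessor relies on: the underscore suffix marks the end of a word, which is why the character set includes both the bare and the underscore-suffixed form of every character.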
Example #13
import unittest

from neuralmonkey.vocabulary import Vocabulary

CORPUS = [
    "the colorless ideas slept furiously",
    "pooh slept all night",
    "working class hero is something to be",
    "I am the working class walrus",
    "walrus for president"
]

TOKENIZED_CORPUS = [s.split(" ") for s in CORPUS]

VOCABULARY = Vocabulary()

for s in TOKENIZED_CORPUS:
    VOCABULARY.add_tokenized_text(s)

class TestVocabulary(unittest.TestCase):
    def test_all_words_in(self):
        for sentence in TOKENIZED_CORPUS:
            for word in sentence:
                self.assertTrue(word in VOCABULARY)

    def test_unknown_word(self):
        self.assertFalse("jindrisek" in VOCABULARY)

    def test_padding(self):
        pass
Example #14
#!/usr/bin/env python3.5
"""Test init methods of encoders."""

import unittest
import copy

from typing import Dict, List, Any, Iterable

from neuralmonkey.encoders.recurrent import SentenceEncoder
from neuralmonkey.encoders.sentence_cnn_encoder import SentenceCNNEncoder
from neuralmonkey.model.sequence import EmbeddedSequence
from neuralmonkey.vocabulary import Vocabulary

VOCABULARY = Vocabulary(["ich", "bin", "der", "walrus"])
INPUT_SEQUENCE = EmbeddedSequence("seq", VOCABULARY, "marmelade", 300)

SENTENCE_ENCODER_GOOD = {
    "name": ["encoder"],
    "vocabulary": [VOCABULARY],
    "data_id": ["marmelade"],
    "embedding_size": [20],
    "rnn_size": [30],
    "max_input_len": [None, 15],
    "dropout_keep_prob": [0.5, 1.],
}

SENTENCE_ENCODER_BAD = {
    "nonexistent": ["ahoj"],
    "name": [None, 1],
    "vocabulary": [0, None, "ahoj", dict()],
    "data_id": [0, None, VOCABULARY],
#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-
""" Unit tests for the decoder. (Tests only initialization so far) """

import unittest
import copy

from neuralmonkey.decoders.decoder import Decoder
from neuralmonkey.vocabulary import Vocabulary

DECODER_PARAMS = dict(encoders=[],
                      vocabulary=Vocabulary(),
                      data_id="foo",
                      name="test-decoder",
                      max_output_len=5,
                      dropout_keep_prob=1.0,
                      embedding_size=10,
                      rnn_size=10)


class TestDecoder(unittest.TestCase):
    def test_init(self):
        decoder = Decoder(**DECODER_PARAMS)
        self.assertIsNotNone(decoder)

    def test_max_output_len(self):
        dparams = copy.deepcopy(DECODER_PARAMS)

        dparams["max_output_len"] = -10
        with self.assertRaises(ValueError):
            Decoder(**dparams)
Example #16
    def test_init(self):
        decoder = Decoder([], Vocabulary(), "foo", "test-decoder")
        self.assertIsNotNone(decoder)
Example #17
#!/usr/bin/env python3.5

import unittest

from neuralmonkey.vocabulary import Vocabulary

CORPUS = [
    "the colorless ideas slept furiously", "pooh slept all night",
    "working class hero is something to be", "I am the working class walrus",
    "walrus for president"
]

TOKENIZED_CORPUS = [s.split(" ") for s in CORPUS]

VOCABULARY = Vocabulary()

for s in TOKENIZED_CORPUS:
    VOCABULARY.add_tokenized_text(s)


class TestVocabulary(unittest.TestCase):
    def test_all_words_in(self):
        for sentence in TOKENIZED_CORPUS:
            for word in sentence:
                self.assertTrue(word in VOCABULARY)

    def test_unknown_word(self):
        self.assertFalse("jindrisek" in VOCABULARY)

    def test_padding(self):
        pass