def initialize_vocabulary(directory, name, datasets=None,
                          series_ids=None, max_size=None):
    """Initialize a vocabulary when called from a configuration file.

    First check whether the vocabulary is already pickled on the provided
    path; if not, try to generate it from the provided datasets.

    Arguments:
        directory: Directory where the vocabulary should be stored.
        name: Name of the vocabulary, which is also the name of the file
            in which it is stored.
        datasets: A list of datasets from which the vocabulary can be
            created.
        series_ids: A list of ids of series of the datasets that should be
            used for producing the vocabulary.
        max_size: The maximum size of the vocabulary.
    """
    file_name = os.path.join(directory, name + ".pickle")

    if os.path.exists(file_name):
        return Vocabulary.from_pickled(file_name)

    if datasets is None or series_ids is None or max_size is None:
        raise Exception(
            "Vocabulary does not exist in '{}', and no datasets, "
            "series_ids, or max_size were provided.".format(file_name))

    vocabulary = Vocabulary.from_datasets(datasets, series_ids, max_size)

    if not os.path.exists(directory):
        os.makedirs(directory)

    vocabulary.save_to_file(file_name)
    return vocabulary
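# A minimal usage sketch of the caching behaviour above, assuming only the
# Vocabulary API already shown in this file (add_word, save_to_file,
# from_pickled); the directory and vocabulary name are hypothetical.
import os
import tempfile

from neuralmonkey.vocabulary import Vocabulary

vocab_dir = tempfile.mkdtemp()

vocabulary = Vocabulary()
vocabulary.add_word("walrus")
vocabulary.save_to_file(os.path.join(vocab_dir, "demo.pickle"))

# Since the pickled file now exists, the call loads it directly instead of
# building a new vocabulary from datasets.
cached = initialize_vocabulary(vocab_dir, "demo")
assert "walrus" in cached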
def test_save_and_load(self):
    """Try to save and load encoder."""
    vocabulary = Vocabulary()
    vocabulary.add_word("a")
    vocabulary.add_word("b")

    checkpoint_file = tempfile.NamedTemporaryFile(delete=False)
    checkpoint_file.close()

    encoder = SentenceEncoder(
        "enc", vocabulary, "data_id", 10, 20, 30,
        save_checkpoint=checkpoint_file.name,
        load_checkpoint=checkpoint_file.name)

    encoders_variables = tf.get_collection(
        tf.GraphKeys.VARIABLES, scope="enc")

    sess_1 = tf.Session()
    sess_1.run(tf.initialize_all_variables())
    encoder.save(sess_1)

    sess_2 = tf.Session()
    sess_2.run(tf.initialize_all_variables())
    encoder.load(sess_2)

    values_in_sess_1 = sess_1.run(encoders_variables)
    values_in_sess_2 = sess_2.run(encoders_variables)

    self.assertTrue(
        all(np.all(v1 == v2)
            for v1, v2 in zip(values_in_sess_1, values_in_sess_2)))

    os.remove(checkpoint_file.name)
class Word2Vec:

    def __init__(self, path: str, encoding: str = "utf-8") -> None:
        """Load the word2vec file."""
        check_argument_types()

        # Create the vocabulary object, load the words and vectors from
        # the file
        self.vocab = Vocabulary()

        embedding_vectors = []  # type: List[np.ndarray]

        with open(path, encoding=encoding) as f_data:
            header = next(f_data)
            emb_size = int(header.split()[1])

            # Add zero embeddings for padding, start, and end token
            embedding_vectors.append(np.zeros(emb_size))
            embedding_vectors.append(np.zeros(emb_size))
            embedding_vectors.append(np.zeros(emb_size))
            # Add placeholder for embedding of the unknown symbol
            embedding_vectors.append(None)

            for line in f_data:
                fields = line.split()
                word = fields[0]
                vector = np.fromiter((float(x) for x in fields[1:]),
                                     dtype=np.float)

                assert vector.shape[0] == emb_size

                # Embedding of unknown token should be at index 3 to match
                # the vocabulary implementation
                if is_special_token(word):
                    embedding_vectors[SPECIAL_TOKENS.index(word)] = vector
                else:
                    self.vocab.add_word(word)
                    embedding_vectors.append(vector)

        assert embedding_vectors[3] is not None
        assert emb_size is not None

        self.embedding_matrix = np.stack(embedding_vectors)

    @property
    def vocabulary(self) -> Vocabulary:
        """Get a vocabulary object generated from this word2vec instance."""
        return self.vocab

    @property
    def embeddings(self) -> np.ndarray:
        """Get the embedding matrix."""
        return self.embedding_matrix
def test_save_and_load(self):
    """Try to save and load encoder."""
    vocabulary = Vocabulary()
    vocabulary.add_word("a")
    vocabulary.add_word("b")

    checkpoint_file = tempfile.NamedTemporaryFile(delete=False)
    checkpoint_file.close()

    encoder = SentenceEncoder(name="enc",
                              vocabulary=vocabulary,
                              data_id="data_id",
                              embedding_size=10,
                              rnn_size=20,
                              max_input_len=30,
                              save_checkpoint=checkpoint_file.name,
                              load_checkpoint=checkpoint_file.name)
    encoder.input_sequence.register_input()

    # NOTE: This assert needs to be here, otherwise the model has no
    # parameters, since the sentence encoder is initialized lazily.
    self.assertIsInstance(encoder.temporal_states, tf.Tensor)

    encoders_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope="enc")

    sess_1 = tf.Session()
    sess_1.run(tf.global_variables_initializer())
    encoder.save(sess_1)

    sess_2 = tf.Session()
    sess_2.run(tf.global_variables_initializer())
    encoder.load(sess_2)

    values_in_sess_1 = sess_1.run(encoders_variables)
    values_in_sess_2 = sess_2.run(encoders_variables)

    self.assertTrue(
        all(np.all(v1 == v2)
            for v1, v2 in zip(values_in_sess_1, values_in_sess_2)))

    os.remove(checkpoint_file.name)
def setUp(self):
    self.decoder_params = dict(
        encoders=[],
        vocabulary=Vocabulary(["a", "b", "c"]),
        data_id="foo",
        name="test-decoder",
        max_output_len=5,
        dropout_keep_prob=1.0,
        embedding_size=10,
        rnn_size=10)
def test_init(self):
    decoder = Decoder(encoders=[],
                      vocabulary=Vocabulary(),
                      data_id="foo",
                      name="test-decoder",
                      max_output_len=5,
                      dropout_keep_prob=1.0,
                      embedding_size=10,
                      rnn_size=10)
    self.assertIsNotNone(decoder)
def test_reuse(self):
    vocabulary = Vocabulary()
    vocabulary.add_word("a")
    vocabulary.add_word("b")

    seq1 = EmbeddedSequence(name="seq1",
                            vocabulary=vocabulary,
                            data_id="id",
                            embedding_size=10)
    seq1.register_input()

    seq2 = EmbeddedSequence(name="seq2",
                            vocabulary=vocabulary,
                            embedding_size=10,
                            data_id="id")
    seq2.register_input()

    seq3 = EmbeddedSequence(name="seq3",
                            vocabulary=vocabulary,
                            data_id="id",
                            embedding_size=10,
                            reuse=seq1)
    seq3.register_input()

    # "Blessing": accessing the embedding matrices forces the lazily
    # initialized sequences to create their variables.
    self.assertIsNotNone(seq1.embedding_matrix)
    self.assertIsNotNone(seq2.embedding_matrix)
    self.assertIsNotNone(seq3.embedding_matrix)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    params = sess.run((seq1.embedding_matrix,
                       seq2.embedding_matrix,
                       seq3.embedding_matrix))

    # seq2 has its own parameters, whereas seq3 shares them with seq1.
    with self.assertRaises(AssertionError):
        assert_array_equal(params[0], params[1])

    assert_array_equal(params[0], params[2])
def test_min_freq(self):
    vocabulary = Vocabulary()
    vocabulary.correct_counts = True

    for sentence in TOKENIZED_CORPUS:
        vocabulary.add_tokenized_text(sentence)

    vocabulary.truncate_by_min_freq(2)

    self.assertTrue("walrus" in vocabulary)
    self.assertFalse("colorless" in vocabulary)
def test_count_fail(self):
    vocabulary = Vocabulary()
    for sentence in TOKENIZED_CORPUS:
        vocabulary.add_tokenized_text(sentence)

    with self.assertRaises(ValueError):
        vocabulary.truncate_by_min_freq(2)
@classmethod
def setUpClass(cls):
    tf.reset_default_graph()

    cls.corpus = [
        "the colorless ideas slept furiously",
        "pooh slept all night",
        "working class hero is something to be",
        "I am the working class walrus",
        "walrus for president"
    ]

    cls.graph = tf.Graph()
    with cls.graph.as_default():
        cls.tokenized_corpus = [s.split(" ") for s in cls.corpus]
        words = [w for sent in cls.tokenized_corpus for w in sent]
        cls.vocabulary = Vocabulary(list(set(words)))
def __init__(self, path: str, encoding: str = "utf-8") -> None:
    """Load the word2vec file.

    Args:
        path: Path to word embeddings stored in the word2vec format.
        encoding: File encoding.
    """
    check_argument_types()

    # Create the vocabulary object, load the words and vectors from
    # the file
    words = []  # type: List[str]
    embedding_vectors = []  # type: List[np.ndarray]

    with open(path, encoding=encoding) as f_data:
        header = next(f_data)
        emb_size = int(header.split()[1])

        # Add zero embeddings for padding, start, end, and unk token
        for _ in SPECIAL_TOKENS:
            embedding_vectors.append(np.zeros(emb_size))

        for line in f_data:
            fields = line.split()
            word = fields[0]
            vector = np.fromiter((float(x) for x in fields[1:]),
                                 dtype=np.float)

            assert vector.shape[0] == emb_size

            # Embedding of unknown token should be at index 3 to match
            # the vocabulary implementation
            if word in SPECIAL_TOKENS:
                embedding_vectors[SPECIAL_TOKENS.index(word)] = vector
            else:
                words.append(word)
                embedding_vectors.append(vector)

    self.vocab = Vocabulary(words)

    assert emb_size is not None
    self.embedding_matrix = np.stack(embedding_vectors)
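# A short usage sketch for the Word2Vec wrapper above. The tiny embedding
# file written here is made up for illustration; a real word2vec text file
# would be used in practice.
import tempfile

with tempfile.NamedTemporaryFile("w", suffix=".w2v", delete=False) as f:
    f.write("2 3\n")                 # header: vocabulary size, embedding size
    f.write("walrus 0.1 0.2 0.3\n")
    f.write("pooh 0.4 0.5 0.6\n")
    w2v_path = f.name

w2v = Word2Vec(w2v_path)

# Four zero rows for the special tokens precede the two word vectors, so the
# matrix has six rows and can directly initialize an embedding layer.
assert "walrus" in w2v.vocabulary
assert w2v.embeddings.shape == (6, 3)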
@classmethod
def setUpClass(cls):
    corpus = [
        "the colorless ideas slept furiously",
        "pooh slept all night",
        "working class hero is something to be",
        "I am the working class walrus",
        "walrus for president"
    ]

    tokenized_corpus = [[a + "_" for a in s.split()] for s in corpus]
    vocab_from_corpus = {w for sent in tokenized_corpus for w in sent}

    # Create the set of characters required to process the corpus with
    # wordpieces
    corpus_chars = {x for c in set("".join(corpus))
                    for x in [c, c + "_"]}
    escape_chars = "\\_u0987654321;"
    c_caron = "\\269;"
    a_acute = "225"

    words = corpus_chars | set(escape_chars) | vocab_from_corpus

    vocabulary = Vocabulary(list(words) + [c_caron, a_acute])

    cls.preprocessor = WordpiecePreprocessor(vocabulary)
    cls.postprocessor = WordpiecePostprocessor
#!/usr/bin/env python3.5
"""Test init methods of encoders."""

import unittest
import copy

from typing import Dict, List, Any, Iterable

from neuralmonkey.encoders.recurrent import SentenceEncoder
from neuralmonkey.encoders.sentence_cnn_encoder import SentenceCNNEncoder
from neuralmonkey.model.sequence import EmbeddedSequence
from neuralmonkey.vocabulary import Vocabulary

VOCABULARY = Vocabulary(["ich", "bin", "der", "walrus"])
INPUT_SEQUENCE = EmbeddedSequence("seq", VOCABULARY, "marmelade", 300)

SENTENCE_ENCODER_GOOD = {
    "name": ["encoder"],
    "vocabulary": [VOCABULARY],
    "data_id": ["marmelade"],
    "embedding_size": [20],
    "rnn_size": [30],
    "max_input_len": [None, 15],
    "dropout_keep_prob": [0.5, 1.],
}

SENTENCE_ENCODER_BAD = {
    "nonexistent": ["ahoj"],
    "name": [None, 1],
    "vocabulary": [0, None, "ahoj", dict()],
    "data_id": [0, None, VOCABULARY],
#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-

"""Unit tests for the decoder. (Tests only initialization so far.)"""

import unittest
import copy

from neuralmonkey.decoders.decoder import Decoder
from neuralmonkey.vocabulary import Vocabulary

DECODER_PARAMS = dict(
    encoders=[],
    vocabulary=Vocabulary(),
    data_id="foo",
    name="test-decoder",
    max_output_len=5,
    dropout_keep_prob=1.0,
    embedding_size=10,
    rnn_size=10)


class TestDecoder(unittest.TestCase):

    def test_init(self):
        decoder = Decoder(**DECODER_PARAMS)
        self.assertIsNotNone(decoder)

    def test_max_output_len(self):
        dparams = copy.deepcopy(DECODER_PARAMS)
        dparams["max_output_len"] = -10

        with self.assertRaises(ValueError):
            Decoder(**dparams)
def test_init(self):
    decoder = Decoder([], Vocabulary(), "foo", "test-decoder")
    self.assertIsNotNone(decoder)
#!/usr/bin/env python3.5

import unittest

from neuralmonkey.vocabulary import Vocabulary

CORPUS = [
    "the colorless ideas slept furiously",
    "pooh slept all night",
    "working class hero is something to be",
    "I am the working class walrus",
    "walrus for president"
]

TOKENIZED_CORPUS = [s.split(" ") for s in CORPUS]

VOCABULARY = Vocabulary()

for s in TOKENIZED_CORPUS:
    VOCABULARY.add_tokenized_text(s)


class TestVocabulary(unittest.TestCase):

    def test_all_words_in(self):
        for sentence in TOKENIZED_CORPUS:
            for word in sentence:
                self.assertTrue(word in VOCABULARY)

    def test_unknown_word(self):
        self.assertFalse("jindrisek" in VOCABULARY)

    def test_padding(self):
        pass