import tensorflow as tf

from nalp.corpus import TextCorpus
from nalp.datasets import LanguageModelingDataset
from nalp.encoders import IntegerEncoder
from nalp.models.generators import LSTMGenerator

# Creating a character TextCorpus from file
corpus = TextCorpus(from_file='data/text/chapter1_harry.txt', corpus_type='char')

# Creating an IntegerEncoder
encoder = IntegerEncoder()

# Learns the encoding based on the TextCorpus dictionary and reverse dictionary
encoder.learn(corpus.vocab_index, corpus.index_vocab)

# Applies the encoding on new data
encoded_tokens = encoder.encode(corpus.tokens)

# Creating Language Modeling Dataset
dataset = LanguageModelingDataset(encoded_tokens, max_contiguous_pad_length=10, batch_size=64)

# Creating the LSTM
lstm = LSTMGenerator(encoder=encoder, vocab_size=corpus.vocab_size, embedding_size=256, hidden_size=512)

# As NALP's LSTMs are stateful, we need to build them with a fixed batch size
lstm.build((64, None))

# Compiling the LSTM
lstm.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001),
             loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
             metrics=[tf.metrics.SparseCategoricalAccuracy(name='accuracy')])

# Fitting the LSTM
lstm.fit(dataset.batches, epochs=100)
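Once training finishes, the usual next step is to sample artificial text from the trained model. The sketch below is an assumption, not a confirmed API: depending on the NALP release, the generator exposes this as generate_text or generate_temperature_sampling, so check the installed version before relying on the exact name and parameters.

# Assumed sampling call (verify against your NALP version): seeds the
# stateful LSTM with a start string and samples new characters one at a time
text = lstm.generate_text(start='Mr.', length=100, temperature=0.5)

# Depending on the version, the output may be a string or a list of tokens
print(text)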
from nalp.corpus import TextCorpus
from nalp.datasets import LanguageModelingDataset
from nalp.encoders import IntegerEncoder

# Creating a character TextCorpus from file
corpus = TextCorpus(from_file='data/text/chapter1_harry.txt', corpus_type='char')

# Creating an IntegerEncoder, learning encoding and encoding tokens
encoder = IntegerEncoder()
encoder.learn(corpus.vocab_index, corpus.index_vocab)
encoded_tokens = encoder.encode(corpus.tokens)

# Creating Language Modeling Dataset
dataset = LanguageModelingDataset(encoded_tokens, max_contiguous_pad_length=10, batch_size=1, shuffle=True)

# Iterating over one batch
for input_batch, target_batch in dataset.batches.take(1):
    # For every input and target inside the batch
    for x, y in zip(input_batch, target_batch):
        # Transforms the tensor to numpy and decodes it
        print(encoder.decode(x.numpy()), encoder.decode(y.numpy()))
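The printed pairs make the language modeling setup visible: each target sequence is the input sequence shifted one step ahead, so every position predicts the next token. A minimal plain-Python sketch of that shift, with illustrative token values only:

# Illustrative encoded tokens; LanguageModelingDataset builds (input, target)
# pairs where target[i] is the token that follows input[i]
tokens = [7, 2, 9, 4, 1]
inputs, targets = tokens[:-1], tokens[1:]
print(inputs)   # [7, 2, 9, 4]
print(targets)  # [2, 9, 4, 1]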
import tensorflow as tf

from nalp.corpus import AudioCorpus
from nalp.datasets import LanguageModelingDataset
from nalp.encoders import IntegerEncoder
from nalp.models.generators import RNNGenerator

# Creating an AudioCorpus from file
corpus = AudioCorpus(from_file='data/audio/sample.mid')

# Creating an IntegerEncoder, learning encoding and encoding tokens
encoder = IntegerEncoder()
encoder.learn(corpus.vocab_index, corpus.index_vocab)
encoded_tokens = encoder.encode(corpus.tokens)

# Creating Language Modeling Dataset
dataset = LanguageModelingDataset(encoded_tokens, max_contiguous_pad_length=100, batch_size=64)

# Creating the RNN
rnn = RNNGenerator(encoder=encoder, vocab_size=corpus.vocab_size, embedding_size=256, hidden_size=512)

# As NALP's RNNs are stateful, we need to build them with a fixed batch size
rnn.build((64, None))

# Compiling the RNN
rnn.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001),
            loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=[tf.metrics.SparseCategoricalAccuracy(name='accuracy')])

# Fitting the RNN
rnn.fit(dataset.batches, epochs=100)
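After training, the model can be checkpointed for later reuse. Since NALP generators subclass tf.keras.Model, the standard Keras weight-saving API applies; the checkpoint path below is a hypothetical placeholder:

# Saving and restoring weights via the inherited tf.keras.Model API;
# 'trained/audio_rnn' is a hypothetical checkpoint path
rnn.save_weights('trained/audio_rnn', save_format='tf')
rnn.load_weights('trained/audio_rnn')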