import tensorflow as tf

from nalp.corpus import TextCorpus
from nalp.datasets import LanguageModelingDataset
from nalp.encoders import IntegerEncoder
from nalp.models.generators import LSTMGenerator

# Creating a character TextCorpus from file
corpus = TextCorpus(from_file='data/text/chapter1_harry.txt', corpus_type='char')

# Creating an IntegerEncoder
encoder = IntegerEncoder()

# Learns the encoding based on the TextCorpus dictionary and reverse dictionary
encoder.learn(corpus.vocab_index, corpus.index_vocab)

# Applies the encoding on new data
encoded_tokens = encoder.encode(corpus.tokens)

# Creating Language Modeling Dataset
dataset = LanguageModelingDataset(encoded_tokens, max_contiguous_pad_length=10, batch_size=64)

# Creating the LSTM
lstm = LSTMGenerator(encoder=encoder, vocab_size=corpus.vocab_size, embedding_size=256, hidden_size=512)

# As NALP's LSTMs are stateful, we need to build them with a fixed batch size
lstm.build((64, None))

# Compiling the LSTM
lstm.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001),
             loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
             metrics=[tf.metrics.SparseCategoricalAccuracy(name='accuracy')])

# Fitting the LSTM
lstm.fit(dataset.batches, epochs=100)
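Once training finishes, the usual next step is to sample artificial text from the trained model. The sketch below is an assumption, not a confirmed API: depending on the NALP release, the generator exposes this as generate_text or generate_temperature_sampling, so check the installed version before relying on the exact name and parameters.

# Assumed sampling call (verify against your NALP version): seeds the
# stateful LSTM with a start string and samples new characters one at a time
text = lstm.generate_text(start='Mr.', length=100, temperature=0.5)

# Depending on the version, the output may be a string or a list of tokens
print(text)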
from nalp.corpus import TextCorpus
from nalp.datasets import LanguageModelingDataset
from nalp.encoders import IntegerEncoder

# Creating a character TextCorpus from file
corpus = TextCorpus(from_file='data/text/chapter1_harry.txt', corpus_type='char')

# Creating an IntegerEncoder, learning encoding and encoding tokens
encoder = IntegerEncoder()
encoder.learn(corpus.vocab_index, corpus.index_vocab)
encoded_tokens = encoder.encode(corpus.tokens)

# Creating Language Modeling Dataset
dataset = LanguageModelingDataset(encoded_tokens, max_contiguous_pad_length=10, batch_size=1, shuffle=True)

# Iterating over one batch
for input_batch, target_batch in dataset.batches.take(1):
    # For every input and target inside the batch
    for x, y in zip(input_batch, target_batch):
        # Transforms the tensor to numpy and decodes it
        print(encoder.decode(x.numpy()), encoder.decode(y.numpy()))
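The printed pairs make the language modeling setup visible: each target sequence is the input sequence shifted one step ahead, so every position predicts the next token. A minimal plain-Python sketch of that shift, with illustrative token values only:

# Illustrative encoded tokens; LanguageModelingDataset builds (input, target)
# pairs where target[i] is the token that follows input[i]
tokens = [7, 2, 9, 4, 1]
inputs, targets = tokens[:-1], tokens[1:]
print(inputs)   # [7, 2, 9, 4]
print(targets)  # [2, 9, 4, 1]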
import tensorflow as tf

from nalp.corpus import AudioCorpus
from nalp.datasets import LanguageModelingDataset
from nalp.encoders import IntegerEncoder
from nalp.models.generators import RNNGenerator

# Creating an AudioCorpus from file
corpus = AudioCorpus(from_file='data/audio/sample.mid')

# Creating an IntegerEncoder, learning encoding and encoding tokens
encoder = IntegerEncoder()
encoder.learn(corpus.vocab_index, corpus.index_vocab)
encoded_tokens = encoder.encode(corpus.tokens)

# Creating Language Modeling Dataset
dataset = LanguageModelingDataset(encoded_tokens, max_contiguous_pad_length=100, batch_size=64)

# Creating the RNN
rnn = RNNGenerator(encoder=encoder, vocab_size=corpus.vocab_size, embedding_size=256, hidden_size=512)

# As NALP's RNNs are stateful, we need to build them with a fixed batch size
rnn.build((64, None))

# Compiling the RNN
rnn.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001),
            loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=[tf.metrics.SparseCategoricalAccuracy(name='accuracy')])

# Fitting the RNN
rnn.fit(dataset.batches, epochs=100)
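After training, the model can be checkpointed for later reuse. Since NALP generators subclass tf.keras.Model, the standard Keras weight-saving API applies; the checkpoint path below is a hypothetical placeholder:

# Saving and restoring weights via the inherited tf.keras.Model API;
# 'trained/audio_rnn' is a hypothetical checkpoint path
rnn.save_weights('trained/audio_rnn', save_format='tf')
rnn.load_weights('trained/audio_rnn')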