Example #1
import os

import nltk

# Normalization and NormalizationIter are assumed to live in
# brocas_lm.model alongside LanguageModel (see Example #2)
from brocas_lm.model import LanguageModel, Normalization, NormalizationIter

# create work dir
work_dir = os.path.join(os.path.expanduser('~'), 'brocas_models')
lm_file = os.path.join(work_dir, 'test_model.bin')
if not os.path.exists(work_dir):
    os.makedirs(work_dir)

# get text corpus
nltk.download('brown')
sents = nltk.corpus.brown.sents()[:100]

# preprocessing
normalizer = Normalization(sents, min_count=15)
training_data = NormalizationIter(normalizer, sents)
lm = LanguageModel(tokenized_sentences=training_data,
                   input_layer_size=64,
                   hidden_layer_size=128)
print()

# train model
lm.train(training_data, epochs=5, backup_directory=work_dir, log_interval=20)
print()

# test trained model
normalized_sentence = normalizer.normalize(sents[0])
print('normalized sentence:')
print(' '.join(normalized_sentence))
print('log probability:', lm.sentence_log_probability(normalized_sentence))
print()
start_tag = normalized_sentence[0]
end_tag = normalized_sentence[-1]
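
The example above defines lm_file but never uses it. A minimal follow-up sketch, not part of the original: persist the trained model and restore it from disk, using the save method and the lm_file constructor argument that Example #3 relies on.

# save the trained model, then load it back from disk
lm.save(lm_file)
lm = LanguageModel(lm_file=lm_file)
print('log probability after reload:', lm.sentence_log_probability(normalized_sentence))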
Example #2
import os

import nltk

# Normalization and NormalizationIter are assumed to live in
# brocas_lm.model alongside LanguageModel
from brocas_lm.model import LanguageModel, Normalization, NormalizationIter

# create work dir
work_dir = os.path.join(os.path.expanduser('~'), 'brocas_models')
lm_file = os.path.join(work_dir, 'test_model.bin')
if not os.path.exists(work_dir):
    os.makedirs(work_dir)

# get text corpus
nltk.download('brown')
sents = nltk.corpus.brown.sents()[:100]

# preprocessing
normalizer = Normalization(sents, min_count=15)
training_data = NormalizationIter(normalizer, sents)
lm = LanguageModel(tokenized_sentences=training_data,
                   input_layer_size=64,
                   hidden_layer_size=128)
print()

# train model
lm.train(training_data, epochs=5, backup_directory=work_dir, log_interval=20)
print()

# test trained model
normalized_sentence = normalizer.normalize(sents[0])
print('normalized sentence:')
print(' '.join(normalized_sentence))
print('log probability:', lm.sentence_log_probability(normalized_sentence))
print()
start_tag = normalized_sentence[0]
end_tag = normalized_sentence[-1]
print('sample:')
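
Example #2 breaks off right before the sampling output, so the sampling API is not shown here. As a hedged stand-in that uses only the calls demonstrated above, one can check that the trained model prefers the original word order over a shuffled one:

import random

# shuffle the words between the start/end tags, re-wrap, and score;
# illustration only, not the original sampling code
shuffled = normalized_sentence[1:-1]
random.shuffle(shuffled)
shuffled = [start_tag] + shuffled + [end_tag]
print('log probability (shuffled):', lm.sentence_log_probability(shuffled))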
Example #3
import os

import nltk

# the helper classes are assumed to live in brocas_lm.model
# alongside LanguageModel, as in Example #2
from brocas_lm.model import (AdvancedCorpusSplitter, LanguageModel,
                             Normalization, NormalizationIter)

# create work dir (as in the previous examples)
work_dir = os.path.join(os.path.expanduser('~'), 'brocas_models')
lm_file = os.path.join(work_dir, 'test_model.bin')
if not os.path.exists(work_dir):
    os.makedirs(work_dir)

# get text corpus; assumed here: the full Brown corpus
nltk.download('brown')
all_sents = nltk.corpus.brown.sents()

# preprocessing (the opening of this call was truncated in the
# original; reconstructed from the usage below)
normalizer = Normalization(all_sents,
                           min_count=20,
                           start_tag='S',
                           end_tag='E',
                           unknown_tag='U',
                           digit_tag='D')

all_sents_normalized = NormalizationIter(normalizer, all_sents)

cs1 = ['than', 'then']
cs2 = ['except', 'accept']
cs3 = ['well', 'good']

acs = AdvancedCorpusSplitter(all_sents_normalized, cs1 + cs2 + cs3)

# load a previously saved model if one exists, otherwise train from scratch
if os.path.isfile(lm_file):
    lm = LanguageModel(lm_file=lm_file)
else:
    lm = LanguageModel(verbose=True,
                       tokenized_sentences=acs,
                       input_layer_size=128,
                       hidden_layer_size=512)

    cost_log = lm.train(acs,
                        epochs=10,
                        backup_directory=work_dir,
                        return_cost=True,
                        log_interval=1000)
    lm.save(lm_file)
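
# Hedged illustration, not part of the original example: the confusion
# sets above suggest scoring candidate sentences against each other,
# e.g. choosing between 'than' and 'then'; the test sentence is made up.
tokens = ['he', 'is', 'taller', 'than', 'me']
for candidate_word in cs1:
    candidate = [candidate_word if w == 'than' else w for w in tokens]
    score = lm.sentence_log_probability(normalizer.normalize(candidate))
    print(candidate_word, score)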

# sampling