# --- Broca's LM small demo: build, train and probe a tiny language model ---

# Work directory that holds model backups written during training.
work_dir = os.path.join(os.path.expanduser('~'), 'brocas_models')
lm_file = os.path.join(work_dir, 'test_model.bin')
if not os.path.exists(work_dir):
    os.makedirs(work_dir)

# Text corpus: a small slice of the NLTK Brown corpus.
nltk.download('brown')
sents = nltk.corpus.brown.sents()[:100]

# Preprocessing -- presumably collapses words seen fewer than min_count
# times into special tags; see the Normalization docs (TODO confirm).
normalizer = Normalization(sents, min_count=15)
training_data = NormalizationIter(normalizer, sents)

# Small network for the toy corpus.
lm = LanguageModel(tokenized_sentences=training_data,
                   input_layer_size=64,
                   hidden_layer_size=128)
print()

# Train the model, writing periodic backups into work_dir.
lm.train(training_data, epochs=5, backup_directory=work_dir, log_interval=20)
print()

# Probe the trained model: normalize one corpus sentence and score it.
normalized_sentence = normalizer.normalize(sents[0])
print('normalized sentence:')
print(' '.join(normalized_sentence))
print('probability: ', lm.sentence_log_probability(normalized_sentence))
print()

# Sentence-boundary tags produced by the normalizer.
start_tag = normalized_sentence[0]
end_tag = normalized_sentence[-1]
# --- Broca's LM demo script: create, train and sample a language model ---
#
# NOTE(review): the original snippet used `os` and `nltk` without importing
# them (NameError at runtime); the imports below fix that.  `Normalization`
# and `NormalizationIter` are also used un-imported -- they presumably live
# in the brocas_lm package; TODO confirm their module path and import them.
import os

import nltk

from brocas_lm.model import LanguageModel

# Work directory that holds model backups written during training.
# exist_ok=True replaces the racy exists()/makedirs() check.
work_dir = os.path.join(os.path.expanduser('~'), 'brocas_models')
lm_file = os.path.join(work_dir, 'test_model.bin')
os.makedirs(work_dir, exist_ok=True)

# Text corpus: a small slice of the NLTK Brown corpus.
nltk.download('brown')
sents = nltk.corpus.brown.sents()[:100]

# Preprocessing -- presumably collapses words seen fewer than min_count
# times into special tags; see the Normalization docs (TODO confirm).
normalizer = Normalization(sents, min_count=15)
training_data = NormalizationIter(normalizer, sents)

# Small network for the toy corpus.
lm = LanguageModel(tokenized_sentences=training_data,
                   input_layer_size=64,
                   hidden_layer_size=128)
print()

# Train the model, writing periodic backups into work_dir.
lm.train(training_data, epochs=5, backup_directory=work_dir, log_interval=20)
print()

# Probe the trained model: normalize one corpus sentence and score it.
normalized_sentence = normalizer.normalize(sents[0])
print('normalized sentence:')
print(' '.join(normalized_sentence))
print('probability: ', lm.sentence_log_probability(normalized_sentence))
print()

# Sentence-boundary tags produced by the normalizer, used for sampling.
start_tag = normalized_sentence[0]
end_tag = normalized_sentence[-1]
print('sample:')
# NOTE(review): truncated fragment -- this line begins mid-call: the
# keyword arguments below belong to a constructor (presumably
# `normalizer = Normalization(all_sents, ...)`) whose head is not visible
# in this chunk, and the whole statement sequence has been collapsed onto
# one physical line, so it is not valid Python as-is.  Left byte-identical;
# restore the call head and line breaks before running.  What it appears
# to do: build a normalized corpus, split it on three confusion sets
# (than/then, except/accept, well/good), then load a saved LanguageModel
# from lm_file if present, otherwise train a larger model (128/512 layers,
# 10 epochs) and save it -- TODO confirm against the brocas_lm docs.
min_count=20, start_tag='S', end_tag='E', unknown_tag='U', digit_tag='D') all_sents_normalized = NormalizationIter(normalizer, all_sents) cs1 = ['than', 'then'] cs2 = ['except', 'accept'] cs3 = ['well', 'good'] acs = AdvancedCorpusSplitter(all_sents_normalized, cs1 + cs2 + cs3) if os.path.isfile(lm_file): lm = LanguageModel(lm_file=lm_file) else: lm = LanguageModel(verbose=True, tokenized_sentences=acs, input_layer_size=128, hidden_layer_size=512) cost_log = lm.train(acs, epochs=10, backup_directory=work_dir, return_cost=True, log_interval=1000) lm.save(lm_file) # sampling