def test_models_have_correct_lambda_size():
    """Each sub-model keeps exactly one lambda per history in hist_words_dct."""
    language_model = LanguageModel(4)
    tokens = open_file('kn_test.txt')
    language_model.train(tokens)
    for index in range(language_model.n - 2):
        sub_model = language_model.models[index]
        assert len(sub_model.lambdas) == len(sub_model.hist_words_dct)
def test_models_have_correct_n():
    """The sub-model stored at position i should be the (i + 2)-gram model."""
    language_model = LanguageModel(4)
    tokens = open_file('kn_test.txt')
    language_model.train(tokens)
    for index in range(language_model.n - 2):
        assert language_model.models[index].n == index + 2
def test_perplexity_produces_expected_values():
    """Perplexity of a total log-prob of log(0.5) over 2 tokens is sqrt(2)."""
    language_model = LanguageModel(3)
    tokens = open_file('kn_test.txt')
    language_model.train(tokens)
    expected = round(math.sqrt(2), 5)
    actual = round(language_model.perplexity(2, math.log(0.5)), 5)
    assert actual == expected
def test_kn_produces_expected_values():
    """Kneser-Ney log-scores for several trigrams match reference values."""
    language_model = LanguageModel(3)
    language_model.train(open_file('kn_test.txt'))
    expected_scores = {
        ('text', 'shall', 'train'): -2.0770634192748685,
        ('this', 'text', 'dog'): -3.1656313103493887,
        ('the', 'brown', 'cat'): -2.4724841297894433,
    }
    for trigram, score in expected_scores.items():
        assert language_model.kn_evaluate(list(trigram)) == score
def __init__(self):
    """Set up the language model, dictionary lookup state, and search caches."""
    # Language model backing the segmenter.
    self.lm = LanguageModel('RenMinData.txt')
    # Dictionary structures populated by load_dict below.
    self.dict = {}
    self.words = []
    self.max_len_word = 0
    self.load_dict('dict.txt')
    # Search state, built lazily.
    self.graph = None
    self.viterbi_cache = {}
def test_models_have_correct_beginning_grams():
    """Sentence-initial grams are recorded per sub-model after training."""
    language_model = LanguageModel(3)
    language_model.train(open_file('kn_test.txt'))
    first_model, second_model = language_model.models[0], language_model.models[1]
    assert sorted(first_model.beginning_grams) == sorted(['this', 'shall', 'PAD'])
    assert sorted(second_model.beginning_grams) \
        == sorted(['PAD this', 'this text', 'PAD PAD', 'shall train'])
def test_laplace_produces_expected_values():
    """Laplace-smoothed log-probabilities match hand-computed references."""
    language_model = LanguageModel(3)
    language_model.train(open_file('kn_test.txt'))
    seen_score = language_model.laplace_evaluate(['this', 'shall', 'train', 'PAD'])
    assert seen_score == -2.890371757896165
    unseen_score = language_model.laplace_evaluate(['dog', 'text', '.', 'PAD'])
    assert unseen_score == (math.log(1 / 9) + math.log(1 / 2))
def __init__(self):
    """Wire up the component models and default to local execution."""
    super().__init__()
    # Core component models.
    self.model_lm = LanguageModel()
    self.model_ct = ContentTransfer()
    self.kb = KnowledgeBase()
    # The ranker scores candidates using the language model created above.
    self.ranker = Ranker(self.model_lm)
    # Run locally by default.
    self.local = True
def test_train_creates_expected_hist_words_dict():
    """Training a bigram model maps each history to its observed follower words."""
    language_model = LanguageModel(2)
    language_model.train(open_file('kn_test.txt'))
    histories = language_model.models[-1].hist_words_dct
    assert sorted(histories.keys()) \
        == sorted(['PAD', 'this', 'text', 'shall', 'train', '.'])
    single_followers = {
        'this': ['text'],
        'text': ['.'],
        'shall': ['train'],
        'train': ['text'],
        'PAD': ['this'],
    }
    for history, followers in single_followers.items():
        assert list(histories[history].keys()) == followers
    # '.' is followed by both a sentence pad and a new sentence start.
    assert sorted(histories['.'].keys()) == sorted(['PAD', 'shall'])
def test_subsequent_training():
    """A second training pass extends existing counts instead of replacing them."""
    language_model = LanguageModel(2)
    language_model.train(open_file('kn_test.txt'))
    bigram_model = language_model.models[-1]
    word_hists_before = len(bigram_model.word_hists_dct)
    hist_words_before = len(bigram_model.hist_words_dct)
    language_model.train(tokenize('This sample.'))
    bigram_model = language_model.models[-1]
    # Exactly one new entry on each side ('sample').
    assert len(bigram_model.word_hists_dct) - word_hists_before == 1
    assert len(bigram_model.hist_words_dct) - hist_words_before == 1
    assert sorted(bigram_model.word_hists_dct['.'].keys()) \
        == sorted(['text', 'sample'])
    assert sorted(bigram_model.hist_words_dct['this'].keys()) \
        == sorted(['text', 'sample'])
def main():
    """CLI entry point: train or evaluate a LanguageModel per the arguments."""
    parser = get_argparser()
    args = parser.parse_args()
    lm = LanguageModel()
    log_level = logging.DEBUG if args.DEBUG else logging.INFO
    lm.configure_logger(level=log_level, write_file=True)
    if args.train and args.data_path:
        lm.train(args.data_path,
                 output_path=args.train,
                 learning_rate=args.learning_rate,
                 hidden_size=args.hidden_size,
                 batch_size=args.batch_size,
                 max_epoch=args.max_epoch)
    elif args.test and args.data_path:
        lm.predict(args.test, args.data_path)
    else:
        # No usable mode selected: show usage and bail out.
        parser.print_help()
        exit(2)
def test_laplace_produces_expected_values2():
    """Unigram Laplace estimates for one seen and one unseen word."""
    language_model = LanguageModel(1)
    language_model.train(open_file('kn_test.txt'))
    assert language_model.laplace_evaluate(['text']) == math.log(3 / 12)
    assert language_model.laplace_evaluate(['dog']) == math.log(1 / 12)
waited += 1 if waited >= patience: break era_index += 1 era_loss = 0. era_samples = 0 torch.save(checkpoint, os.path.join(save_dir, f"{era_index}_eras.pt")) return checkpoint if __name__ == "__main__": vocab_path = "data/vocab.txt" in_tokens = 2 embedding_size = 128 with open(vocab_path) as r: vocab = list(map(lambda l: l.strip(), r.readlines())) assert len(vocab) == len(set(vocab)) vocab_size = len(vocab) + 1 model = LanguageModel(in_tokens, vocab_size, embedding_size) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) train(model, optimizer, vocab, ["data/parted/0.txt"], ["data/parted/1.txt"], batch_size=32, max_train_eras=100, batches_per_era=100, max_val_batches=10)
# -*- coding: utf-8 -*-
from lm import LanguageModel
from memoize import Memoize

lm = LanguageModel()


def splits(text, max_len=10):
    """Return (head, tail) pairs splitting a first word of 1..max_len chars off *text*."""
    return [(text[:i + 1], text[i + 1:])
            for i in range(min(len(text), max_len))]


@Memoize
def segment(text):
    """Segment *text* into the word list the language model finds most probable.

    Recursion terminates because every candidate head is non-empty, so the
    remaining tail strictly shrinks; Memoize keeps the search polynomial.
    """
    text = text.strip()
    if not text:
        return []
    candidates = [[left] + segment(right) for left, right in splits(text)]
    return max(candidates, key=lm.get_words_prob)


if __name__ == '__main__':
    test = [
        'colorlessgreenideassleepfuriously.',
        'ihaveadream.',
        'howtotrainadragon.',
        'canwetakeaphotoofyou?'
    ]
    for text in test:
        words = segment(text)
        # Bug fix: the original printed the raw input and discarded the
        # computed segmentation; print the result instead.
        print(words)
def test_ngram(self):
    """get_ngrams yields bigrams padded with None at both sequence ends."""
    ngrams = LanguageModel(2).get_ngrams(["hello", "world", "lmao"])
    expected = [(None, 'hello'), ('hello', 'world'),
                ('world', 'lmao'), ('lmao', None)]
    self.assertEqual(ngrams, expected)
def test_discount():
    """A trained model uses the standard 0.75 absolute discount."""
    language_model = LanguageModel(2)
    language_model.train(open_file('kn_test.txt'))
    assert language_model.discount == 0.75
def test_p_next_sums_to_one():
    """The next-word distribution after a history is a proper distribution.

    Bug fix: the original asserted exact float equality (`== 1`). The
    probabilities come from divisions, so their sum can differ from 1.0 by
    rounding error; compare with math.isclose instead to avoid flakiness.
    """
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    total = sum(lm.p_next(['this', 'text']).values())
    assert math.isclose(total, 1.0, rel_tol=0.0, abs_tol=1e-9)
def setUp(self):
    """Build a small trigram model over two three-token sentences."""
    self.lm = LanguageModel(3)
    self.token_sequences = [['the', 'cat', 'runs'],
                            ['the', 'dog', 'runs']]
    self.lm.train(self.token_sequences)
def test_models_have_correct_vocab_size():
    """Each sub-model reports its count of distinct observed n-grams."""
    language_model = LanguageModel(3)
    language_model.train(open_file('kn_test.txt'))
    expected_sizes = {0: 7, 1: 9}
    for index, size in expected_sizes.items():
        assert language_model.models[index].ngram_vocab_size == size
def test_kn_produces_expected_values_n4():
    """4-gram Kneser-Ney score for a seen sequence matches the reference value."""
    language_model = LanguageModel(4)
    language_model.train(open_file('kn_test.txt'))
    score = language_model.kn_evaluate(['shall', 'train', 'text', '.'])
    assert score == -0.7742507185722116
else: source = Reader(options.input) if options.output == '-': writer = sys.stdout else: writer = Writer(options.output) if debug: rules.DEBUG = 1 config = Config(options.config) if logger.level <= logging.INFO: config.write(sys.stderr) lm = LanguageModel(config.lm_file, config.lm_order) rule_table = RuleTable.load(config.rule_table_file, lm, config) extra_feature_funcs = build_extra_feature_funcs(config) recombination_checker = CombinedRecombinationChecker(extra_feature_funcs) decoder = CKYDecoder(config, rule_table, lm, recombination_checker=recombination_checker, extra_feature_funcs=extra_feature_funcs, checking_hypo=checking, expend_loser=expend_loser) logger.info('Start decoding...') def translate(data):
parser.add_argument('--vocab_len', type=float, default=19800, dest='vocab_len') parser.add_argument('--lr', type=float, default=1e-3, dest='lr') parser.add_argument('--minibatch_size', type=int, default=64, dest='minibatch_size') parser.add_argument('--num_epochs', type=int, default=30, dest='num_epochs') parser.add_argument('--models_folder', default='../lm_models', dest='folder') parser.add_argument('--graph_folder', default='../lm_graph', dest='graphs') args = parser.parse_args() # Fit the model if args.mode == 'train': # Read the initial word vectors train_data = np.load(open('lm_train_data.npy','r')) train_labels = np.load(open('lm_train_labels.npy','r')) lm = LanguageModel(args.lr, args.num_steps, args.vocab_len, args.minibatch_size) init = tf.global_variables_initializer() with tf.Session() as sess: sess.run(init) lm.fit(sess, train_data, train_labels, num_epochs=args.num_epochs, folder=args.folder, graph_folder=args.graphs) else: tweets = dill.load(open("tweets", "rb")) w2i = dill.load(open("w2i","rb")) i2w = dill.load(open("i2w","rb")) word_vector = dill.load(open("word_vecs","rb")) start_wd = ["president", "@netanyahu", "democrats", "gop", "congress", "white", "my", "the", "#makeamericagreatagain" ,"republicans", "wall", "@realdonaldtrump", "crooked"] input_list = [np.array([[word_vector[w2i[item]]]]) for item in start_wd] model = LanguageModel(args.lr, args.num_steps, args.vocab_len, args.minibatch_size)
def test_lm_has_correct_number_tokens_and_unigram_types():
    """Training the sample corpus yields 7 tokens across 5 unigram types."""
    language_model = LanguageModel(3)
    language_model.train(open_file('kn_test.txt'))
    assert language_model.num_tokens == 7
    assert len(language_model.unigrams) == 5
def main(args):
    """Dispatch on the docopt-style *args* mapping.

    train      -- ask for n and a corpus path, train, and pickle the model
    generate   -- load the pickled model, print one sentence, save several
    perplexity -- load the pickled model and score a test file
    common     -- load the pickled model and list its most common n-grams
    """
    if args['train']:
        _train(args)
    if args['generate']:
        _generate(args)
    if args['perplexity']:
        _perplexity(args)
    if args['common']:
        _common(args)


def _train(args):
    """Train an n-gram model from a text file and pickle it to disk."""
    ngram = args['--n']
    if not ngram:
        # Fall back to the documented default when the reply is empty.
        ngram = input("Please enter n for n-gram (Default: 3)-\n") or 3
    lm = LanguageModel(int(ngram))
    path = args['--path'] or input("Please enter path of the file-\n")
    lm.train(readFile(path))
    print("N-gram training completed")
    print("Saving the model")
    # Bug fix: use a context manager so the file handle is closed even if
    # pickling raises (the original used bare open/close).
    with open('trained_model_ngram.pkl', 'wb') as f:
        pickle.dump(lm, f)
    print("Model saved")


def _generate(args):
    """Print one generated sentence and save a batch of them to a file."""
    lm = loadPickle()
    if click.confirm('Do you want to generate with Beam search?', default=True):
        lm.beam_flag = True
        beam_size = input("Enter beam size (Default: 20)-\n")
        # Bug fix: the original condition was inverted -- it stored the
        # *empty* reply into beam_width and discarded a real answer. Apply
        # the user's size only when one was entered; otherwise keep the
        # model's default of 20.
        if beam_size:
            lm.beam_width = int(beam_size)
    else:
        lm.beam_flag = False
    print("Generating one sentence in terminal...")
    print(detokenize(lm.generate()))
    count = args['--lines']
    if not count:
        count = input(
            "Enter number of generated text you want to save (Default: 10)-\n"
        ) or 10
    generated = [detokenize(lm.generate()) for _ in range(int(count))]
    with open('new_shakespeare.txt', 'w') as f:
        for sentence in generated:
            f.write("%s\n" % sentence)
    print("Sentence file generated in current folder")


def _perplexity(args):
    """Report the pickled model's perplexity on a test file."""
    lm = loadPickle()
    path = args['--path'] or input("Please enter path of the test file-\n")
    print("Perplexity for {}-gram is {}".format(lm.ngram,
                                                lm.perplexity(readFile(path))))


def _common(args):
    """Print the most common n-grams with their occurrence counts."""
    lm = loadPickle()
    number = args['--number'] if args['--number'] else 5
    lm.count_common_ngram(int(number))
import pickle

from lm import LanguageModel

train_filename = "train_sequence.pkl"
model_filename = "model.pkl"

# Bug fix: the original passed inline open() results to pickle.load/dump,
# leaking both file handles; context managers close them deterministically.
with open(train_filename, "rb") as f:
    dataset = pickle.load(f)

lm = LanguageModel(lidstone_param=3e-4)
lm.fit(dataset)

with open(model_filename, "wb") as f:
    pickle.dump(lm, f)