예제 #1
0
def test_models_have_correct_lambda_size():
    """Each sub-model's lambda table has one entry per stored history."""
    model = LanguageModel(4)
    model.train(open_file('kn_test.txt'))
    for sub_model in model.models[:model.n - 2]:
        assert len(sub_model.lambdas) == len(sub_model.hist_words_dct)
예제 #2
0
def test_kn_produces_expected_values():
    """Kneser-Ney log-probabilities match previously recorded values."""
    model = LanguageModel(3)
    model.train(open_file('kn_test.txt'))
    expected_scores = {
        ('text', 'shall', 'train'): -2.0770634192748685,
        ('this', 'text', 'dog'): -3.1656313103493887,
        ('the', 'brown', 'cat'): -2.4724841297894433,
    }
    for trigram, score in expected_scores.items():
        assert model.kn_evaluate(list(trigram)) == score
예제 #3
0
def test_models_have_correct_n():
    """Sub-model at index i is an (i + 2)-gram model."""
    model = LanguageModel(4)
    model.train(open_file('kn_test.txt'))
    for index, sub_model in enumerate(model.models[:model.n - 2]):
        assert sub_model.n == index + 2
예제 #4
0
def test_perplexity_produces_expected_values():
    """Perplexity of log(0.5) over 2 tokens equals sqrt(2)."""
    model = LanguageModel(3)
    model.train(open_file('kn_test.txt'))
    expected = round(math.sqrt(2), 5)
    actual = round(model.perplexity(2, math.log(0.5)), 5)
    assert actual == expected
예제 #5
0
def test_models_have_correct_beginning_grams():
    """Sentence-initial unigrams and bigrams are recorded for each model."""
    model = LanguageModel(3)
    model.train(open_file('kn_test.txt'))
    unigram_starts = ['this', 'shall', 'PAD']
    bigram_starts = ['PAD this', 'this text', 'PAD PAD', 'shall train']
    assert sorted(model.models[0].beginning_grams) == sorted(unigram_starts)
    assert sorted(model.models[1].beginning_grams) == sorted(bigram_starts)
예제 #6
0
def test_laplace_produces_expected_values():
    """Laplace-smoothed scores match recorded values for seen/unseen grams."""
    model = LanguageModel(3)
    model.train(open_file('kn_test.txt'))
    seen_score = model.laplace_evaluate(['this', 'shall', 'train', 'PAD'])
    assert seen_score == -2.890371757896165
    unseen_score = model.laplace_evaluate(['dog', 'text', '.', 'PAD'])
    assert unseen_score == math.log(1 / 9) + math.log(1 / 2)
예제 #7
0
def test_train_creates_expected_hist_words_dict():
    """Training a bigram model builds the expected history -> words mapping."""
    model = LanguageModel(2)
    model.train(open_file('kn_test.txt'))
    hist_words = model.models[-1].hist_words_dct
    assert sorted(hist_words) \
     == sorted(['PAD', 'this', 'text', 'shall', 'train', '.'])
    # Histories with exactly one observed follower.
    sole_followers = {
        'this': 'text',
        'text': '.',
        'shall': 'train',
        'train': 'text',
        'PAD': 'this',
    }
    for history, follower in sole_followers.items():
        assert list(hist_words[history].keys()) == [follower]
    # The period history has two followers.
    assert sorted(hist_words['.'].keys()) == sorted(['PAD', 'shall'])
예제 #8
0
def test_subsequent_training():
    """A second training pass extends, rather than replaces, the counts."""
    model = LanguageModel(2)
    model.train(open_file('kn_test.txt'))
    bigram_model = model.models[-1]
    wh_before = len(bigram_model.word_hists_dct)
    hw_before = len(bigram_model.hist_words_dct)

    model.train(tokenize('This sample.'))
    bigram_model = model.models[-1]
    wh_after = len(bigram_model.word_hists_dct)
    hw_after = len(bigram_model.hist_words_dct)

    # Exactly one new entry in each direction ('sample').
    assert wh_after - wh_before == 1
    assert hw_after - hw_before == 1
    assert sorted(bigram_model.word_hists_dct['.']) \
     == sorted(['text', 'sample'])
    assert sorted(bigram_model.hist_words_dct['this']) \
     == sorted(['text', 'sample'])
예제 #9
0
def main():
    """Entry point: parse CLI arguments and dispatch to train or predict.

    Requires --data_path together with either a train or a test action;
    otherwise prints usage and exits with status 2.
    """
    p = get_argparser()
    args = p.parse_args()

    lm = LanguageModel()
    lm.configure_logger(level=logging.DEBUG if args.DEBUG else logging.INFO,
                        write_file=True)

    if args.train and args.data_path:
        lm.train(args.data_path,
                 output_path=args.train,
                 learning_rate=args.learning_rate,
                 hidden_size=args.hidden_size,
                 batch_size=args.batch_size,
                 max_epoch=args.max_epoch)

    elif args.test and args.data_path:
        lm.predict(args.test, args.data_path)

    else:
        # No actionable argument combination: show usage and exit non-zero.
        p.print_help()
        # raise SystemExit instead of the site-module exit() helper, which is
        # intended for interactive sessions and not guaranteed to exist.
        raise SystemExit(2)
예제 #10
0
def main(args):
    """
    Main function of the program operates based on the argument provided.

    Train
        - Ask for ngram
        - Ask for training file path
        - Train language model
        - Save the trained model

    Generate
        - Load the saved model from pickle file
        - Ask for a beam search (y/n)
            - Ask Beam length
        - Print one generated sentence in terminal
        - Ask for number of sentences to be generated on file
        - Save the input number of sentences in a file (Default: new_shakespeare.txt)

    Perplexity
        - Load Pickle file
        - Ask the test set file path
        - Print perplexity value

    Common
        - Load pickle
        - Ask number of most common ngram
        - Print the most common ngram with their occurence number.

    """
    if args['train']:
        if not args['--n']:
            ngram = input("Please enter n for n-gram (Default: 3)-\n")
            if not ngram:
                ngram = 3
        else:
            ngram = args['--n']
        lm = LanguageModel(int(ngram))

        if not args['--path']:
            path = input("Please enter path of the file-\n")
        else:
            path = args['--path']
        lm.train(readFile(path))
        print("N-gram training completed")
        print("Saving the model")
        # Context manager guarantees the handle is closed even if dump fails.
        with open('trained_model_ngram.pkl', 'wb') as f:
            pickle.dump(lm, f)
        print("Model saved")

    if args['generate']:
        lm = loadPickle()

        if click.confirm('Do you want to generate with Beam search?', default=True):
            lm.beam_flag = True
            beam_size = input("Enter beam size (Default: 20)-\n")
            # BUG FIX: the original condition was inverted ("if not beam_size"),
            # which assigned the empty string as beam_width when the user typed
            # nothing and silently ignored any size the user actually entered.
            # Apply the user's answer only when one was given; otherwise keep
            # the model's default width.
            if beam_size:
                lm.beam_width = int(beam_size)
        else:
            lm.beam_flag = False
        print("Generating one sentence in terminal...")
        print(detokenize(lm.generate()))
        if not args['--lines']:
            noOfText = input("Enter number of generated text you want to save (Default: 10)-\n")
            if not noOfText:
                noOfText = 10
        else:
            noOfText = args['--lines']
        generated = [detokenize(lm.generate()) for _ in range(int(noOfText))]

        with open('new_shakespeare.txt', 'w') as f:
            for sentence in generated:
                f.write("%s\n" % sentence)
        print("Sentence file generated in current folder")

    if args['perplexity']:
        lm = loadPickle()
        if not args['--path']:
            path = input("Please enter path of the test file-\n")
        else:
            path = args['--path']
        print("Perplexity for {}-gram is {}".format(lm.ngram, lm.perplexity(readFile(path))))

    if args['common']:
        lm = loadPickle()
        if args['--number']:
            number = args['--number']
        else:
            number = 5
        lm.count_common_ngram(int(number))
예제 #11
0
class LanguageModelTests(unittest.TestCase):
    """Unit tests for LanguageModel: n-gram extraction, training counts,
    probability normalization, next-word prediction, and sampling."""

    @classmethod
    def setUpClass(cls):
        # BUG FIX: the original string was "\LanguageModelTests" — "\L" is an
        # invalid escape sequence; a leading newline ("\n") was intended.
        print("\nLanguageModelTests starts")
        print("==========")

    @classmethod
    def tearDownClass(cls):
        print("==========")
        print("LanguageModelTests has ended")

    def setUp(self):
        # Fresh trigram model trained on two tiny sentences for every test.
        self.lm = LanguageModel(3)
        self.token_sequences = [['the', 'cat', 'runs'], ['the', 'dog', 'runs']]
        self.lm.train(self.token_sequences)

    def test_get_ngrams(self):
        """Token list is padded with None and split into overlapping n-grams."""
        print("id: " + self.id())
        self.lm.n = 4
        input_tokens = ['the', 'cat', 'in', 'the', 'hat']
        result_ngrams = [
            (None, None, None, 'the'), (None, None, 'the', 'cat'),
            (None, 'the', 'cat', 'in'), ('the', 'cat', 'in', 'the'),
            ('cat', 'in', 'the', 'hat'), ('in', 'the', 'hat', None),
            ('the', 'hat', None, None), ('hat', None, None, None)
        ]
        self.assertEqual(self.lm.get_ngrams(input_tokens), result_ngrams)

    def test_train_vocabulary_and_counts(self):
        """Training records the vocabulary and per-history follower counts."""
        print("id: " + self.id())
        self.assertEqual(self.lm.vocabulary,
                         {None, 'the', 'cat', 'runs', 'dog'})

        result_counts = {
            (None, None): {
                'the': 2
            },
            (None, 'the'): {
                'cat': 1,
                'dog': 1
            },
            ('the', 'cat'): {
                'runs': 1
            },
            ('cat', 'runs'): {
                None: 1
            },
            ('runs', None): {
                None: 2
            },
            ('the', 'dog'): {
                'runs': 1
            },
            ('dog', 'runs'): {
                None: 1
            }
        }
        self.assertEqual(self.lm.counts, result_counts)

    def test_normalize(self):
        """Raw counts are converted to relative frequencies."""
        print("id: " + self.id())
        input_words = {'cat': 1, 'dog': 1}
        result_probabilities = {'cat': 0.5, 'dog': 0.5}
        self.assertEqual(self.lm.normalize(input_words), result_probabilities)

    def test_normalize_sum_probabilies(self):
        """Normalized probabilities sum to exactly 1."""
        print("id: " + self.id())
        input_words = {'cat': 1, 'dog': 1}
        probabilities = self.lm.normalize(input_words)

        prob_sum = 0
        for key in probabilities:
            prob_sum += probabilities[key]
        self.assertEqual(prob_sum, 1)

    def test_predict_next(self):
        """p_next conditions on the last n-1 tokens of the context."""
        print("id: " + self.id())
        input_tokens = [None, "zero", None, 'the', 'dog']
        result_probabilities = {'runs': 1}
        self.assertEqual(self.lm.p_next(input_tokens), result_probabilities)

    def test_sample(self):
        """sample returns a word drawn from the given distribution."""
        print("id: " + self.id())
        input_probability_distribution = {'heads': 0.5, 'tails': 0.5}
        predicted_word = self.lm.sample(input_probability_distribution)[0]
        self.assertIn(predicted_word, input_probability_distribution)
예제 #12
0
def test_p_next_sums_to_one():
    """The next-word distribution is a proper probability distribution."""
    model = LanguageModel(3)
    model.train(open_file('kn_test.txt'))
    distribution = model.p_next(['this', 'text'])
    assert sum(distribution.values()) == 1
예제 #13
0
def test_kn_produces_expected_values_n4():
    """Kneser-Ney scoring with a 4-gram model matches the recorded value."""
    model = LanguageModel(4)
    model.train(open_file('kn_test.txt'))
    quadgram = ['shall', 'train', 'text', '.']
    assert model.kn_evaluate(quadgram) == -0.7742507185722116
예제 #14
0
def test_models_have_correct_vocab_size():
    """Each sub-model reports the expected n-gram vocabulary size."""
    model = LanguageModel(3)
    model.train(open_file('kn_test.txt'))
    expected_sizes = {0: 7, 1: 9}
    for index, size in expected_sizes.items():
        assert model.models[index].ngram_vocab_size == size
예제 #15
0
def test_laplace_produces_expected_values2():
    """Unigram Laplace scores for a seen and an unseen word."""
    unigram_model = LanguageModel(1)
    unigram_model.train(open_file('kn_test.txt'))
    assert unigram_model.laplace_evaluate(['text']) == math.log(3 / 12)
    assert unigram_model.laplace_evaluate(['dog']) == math.log(1 / 12)
예제 #16
0
def test_discount():
    """Training sets the Kneser-Ney discount to the expected constant."""
    bigram_model = LanguageModel(2)
    bigram_model.train(open_file('kn_test.txt'))
    assert bigram_model.discount == 0.75
예제 #17
0
def test_lm_has_correct_number_tokens_and_unigram_types():
    """Token count and unigram type count match the training corpus."""
    model = LanguageModel(3)
    model.train(open_file('kn_test.txt'))
    assert model.num_tokens == 7
    assert len(model.unigrams) == 5