trigram = words_gen[i: i+2] + [token] prob[token] = KN2.calc_kn_prob(d, trigram, vocab_size, diff_bigrams) prob_sum = sum(prob.values()) #print prob_sum for token in prob: prob[token] = float(prob[token])/prob_sum tok = np.random.choice(prob.keys(), 1, p=prob.values()) if tok not in ["UNK", "~"]: words_gen.append(tok[0]) #print words_gen i += 1 print ' '.join(words_gen[0:-1]) print "\n" gutenberg_path = '/home/kunal/Downloads/IISc-Acads/sem2/NLU/Assignment-1/Data/gutenberg/' trigram_object = ngram.N_gram(3, gutenberg_path, thresh = 5) trigram_object.normalize_train_data() #bigram_object.normalize_test_data() KN2.kneser_ney_setup(trigram_object.train_tokens) gen_random_sentence(KN2.uni.keys())
## Author Kunal Chelani ([email protected])
#
# Builds a 2-gram model with the project-local `ngram.N_gram` class over the
# Gutenberg corpus directory below, runs its train/test preparation and
# perplexity steps in order, and prints the resulting train and test
# perplexities.
#
# NOTE(review): every statement of this script was previously fused onto the
# single physical line that starts with the `##` author comment, so Python
# treated the whole file as one comment and executed nothing.  The statements
# are restored here one per line, in their original order.

import io
import math
import os
import random
import re
import sys

import nltk
import numpy as np
from nltk.tokenize import word_tokenize

import ngram
import prep_data

# Script Starts Here

# NOTE(review): absolute, machine-specific corpus location; adjust it when
# running on another machine.
gutenberg_path = '/home/kunal/Downloads/IISc-Acads/sem2/NLU/Assignment-1/Data/gutenberg/'

# First argument is the n-gram order (2 => bigram).  `thresh` is passed through
# to N_gram unchanged -- presumably a rare-word count cutoff; confirm in ngram.py.
bigram = ngram.N_gram(2, gutenberg_path, thresh=7)

# The calls below keep their original order; calc_all_perplexity() runs last,
# and the two perplexity attributes are only read after it returns.
bigram.normalize_train_data()
bigram.update_ngram_prob()
bigram.normalize_test_data()
bigram.calc_all_perplexity()

# Parenthesised single-argument print emits the same text under Python 2 and
# is also valid syntax under Python 3.
print(bigram.train_perplexity)
print(bigram.test_perplexity)