# Bigram probability query.
#
# Loads the bigram data from the file named by argv[1] and the unigram data
# from the file named by argv[2], builds an Ngram model from them, and prints
# Pr(y|x) for the word pair x = argv[3], y = argv[4].
#
# NOTE(review): `smooth_method` and `smooth_index` are read below but are not
# assigned anywhere in this section -- confirm they are defined earlier in the
# file (presumably parsed from the command line).

# load bigram; the handle is closed as soon as loading finishes
# (assumes load_bigram consumes the file before returning -- TODO confirm)
with open(argv[1]) as bigram_file:
    bigrams = load_bigram(bigram_file)

# load unigram (same assumption about load_unigram as above)
with open(argv[2]) as unigram_file:
    uni = load_unigram(unigram_file)
words = uni['words']
total_tokens = uni['total_tokens']

training_gram = Ngram(total_tokens, words, bigrams)

# load x, y
x = argv[3]
y = argv[4]

# exit if x is non-existent in training
if x not in words:
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print('We are incredibly sorry, but the word you requested was not found in the training set')
    exit()

if (x, y) in bigrams:
    # bigram has been seen in training
    prob = training_gram.get_prob(x, y, smooth_index)
elif smooth_method == 'M':
    # bigram (x, y) has not been seen: pick the estimator by smoothing code
    prob = training_gram.mle(x, y)
elif smooth_method == 'L':
    prob = training_gram.laplace_bigram(x, y)
elif smooth_method == 'I':
    # 0.3 is passed as the third argument (presumably the interpolation
    # weight -- verify against Ngram.interpolation)
    prob = training_gram.interpolation(x, y, 0.3)
else:
    # any other smoothing code falls through to pr_k
    prob = training_gram.pr_k(x, y)

print('Pr({}|{}) = {}'.format(y, x, prob))
# Perplexity evaluation.
#
# Reads the test corpus named by argv[3], wraps every sentence in
# start_sym / end_sym, tokenizes and lower-cases it, then prints the
# perplexity of the test tokens under three estimators of `training_gram`:
# laplace_bigram, interpolation (third argument 0.3) and laplace_unigram.

# collect the lower-cased, non-empty test tokens sentence by sentence
test_words = []
with open(argv[3]) as test_file:
    # iterate inside the `with` so the file is still open even if
    # sentence_segmentation reads it lazily
    sentences = sentence_segmentation(test_file)
    for sen in sentences:
        tokens = tokenization(start_sym + ' ' + sen + ' ' + end_sym)
        test_words.extend(tok.lower() for tok in tokens if tok != '')
test_size = len(test_words)

# an empty test set would otherwise fail below with ZeroDivisionError
if test_size == 0:
    raise SystemExit('No tokens found in the test set; perplexity is undefined')

# running sums of log2 probabilities; turned into perplexities further down
bi_perplexity = 0
inter_perplexity = 0

# NOTE(review): the first context word is start_sym as-is, while the tokens in
# test_words have been lower-cased -- confirm start_sym is already lower-case.
x = start_sym
uni_perplexity = log(training_gram.laplace_unigram(x), 2)

# calculate summation
# NOTE(review): test_words is one flat list, so pairs that span a sentence
# boundary are scored too, and the two bigram sums have test_size - 1 terms
# although they are normalised by test_size below.
for y in test_words[1:]:
    bi_perplexity += log(training_gram.laplace_bigram(x, y), 2)
    inter_perplexity += log(training_gram.interpolation(x, y, 0.3), 2)
    uni_perplexity += log(training_gram.laplace_unigram(y), 2)
    x = y

# calculate perplexities: 2 ** (-(1 / N) * sum of log2 probabilities)
scale = -1 / float(test_size)
bi_perplexity = pow(2, scale * bi_perplexity)
inter_perplexity = pow(2, scale * inter_perplexity)
uni_perplexity = pow(2, scale * uni_perplexity)

# output perplexities
out = "Laplace Bigram: {}\nInterpolated Bigram: {}\nLaplace Unigram: {}".format(
    str(bi_perplexity), str(inter_perplexity), str(uni_perplexity))
# single-argument print(...) behaves the same on Python 2 and Python 3
print(out)