def main():
    """Prepare the Dataset 3 model and test tokens, then run extendedMain().

    Prints the Dataset 3 banner, rebinds the module-level ``nGramList`` to
    the result of ``task_one.generateNgram(N, task_one.corpus3)`` and the
    module-level ``tokenList`` to the processed contents of
    ``task_one.corpus3test``, and finally hands over to ``extendedMain()``.
    """
    global nGramList, tokenList

    print("Perplexity for Dataset 3 using Laplace Smoothing")

    # Model is built from the training corpus before the test file is read,
    # matching the original call order.
    nGramList = task_one.generateNgram(N, task_one.corpus3)
    tokenList = task_one.modifyFile(task_one.readFile(task_one.corpus3test))

    # extendedMain() reads the globals set above.
    # NOTE(review): presumably it computes and prints the perplexity -- confirm.
    extendedMain()
def main():
    """Prepare the Dataset 4 model and test tokens, then run extendedMain().

    Prints the Dataset 4 banner, rebinds the module-level ``nGramList`` to
    the result of ``task_one.generateNgram(N, task_one.corpus4)`` and the
    module-level ``tokenList`` to the processed contents of
    ``task_one.corpus4test``, and finally hands over to ``extendedMain()``.
    """
    global nGramList, tokenList

    print("Perplexity for Dataset 4 using Laplace Smoothing")

    # Model is built from the training corpus before the test file is read,
    # matching the original call order.
    nGramList = task_one.generateNgram(N, task_one.corpus4)
    tokenList = task_one.modifyFile(task_one.readFile(task_one.corpus4test))

    # extendedMain() reads the globals set above.
    # NOTE(review): presumably it computes and prints the perplexity -- confirm.
    extendedMain()
# Task 3: Handle unknown words and implement smoothing
from __future__ import division

import math

import task_one

# First argument handed to task_one.generateNgram.
# NOTE(review): presumably the n-gram order (2 = bigram) -- confirm.
N = 2

print("Perplexity for Dataset 3 using Laplace Smoothing")

# Module-level state shared by the functions in this file.
nGramList = task_one.generateNgram(N, task_one.corpus3)
laplaceList = []
fileText = task_one.readFile(task_one.corpus3test)
tokenList = task_one.modifyFile(fileText)
nGramTestList = []


def findVocabCount():
    """Return the sum of all values stored in ``nGramList[0]``.

    ``nGramList[0]`` is treated as the unigram table; every value in it is
    added up and the total is returned (0 for an empty table).

    NOTE(review): this is the sum of the stored counts, not the number of
    distinct entries. If callers need the vocabulary size for Laplace
    smoothing, ``len(nGramList[0])`` may be what is intended -- confirm
    before changing, behaviour is deliberately left as it was.
    """
    unigram = nGramList[0]
    return sum(unigram.values())


def combineTokens(listWithUnknown):
    """Merge a sequence of dicts into one new dict.

    Args:
        listWithUnknown: iterable of dicts to merge, in order.

    Returns:
        A new dict holding every key from every input dict. When the same
        key occurs in more than one input, the value from the later dict
        wins. An empty input yields an empty dict. The inputs are not
        modified.
    """
    dictTokensWithUnknown = dict()
    for n in listWithUnknown:
        # update() merges in place: same "later wins" result as rebuilding
        # the dict from concatenated item lists, without copying everything
        # on each pass and without relying on items() returning a list.
        dictTokensWithUnknown.update(n)
    return dictTokensWithUnknown