def testRawProbabilities(self):
    """Check the "raw" and "lograw" estimates of a bigram model.

    The model is built with NgramsModel(2) and counted on self.corpusfile.
    With method="raw" the value attached to each checked token must be
    its count; with method="lograw" it must be the base-10 logarithm of
    that count, or -99 for the token whose count is zero.

    Only the tokens listed in the tables below are checked; any other
    token returned by the model is ignored.
    """
    # Expected raw values for the checked tokens, per n-gram order.
    unigram_counts = {
        "a": 15,
        "b": 10,
        "c": 4,
        "d": 3,
        START_SENT_SYMBOL: 0,
        END_SENT_SYMBOL: 3,
    }
    bigram_counts = {
        "a b": 7,
        "b a": 4,
        START_SENT_SYMBOL + " a": 3,
        "b " + END_SENT_SYMBOL: 3,
    }

    model = NgramsModel(2)
    model.count(self.corpusfile)

    # --- method="raw": values are compared exactly.
    probas = model.probabilities(method="raw")
    self.assertEqual(len(probas), 2)
    for ngrams, counts in ((probas[0], unigram_counts),
                           (probas[1], bigram_counts)):
        for token, value, _bo in ngrams:
            if token in counts:
                self.assertEqual(value, counts[token])

    # --- method="lograw": log10 of the raw value.
    probas = model.probabilities(method="lograw")
    self.assertEqual(len(probas), 2)
    for ngrams, counts in ((probas[0], unigram_counts),
                           (probas[1], bigram_counts)):
        for token, value, _bo in ngrams:
            if token not in counts:
                continue
            count = counts[token]
            if count == 0:
                # A zero count has no logarithm; -99 is the expected
                # stand-in value, compared exactly.
                self.assertEqual(value, -99)
            else:
                # Logarithms are floats: compare with a tolerance rather
                # than bit-for-bit, so the test does not depend on how
                # the model computes its base-10 logarithm.
                self.assertAlmostEqual(value, math.log(count, 10))
def testCount(self):
    """Check the n-gram counts of a bigram model built on self.corpusfile.

    The model must hold one counter per order (two in total); the first
    is queried for single tokens and the second for token pairs.
    """
    # (token, expected count) pairs, in the order they are verified.
    expected_unigrams = (
        ('a', 15),
        ('b', 10),
        ('c', 4),
        ('d', 3),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 3),
    )
    expected_bigrams = (
        ('a b', 7),
        ('b a', 4),
        ('d b', 1),
        ('d c', 2),
        (START_SENT_SYMBOL + ' a', 3),
        ('b ' + END_SENT_SYMBOL, 3),
    )

    model = NgramsModel(2)
    model.count(self.corpusfile)
    self.assertEqual(len(model._ngramcounts), 2)

    for order, expectations in enumerate((expected_unigrams,
                                          expected_bigrams)):
        counter = model._ngramcounts[order]
        for token, expected in expectations:
            self.assertEqual(counter.get_count(token), expected)
def testARPA(self):
    """Round-trip a model through two ARPA files and compare the results.

    A model built with NgramsModel(3) is estimated with "logml" and saved
    by ArpaIO into model1.arpa. That file is loaded into an SLM, saved
    again as model2.arpa, and loaded into a second SLM. The `model`
    attributes of both SLM instances must satisfy compare().
    """
    first_path = os.path.join(TEMP, "model1.arpa")
    second_path = os.path.join(TEMP, "model2.arpa")

    # Estimate the probabilities and write the first ARPA file.
    ngrams = NgramsModel(3)
    ngrams.count(self.corpusfile)
    estimates = ngrams.probabilities("logml")
    writer = ArpaIO()
    writer.set(estimates)
    writer.save(first_path)

    # Load the first file, then save it again under the second name.
    first_slm = SLM()
    first_slm.load_from_arpa(first_path)
    first_slm.save_as_arpa(second_path)

    # Load the re-saved file.
    second_slm = SLM()
    second_slm.load_from_arpa(second_path)

    self.assertTrue(compare(first_slm.model, second_slm.model))
# ----------------------------------------------------------------------------
# Main program
# ----------------------------------------------------------------------------

# Step 1 -- create the n-gram model of order args.n; when args.r is set,
# hand it to the model as its vocabulary.
model = NgramsModel(args.n)
if args.r:
    model.set_vocab(args.r)

# Step 2 -- estimate the counts of each n-gram from the input(s) in args.i
# (each element of args.i is passed as a separate argument).
model.count(*args.i)

# Step 3 -- estimate the probabilities with the method named by args.m.
probas = model.probabilities(args.m)

# Step 4 -- write the estimates to the ARPA file args.o.
arpaio = ArpaIO()
arpaio.set(probas)
arpaio.save(args.o)

# ---------------------------------------------------------------------------
def testMaximumLikelihoodProbabilities(self):
    """Check the "ml" and "logml" estimates of a trigram model.

    The model is built with NgramsModel(3) and counted on self.corpusfile.
    For each order, the value of every token listed in the tables below
    is rounded to 6 decimals and compared with the expected value; any
    other token returned by the model is ignored.
    """
    def log10_6(proba):
        """Return the base-10 logarithm of proba, rounded to 6 decimals."""
        return round(math.log(proba, 10), 6)

    def check(ngrams, expected):
        """Compare the rounded value of each listed token with its expectation."""
        for token, value, _bo in ngrams:
            if token in expected:
                self.assertEqual(round(value, 6), expected[token])

    # N-gram tokens are space-separated, so the sentence-boundary symbols
    # must be joined to their neighbours with a space as well. Without it
    # the trigram keys can never match and their checks silently never run.
    # NOTE(review): the two boundary trigram checks were unreachable before
    # the separator was added -- confirm their expected values against the
    # corpus if they fail.
    start_trigram = START_SENT_SYMBOL + " a a"
    end_trigram = "a b " + END_SENT_SYMBOL

    model = NgramsModel(3)
    model.count(self.corpusfile)

    # --- method="ml"
    probas = model.probabilities(method="ml")
    self.assertEqual(len(probas), 3)
    check(probas[0], {
        "a": 0.428571,
        "b": 0.285714,
        "c": 0.114286,
        "d": 0.085714,
        START_SENT_SYMBOL: 0.,
        END_SENT_SYMBOL: 0.085714,
    })
    check(probas[1], {
        "a b": 0.466667,
        "b a": 0.400000,
    })
    check(probas[2], {
        "a b a": 0.142857,
        start_trigram: 0.500000,
        end_trigram: 0.428571,
    })

    # --- method="logml": base-10 logarithm of the "ml" values, with
    # -99 expected for the token whose "ml" value is zero.
    probas = model.probabilities(method="logml")
    self.assertEqual(len(probas), 3)
    check(probas[0], {
        "a": log10_6(0.42857143),
        "b": log10_6(0.28571429),
        "c": log10_6(0.11428571),
        "d": log10_6(0.08571429),
        START_SENT_SYMBOL: -99.000000,
        END_SENT_SYMBOL: log10_6(0.08571429),
    })
    check(probas[1], {
        "a b": log10_6(0.466667),
        "b a": log10_6(0.400000),
    })
    check(probas[2], {
        "a b a": log10_6(0.142857),
        start_trigram: log10_6(0.500000),
        end_trigram: log10_6(0.428571),
    })