예제 #1
0
    def testRawProbabilities(self):
        """Check the "raw" and "lograw" estimates of a 2-gram model.

        The model is counted on self.corpusfile, then probabilities() is
        queried with each method. For a handful of unigrams and bigrams the
        returned value is compared with the expected count ("raw") or with
        the base-10 logarithm of that count ("lograw"); an expected count
        of zero corresponds to an expected "lograw" value of -99.

        Only the tokens listed in the tables are compared. Entries with any
        other token are ignored, and a listed token that does not occur in
        the result is not reported as a failure.
        """
        unigram_counts = (
            ("a", 15),
            ('b', 10),
            ('c', 4),
            ('d', 3),
            (START_SENT_SYMBOL, 0),
            (END_SENT_SYMBOL, 3),
        )
        bigram_counts = (
            ("a b", 7),
            ("b a", 4),
            (START_SENT_SYMBOL+' a', 3),
            ('b '+END_SENT_SYMBOL, 3),
        )

        def as_log10(table):
            # Same tokens, with log10(count) as expected value (-99 for 0).
            return tuple(
                (name, math.log(count, 10) if count else -99)
                for name, count in table
            )

        def check(entries, table):
            # Each entry is a (token, value, bo) triple; bo is not checked.
            for token, value, _ in entries:
                for name, wanted in table:
                    if token == name:
                        self.assertEqual(value, wanted)

        model = NgramsModel(2)
        model.count(self.corpusfile)

        raw = model.probabilities(method="raw")
        self.assertEqual(len(raw), 2)
        check(raw[0], unigram_counts)
        check(raw[1], bigram_counts)

        lograw = model.probabilities(method="lograw")
        self.assertEqual(len(lograw), 2)
        check(lograw[0], as_log10(unigram_counts))
        check(lograw[1], as_log10(bigram_counts))
예제 #2
0
 def testCount(self):
     """Check the n-gram counts of a 2-gram model built on self.corpusfile.

     After count(), model._ngramcounts must hold two counters. The first
     one is queried with single tokens and the second one with
     space-separated pairs; each get_count() result is compared with the
     expected number of occurrences.
     """
     expected_per_counter = (
         # Counter at index 0: single tokens.
         (
             ('a', 15),
             ('b', 10),
             ('c', 4),
             ('d', 3),
             (START_SENT_SYMBOL, 0),
             (END_SENT_SYMBOL, 3),
         ),
         # Counter at index 1: pairs of tokens.
         (
             ('a b', 7),
             ('b a', 4),
             ('d b', 1),
             ('d c', 2),
             (START_SENT_SYMBOL+' a', 3),
             ('b '+END_SENT_SYMBOL, 3),
         ),
     )

     model = NgramsModel(2)
     model.count(self.corpusfile)
     self.assertEqual(len(model._ngramcounts), 2)

     for index, expected in enumerate(expected_per_counter):
         ngramcounter = model._ngramcounts[index]
         for ngram, occurrences in expected:
             self.assertEqual(ngramcounter.get_count(ngram), occurrences)
예제 #3
0
    def testARPA(self):
        """Check that a model is unchanged by an ARPA save/load round trip.

        A 3-gram model is counted on self.corpusfile and its "logml"
        estimates are saved with ArpaIO into TEMP/model1.arpa. That file is
        loaded into a first SLM, which is saved again as TEMP/model2.arpa
        and loaded into a second SLM. The test passes when compare()
        accepts the two loaded models.
        """
        first_path = os.path.join(TEMP, "model1.arpa")
        second_path = os.path.join(TEMP, "model2.arpa")

        # Estimate the model and write it once with ArpaIO.
        ngrams = NgramsModel(3)
        ngrams.count(self.corpusfile)
        writer = ArpaIO()
        writer.set(ngrams.probabilities("logml"))
        writer.save(first_path)

        # Read it back and write it a second time, through SLM this time.
        first_slm = SLM()
        first_slm.load_from_arpa(first_path)
        first_slm.save_as_arpa(second_path)

        # Read the second file; both loaded models must compare as equal.
        second_slm = SLM()
        second_slm.load_from_arpa(second_path)

        self.assertTrue(compare(first_slm.model, second_slm.model))
예제 #4
0
# ----------------------------------------------------------------------------
# Main program
# ----------------------------------------------------------------------------

# ---------------------------------
# 1. Build an n-gram model of order args.n. When args.r is set, it is handed
#    to set_vocab() before anything is counted.

model = NgramsModel(args.n)
if args.r:
    model.set_vocab(args.r)

# ---------------------------------
# 2. Count the n-grams. Each item of args.i is passed to count() as a
#    separate positional argument.

model.count(*args.i)

# ---------------------------------
# 3. Turn the counts into probabilities with the method named by args.m.

probas = model.probabilities(args.m)

# ---------------------------------
# 4. Save the result in the ARPA file args.o.

arpaio = ArpaIO()
arpaio.set(probas)
arpaio.save(args.o)

# ---------------------------------------------------------------------------
예제 #5
0
    def testMaximumLikelihoodProbabilities(self):
        """Check the "ml" and "logml" estimates of a 3-gram model.

        The model is counted on self.corpusfile, then probabilities() is
        queried with each method. For a handful of unigrams, bigrams and
        trigrams the returned value, rounded to 6 decimals, is compared with
        the expected probability ("ml") or with its rounded base-10
        logarithm ("logml").

        Only the tokens listed in the tables are compared. Entries with any
        other token are ignored, and a listed token that does not occur in
        the result is not reported as a failure.
        """
        def log10(probability):
            # Expected "logml" value, rounded like the value under test.
            return round(math.log(probability, 10), 6)

        def check(entries, table):
            # Each entry is a (token, value, bo) triple; bo is not checked.
            for token, value, _ in entries:
                for name, wanted in table:
                    if token == name:
                        self.assertEqual(round(value, 6), wanted)

        # NOTE(review): the two trigram keys built from START_SENT_SYMBOL and
        # END_SENT_SYMBOL have no space next to the symbol, unlike "a b a".
        # If tokens are always space-separated, these two checks never run.
        # Confirm the key format and the expected values before adding it.
        start_trigram = START_SENT_SYMBOL+"a a"
        end_trigram = "a b"+END_SENT_SYMBOL

        ml_expected = (
            (
                ("a", 0.428571),
                ("b", 0.285714),
                ("c", 0.114286),
                ("d", 0.085714),
                (START_SENT_SYMBOL, 0.),
                (END_SENT_SYMBOL, 0.085714),
            ),
            (
                ("a b", 0.466667),
                ("b a", 0.400000),
            ),
            (
                ("a b a", 0.142857),
                (start_trigram, 0.500000),
                (end_trigram, 0.428571),
            ),
        )
        logml_expected = (
            (
                ("a", log10(0.42857143)),
                ("b", log10(0.28571429)),
                ("c", log10(0.11428571)),
                ("d", log10(0.08571429)),
                (START_SENT_SYMBOL, -99.000000),
                (END_SENT_SYMBOL, log10(0.08571429)),
            ),
            (
                ("a b", log10(0.466667)),
                ("b a", log10(0.400000)),
            ),
            (
                ("a b a", log10(0.142857)),
                (start_trigram, log10(0.500000)),
                (end_trigram, log10(0.428571)),
            ),
        )

        model = NgramsModel(3)
        model.count(self.corpusfile)

        for method, tables in (("ml", ml_expected), ("logml", logml_expected)):
            probas = model.probabilities(method=method)
            self.assertEqual(len(probas), 3)
            for order, table in enumerate(tables):
                check(probas[order], table)