def testInit(self):
    """Check that invalid model settings are rejected.

    Every invalid call is wrapped in its own ``assertRaises`` context:
    a ``with`` block is left as soon as an exception is raised, so a
    statement placed after a raising one in the same block is never
    executed and therefore never checked.

    """
    # Orders 0 and 100 are both expected to be refused.
    with self.assertRaises(NgramOrderValueError):
        sppasNgramsModel(0)
    with self.assertRaises(NgramOrderValueError):
        sppasNgramsModel(100)

    m = sppasNgramsModel(1)

    # A minimum count of 0 is expected to be refused.
    with self.assertRaises(NgramCountValueError):
        m.set_min_count(0)

    # NOTE(review): presumably a non-numeric value is refused either by
    # the model's own check or by an int() conversion; both exception
    # types are accepted here until the intended one is confirmed.
    with self.assertRaises((NgramCountValueError, ValueError)):
        m.set_min_count("a")

    # An unknown estimation method name is expected to be refused.
    with self.assertRaises(NgramMethodNameError):
        m.probabilities(method="toto")
def gen_slm_dependencies(self, basename, N=3):
    """Write the julius dependencies: a dictionary and a language model.

    The pronunciation dictionary is saved as ``basename + ".dict"`` and
    the language model as ``basename + ".arpa"``.

    :param basename: (str) the base name of the slm file and of the
        dictionary file
    :param N: (int) Language model N-gram length.

    """
    dict_path = basename + ".dict"
    arpa_path = basename + ".arpa"

    phones = self._phones.split()
    tokens = self._tokens.split()

    # Tokens and phones are paired by position. The variants of a token
    # are separated by "|" and, inside a variant, "-" is turned into a
    # whitespace before the entry is stored.
    pron_dict = sppasDictPron()
    for word, variants in zip(tokens, phones):
        for one_variant in variants.split("|"):
            pron_dict.add_pron(word, one_variant.replace("-", " "))

    # Sentence boundaries get the "sil" pronunciation if they are unknown.
    for boundary in (START_SENT_SYMBOL, END_SENT_SYMBOL):
        if pron_dict.is_unk(boundary) is True:
            pron_dict.add_pron(boundary, "sil")

    pron_dict.save_as_ascii(dict_path, False)

    # Write the SLM: probabilities are estimated with the "logml" method
    # from the tokens, given as a single sentence.
    ngram_model = sppasNgramsModel(N)
    ngram_model.append_sentences([self._tokens])
    estimates = ngram_model.probabilities(method="logml")

    writer = sppasArpaIO()
    writer.set(estimates)
    writer.save(arpa_path)
def testARPA(self):
    """Check ARPA input validation and a save/load round trip.

    A model counted from ``self.corpusfile`` is saved with sppasArpaIO,
    loaded into a sppasSLM, saved again and loaded a second time; both
    loaded models must compare equal.

    """
    arpaio = sppasArpaIO()

    # One invalid value per context: a ``with`` block is left at the
    # first exception, so grouped statements would not all be checked.
    with self.assertRaises(ModelsDataTypeError):
        arpaio.set("toto")
    with self.assertRaises(ModelsDataTypeError):
        arpaio.set([[], 0])
    # TODO(review): decide what arpaio.set([]) must do (accept an empty
    # model or raise ModelsDataTypeError) and add an explicit assertion.

    first_path = os.path.join(TEMP, "model1.arpa")
    second_path = os.path.join(TEMP, "model2.arpa")

    # Estimate an order-3 model and write it to the first file.
    model = sppasNgramsModel(3)
    model.count(self.corpusfile)
    probas = model.probabilities("logml")
    arpaio.set(probas)
    arpaio.save(first_path)

    # Round trip: load, save under another name, load again.
    slm1 = sppasSLM()
    slm1.load_from_arpa(first_path)
    slm1.save_as_arpa(second_path)

    slm2 = sppasSLM()
    slm2.load_from_arpa(second_path)

    comparator = sppasCompare()
    self.assertTrue(comparator.equals(slm1.model, slm2.model))
def testRawProbabilities(self):
    """Check the "raw" and "lograw" estimates of an order-2 model.

    The model is counted from ``self.corpusfile``. Only the tokens
    listed in the expected tables are compared; other entries are
    ignored.

    """
    def check(entries, expected):
        # Each entry is unpacked as (token, value, bo); bo is not checked.
        for token, value, _bo in entries:
            if token in expected:
                self.assertEqual(value, expected[token])

    start_a = START_SENT_SYMBOL + ' a'
    b_end = 'b ' + END_SENT_SYMBOL

    model = sppasNgramsModel(2)
    model.count(self.corpusfile)

    # "raw" method
    raw = model.probabilities(method="raw")
    self.assertEqual(len(raw), 2)
    check(raw[0], {
        "a": 15,
        'b': 10,
        'c': 4,
        'd': 3,
        START_SENT_SYMBOL: 0,
        END_SENT_SYMBOL: 3,
    })
    check(raw[1], {
        "a b": 7,
        "b a": 4,
        start_a: 3,
        b_end: 3,
    })

    # "lograw" method: base-10 logarithms, -99 for the start symbol.
    lograw = model.probabilities(method="lograw")
    self.assertEqual(len(lograw), 2)
    check(lograw[0], {
        "a": math.log(15, 10),
        'b': math.log(10, 10),
        'c': math.log(4, 10),
        'd': math.log(3, 10),
        START_SENT_SYMBOL: -99,
        END_SENT_SYMBOL: math.log(3, 10),
    })
    check(lograw[1], {
        "a b": math.log(7, 10),
        "b a": math.log(4, 10),
        start_a: math.log(3, 10),
        b_end: math.log(3, 10),
    })
def gen_slm_dependencies(self, basename, N=3):
    """Generate the files julius depends on (dictionary and slm).

    Two files are written: ``basename + ".dict"`` with the
    pronunciations and ``basename + ".arpa"`` with the language model.

    :param basename: (str) base name of the slm and dictionary files
    :param N: (int) Language model N-gram length.

    """
    dict_file = basename + ".dict"
    arpa_file = basename + ".arpa"

    all_phones = self._phones.split()
    all_tokens = self._tokens.split()

    # One dictionary entry per pronunciation variant: variants are
    # "|"-separated and "-" is replaced by a whitespace in each of them.
    lexicon = sppasDictPron()
    for entry, pronunciations in zip(all_tokens, all_phones):
        for candidate in pronunciations.split("|"):
            lexicon.add_pron(entry, candidate.replace("-", " "))

    # Unknown sentence boundaries are pronounced SIL_PHON.
    for symbol in (START_SENT_SYMBOL, END_SENT_SYMBOL):
        if lexicon.is_unk(symbol) is True:
            lexicon.add_pron(symbol, SIL_PHON)

    lexicon.save_as_ascii(dict_file, False)

    # Write the SLM, estimated with the "logml" method from the tokens
    # given as a single sentence.
    slm = sppasNgramsModel(N)
    slm.append_sentences([self._tokens])
    log_probas = slm.probabilities(method="logml")

    arpa_writer = sppasArpaIO()
    arpa_writer.set(log_probas)
    arpa_writer.save(arpa_file)
def testRawProbabilities(self):
    """Compare "raw" and "lograw" estimates with known corpus counts.

    An order-2 model is counted from ``self.corpusfile``. The "lograw"
    expectations are the base-10 logarithm of the "raw" ones, except
    for a count of 0 which is expected as -99.

    """
    def verify(ngrams, wanted):
        # Entries whose token is not in ``wanted`` are not checked.
        for token, value, _ in ngrams:
            for key, target in wanted:
                if token == key:
                    self.assertEqual(value, target)

    def as_logs(pairs):
        # Same tokens, with log10 of the count (or -99 for a count of 0).
        return [(key, math.log(count, 10) if count else -99)
                for key, count in pairs]

    unigram_counts = [
        ("a", 15),
        ('b', 10),
        ('c', 4),
        ('d', 3),
        (START_SENT_SYMBOL, 0),
        (END_SENT_SYMBOL, 3),
    ]
    bigram_counts = [
        ("a b", 7),
        ("b a", 4),
        (START_SENT_SYMBOL+' a', 3),
        ('b '+END_SENT_SYMBOL, 3),
    ]

    model = sppasNgramsModel(2)
    model.count(self.corpusfile)

    estimates = model.probabilities(method="raw")
    self.assertEqual(len(estimates), 2)
    verify(estimates[0], unigram_counts)
    verify(estimates[1], bigram_counts)

    estimates = model.probabilities(method="lograw")
    self.assertEqual(len(estimates), 2)
    verify(estimates[0], as_logs(unigram_counts))
    verify(estimates[1], as_logs(bigram_counts))
def testCount(self):
    """Check the unigram and bigram counts of an order-2 model.

    The model is counted from ``self.corpusfile`` and its internal
    counters are compared with the expected numbers of occurrences.

    """
    model = sppasNgramsModel(2)
    model.count(self.corpusfile)
    self.assertEqual(len(model._ngramcounts), 2)

    # Unigrams: the start symbol is expected with a count of 0.
    unigrams = model._ngramcounts[0]
    for token, expected in (
            ('a', 15),
            ('b', 10),
            ('c', 4),
            ('d', 3),
            (START_SENT_SYMBOL, 0),
            (END_SENT_SYMBOL, 3)):
        self.assertEqual(unigrams.get_count(token), expected)

    # Bigrams: both words are separated by a single whitespace.
    bigrams = model._ngramcounts[1]
    for token, expected in (
            ('a b', 7),
            ('b a', 4),
            ('d b', 1),
            ('d c', 2),
            (START_SENT_SYMBOL + ' a', 3),
            ('b ' + END_SENT_SYMBOL, 3)):
        self.assertEqual(bigrams.get_count(token), expected)
def testCount(self):
    """Verify n-gram counts of the corpus for orders 1 and 2.

    An order-2 model is counted from ``self.corpusfile``; the counter of
    each order is then queried for a few known tokens.

    """
    # One table per counter, in the order of model._ngramcounts.
    expected_by_order = (
        (
            ('a', 15),
            ('b', 10),
            ('c', 4),
            ('d', 3),
            (START_SENT_SYMBOL, 0),
            (END_SENT_SYMBOL, 3),
        ),
        (
            ('a b', 7),
            ('b a', 4),
            ('d b', 1),
            ('d c', 2),
            (START_SENT_SYMBOL+' a', 3),
            ('b '+END_SENT_SYMBOL, 3),
        ),
    )

    model = sppasNgramsModel(2)
    model.count(self.corpusfile)
    self.assertEqual(len(model._ngramcounts), 2)

    for index, expectations in enumerate(expected_by_order):
        counter = model._ngramcounts[index]
        for ngram, occurrences in expectations:
            self.assertEqual(counter.get_count(ngram), occurrences)
def testARPA(self):
    """Check that a model survives an ARPA save/load round trip.

    The "logml" estimates of an order-3 model are saved with
    sppasArpaIO, loaded into a sppasSLM, saved again and re-loaded;
    the two loaded models must compare equal.

    """
    original_file = os.path.join(TEMP, "model1.arpa")
    copy_file = os.path.join(TEMP, "model2.arpa")

    # Estimate the model from the corpus and write it.
    ngrams = sppasNgramsModel(3)
    ngrams.count(self.corpusfile)
    estimates = ngrams.probabilities("logml")
    writer = sppasArpaIO()
    writer.set(estimates)
    writer.save(original_file)

    # Load what was written, then save it under another name.
    loaded = sppasSLM()
    loaded.load_from_arpa(original_file)
    loaded.save_as_arpa(copy_file)

    # Load the copy and compare both models.
    reloaded = sppasSLM()
    reloaded.load_from_arpa(copy_file)

    first_model = loaded.model
    second_model = reloaded.model
    comparator = sppasCompare()
    self.assertTrue(comparator.equals(first_model, second_model))
def testMaximumLikelihoodProbabilities(self):
    """Check the "ml" and "logml" estimates of an order-3 model.

    The model is counted from ``self.corpusfile``. Values are compared
    after rounding to 6 decimals, and only the tokens listed in the
    expected tables are checked.

    """
    def check(entries, expected):
        # Each entry is unpacked as (token, value, bo); bo is not checked.
        for token, value, _bo in entries:
            if token in expected:
                self.assertEqual(round(value, 6), expected[token])

    def log10(proba):
        # Expected "logml" value: base-10 logarithm, rounded like the
        # observed value.
        return round(math.log(proba, 10), 6)

    # The words of a token are separated by a single whitespace (as in
    # "a b a"), so the sentence boundaries need that separator too;
    # without it these keys can never match a token of the model.
    # NOTE(review): confirm the expected values of these two trigrams
    # (0.5 and 0.428571) against the corpus.
    start_a_a = START_SENT_SYMBOL + " a a"
    a_b_end = "a b " + END_SENT_SYMBOL

    model = sppasNgramsModel(3)
    model.count(self.corpusfile)

    # "ml" method
    probas = model.probabilities(method="ml")
    self.assertEqual(len(probas), 3)
    check(probas[0], {
        "a": 0.428571,
        "b": 0.285714,
        "c": 0.114286,
        "d": 0.085714,
        START_SENT_SYMBOL: 0.,
        END_SENT_SYMBOL: 0.085714,
    })
    check(probas[1], {
        "a b": 0.466667,
        "b a": 0.400000,
    })
    check(probas[2], {
        "a b a": 0.142857,
        start_a_a: 0.500000,
        a_b_end: 0.428571,
    })

    # "logml" method: -99 is expected for the start symbol.
    probas = model.probabilities(method="logml")
    self.assertEqual(len(probas), 3)
    check(probas[0], {
        "a": log10(0.42857143),
        "b": log10(0.28571429),
        "c": log10(0.11428571),
        "d": log10(0.08571429),
        START_SENT_SYMBOL: -99.000000,
        END_SENT_SYMBOL: log10(0.08571429),
    })
    check(probas[1], {
        "a b": log10(0.466667),
        "b a": log10(0.400000),
    })
    check(probas[2], {
        "a b a": log10(0.142857),
        start_a_a: log10(0.500000),
        a_b_end: log10(0.428571),
    })
def testMaximumLikelihoodProbabilities(self):
    """Compare "ml" and "logml" estimates with known corpus values.

    An order-3 model is counted from ``self.corpusfile``. Observed
    values are rounded to 6 decimals before being compared; tokens that
    are not listed in the expected tables are ignored.

    """
    def verify(ngrams, wanted):
        # Each entry is unpacked as (token, value, bo); bo is not checked.
        for token, value, _ in ngrams:
            for key, target in wanted:
                if token == key:
                    self.assertEqual(round(value, 6), target)

    def rounded_log(proba):
        # Base-10 logarithm rounded to the precision of the comparison.
        return round(math.log(proba, 10), 6)

    # Boundary symbols are separated from the words by a whitespace,
    # exactly like the words of "a b a"; a key built without it would
    # never be equal to a token and its assertion would never run.
    # NOTE(review): the expected values attached to these two keys
    # (0.5 and 0.428571) should be confirmed against the corpus.
    start_trigram = START_SENT_SYMBOL+" a a"
    end_trigram = "a b "+END_SENT_SYMBOL

    model = sppasNgramsModel(3)
    model.count(self.corpusfile)

    # Maximum likelihood estimates.
    estimates = model.probabilities(method="ml")
    self.assertEqual(len(estimates), 3)
    verify(estimates[0], [
        ("a", 0.428571),
        ("b", 0.285714),
        ("c", 0.114286),
        ("d", 0.085714),
        (START_SENT_SYMBOL, 0.),
        (END_SENT_SYMBOL, 0.085714),
    ])
    verify(estimates[1], [
        ("a b", 0.466667),
        ("b a", 0.400000),
    ])
    verify(estimates[2], [
        ("a b a", 0.142857),
        (start_trigram, 0.500000),
        (end_trigram, 0.428571),
    ])

    # Same estimates as base-10 logarithms; the start symbol is
    # expected with the value -99.
    estimates = model.probabilities(method="logml")
    self.assertEqual(len(estimates), 3)
    verify(estimates[0], [
        ("a", rounded_log(0.42857143)),
        ("b", rounded_log(0.28571429)),
        ("c", rounded_log(0.11428571)),
        ("d", rounded_log(0.08571429)),
        (START_SENT_SYMBOL, -99.000000),
        (END_SENT_SYMBOL, rounded_log(0.08571429)),
    ])
    verify(estimates[1], [
        ("a b", rounded_log(0.466667)),
        ("b a", rounded_log(0.400000)),
    ])
    verify(estimates[2], [
        ("a b a", rounded_log(0.142857)),
        (start_trigram, rounded_log(0.500000)),
        (end_trigram, rounded_log(0.428571)),
    ])
action='store_true', help="Disable the verbosity.") if len(sys.argv) <= 1: sys.argv.append('-h') args = parser.parse_args() # ---------------------------------------------------------------------------- # Main program # ---------------------------------------------------------------------------- # --------------------------------- # 1. Create a sppasNgramsModel model = sppasNgramsModel(args.n) if args.r: model.set_vocab(args.r) # --------------------------------- # 2. Estimate counts of each n-gram model.count(*(args.i)) # --------------------------------- # 3. Estimate probabilities probas = model.probabilities(args.m) # --------------------------------- # 4. Write in an ARPA file