def testNormalizePunctuation(self):
    """Check punctuation normalization before and after selecting language id 1.

    The input is every character of ``string.punctuation`` plus the
    per-mille sign. The first pass runs with the formula's initial
    language setting and expects only ``$%&'-/@`` and the per-mille
    sign to remain. The second pass runs on that already-normalized
    text after ``setLanguageId(1)`` and expects the symbols to be
    spelled out as words.
    """
    f = LMPreparationFormula()
    # string.punctuation + u"‰" is already a single text string, so no
    # join is required to build the input.
    f.setText(string.punctuation + u"‰")

    f._normalizePunctuation(self.allPunctList)
    strResult = f.getText()
    gt = u"$%&'-/@\u2030"
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(gt, strResult)

    # Same formula instance: the text normalized above is the input here.
    f.setLanguageId(1)
    f._normalizePunctuation(self.allPunctList)
    strResult = f.getText()
    gt = "dollars pourcent et '-/ at pour mille"
    self.assertEqual(gt, strResult)
def testNormalizePunctuation(self):
    """Check punctuation normalization before and after selecting language id 1.

    The input is every character of ``string.punctuation`` plus the
    per-mille sign. The first pass runs with the formula's initial
    language setting and expects only ``$%&'@`` and the per-mille sign
    to remain. The second pass runs on that already-normalized text
    after ``setLanguageId(1)`` and expects the symbols to be spelled
    out as words.
    """
    f = LMPreparationFormula()
    # string.punctuation + u"‰" is already a single text string, so no
    # join is required to build the input.
    f.setText(string.punctuation + u"‰")

    f._normalizePunctuation(self.allPunctList)
    strResult = f.getText()
    gt = u"$%&'@\u2030"
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(gt, strResult)

    # Same formula instance: the text normalized above is the input here.
    f.setLanguageId(1)
    f._normalizePunctuation(self.allPunctList)
    strResult = f.getText()
    gt = "dollars pourcent et ' at pour mille"
    self.assertEqual(gt, strResult)
def testNormalizePunctuation(self):
    """Check punctuation normalization before and after selecting language id 1.

    The input is every character of ``string.punctuation``. The first
    pass runs with the formula's initial language setting and expects
    ``$%&' @`` to remain. The second pass runs on that
    already-normalized text after ``setLanguageId(1)`` and expects the
    symbols to be spelled out as words.
    """
    f = LMPreparationFormula()
    # u"".join(...) is kept on purpose: on Python 2 it turns the byte
    # string string.punctuation into a unicode string.
    f.setText(u"".join(string.punctuation))

    f._normalizePunctuation(self.allPunctList)
    strResult = f.getText()
    gt = "$%&' @"
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(gt, strResult)

    # Same formula instance: the text normalized above is the input here.
    f.setLanguageId(1)
    f._normalizePunctuation(self.allPunctList)
    strResult = f.getText()
    gt = "dollars pourcent et ' at"
    self.assertEqual(gt, strResult)
def testNormalizeUtf8(self):
    """Check that _normalizeUtf8 maps UTF8MAP matches to their substitutions.

    UTF8MAP entries are grouped by language id ('0', '1', '2'). For
    each group, the space-joined match strings are normalized and
    compared with the space-joined substitutions, stripped and passed
    through SPACEPATTERN like the expected text was before.
    """
    languages = ['0', '1', '2']
    testList = {lang: [] for lang in languages}
    gtList = {lang: [] for lang in languages}

    for match, sub, comment, languageId in UTF8MAP:
        # The keys in `languages` are strings. Comparing them with
        # int(languageId), as was done before, is never equal, so every
        # list stayed empty and the test only compared empty strings.
        key = str(int(languageId))
        if key in testList:
            testList[key].append(match)
            gtList[key].append(sub)

    # Guard against the test silently checking nothing at all.
    self.assertTrue(any(testList.values()),
                    "no UTF8MAP entry matched a tested language id")

    for lang in languages:
        strGt = u" ".join(gtList[lang]).strip()
        strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

        # NOTE(review): the formula's language id is left untouched here.
        # If _normalizeUtf8 depends on it, f.setLanguageId(int(lang)) may
        # be required before normalizing -- confirm.
        f = LMPreparationFormula()
        f.setText(u" ".join(testList[lang]))
        f._normalizeUtf8()
        strResult = f.getText()

        # assertEqual replaces the deprecated assertEquals alias, which
        # was removed from unittest in Python 3.12.
        self.assertEqual(strGt.encode('utf-8'), strResult.encode('utf-8'))
def testNormalizeUtf8(self):
    """Check that _normalizeUtf8 maps UTF8MAP matches to their substitutions.

    UTF8MAP entries are grouped by language id ('0', '1', '2'). For
    each group, the space-joined match strings are normalized and
    compared with the space-joined substitutions, stripped and passed
    through SPACEPATTERN like the expected text was before.
    """
    languages = ['0', '1', '2']
    testList = {lang: [] for lang in languages}
    gtList = {lang: [] for lang in languages}

    for match, sub, comment, languageId in UTF8MAP:
        # The keys in `languages` are strings. Comparing them with
        # int(languageId), as was done before, is never equal, so every
        # list stayed empty and the test only compared empty strings.
        key = str(int(languageId))
        if key in testList:
            testList[key].append(match)
            gtList[key].append(sub)

    # Guard against the test silently checking nothing at all.
    self.assertTrue(any(testList.values()),
                    "no UTF8MAP entry matched a tested language id")

    for lang in languages:
        strGt = u" ".join(gtList[lang]).strip()
        strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

        # NOTE(review): the formula's language id is left untouched here.
        # If _normalizeUtf8 depends on it, f.setLanguageId(int(lang)) may
        # be required before normalizing -- confirm.
        f = LMPreparationFormula()
        f.setText(u" ".join(testList[lang]))
        f._normalizeUtf8()
        strResult = f.getText()

        # assertEqual replaces the deprecated assertEquals alias, which
        # was removed from unittest in Python 3.12.
        self.assertEqual(strGt.encode('utf-8'), strResult.encode('utf-8'))
def testNormalizePunctuation(self):
    """Verify punctuation normalization with number expansion switched off.

    All of ``string.punctuation`` plus the per-mille sign is normalized
    twice on the same formula: once with its initial language setting,
    then again, on the already-normalized text, after selecting
    language id 1.
    """
    formula = LMPreparationFormula()
    rawText = "".join(string.punctuation + "‰")
    formula.setText(rawText)
    formula.setExpandNumberInWords(False)

    def normalized():
        # Run one normalization pass and hand back the current text.
        formula._normalizePunctuation(self.allPunctList)
        return formula.getText()

    # First pass: only these symbols are expected to survive.
    self.assertEqual("$%&'-/@‰", normalized())

    # Second pass: with language id 1 the symbols are expected as words.
    formula.setLanguageId(1)
    self.assertEqual("dollars pourcent et '-/ at pour mille", normalized())
def testNormalizeCharacters(self):
    """Check UTF-8 normalization followed by punctuation normalization.

    The input mixes plain letters with a fullwidth semicolon (U+FF1B),
    a comma, a percent sign and the ligature "œ". After both passes the
    test expects ``a b c % oe``.
    """
    # A plain u"" literal replaces the former ur"" prefix, which is a
    # SyntaxError on Python 3. The value is unchanged: Python 2 also
    # interprets \uXXXX escapes inside raw unicode literals.
    strTest = u"a b c \uff1b , % œ"
    strGt = u"a b c % oe"

    f = LMPreparationFormula()
    f.setText(strTest)
    f._normalizeUtf8()
    f._normalizePunctuation(self.allPunctList)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(strGt, f.getText())
def testNormalizeCharacters(self):
    """Check UTF-8 normalization followed by punctuation normalization.

    The input mixes plain letters with a fullwidth semicolon (U+FF1B),
    a comma, a percent sign and the ligature "œ". After both passes the
    test expects ``a b c % oe``.
    """
    # A plain u"" literal replaces the former ur"" prefix, which is a
    # SyntaxError on Python 3. The value is unchanged: Python 2 also
    # interprets \uXXXX escapes inside raw unicode literals.
    strTest = u"a b c \uff1b , % œ"
    strGt = u"a b c % oe"

    f = LMPreparationFormula()
    f.setText(strTest)
    f._normalizeUtf8()
    f._normalizePunctuation(self.allPunctList)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(strGt, f.getText())
def testNormalizePunctuationKeepInWords(self):
    """Check that punctuation inside words survives when new words are kept.

    With ``setKeepNewWords(True)`` the test expects the hyphen in
    "HES-SO" and the slash in "AdG/LA" to be preserved, while the
    free-standing "/" and "-" tokens are removed.
    """
    f = LMPreparationFormula()
    f.setKeepNewWords(True)
    # The literal is pure ASCII, so a u"" literal has the same value and
    # type as the former u"".join("...") on both Python 2 and Python 3.
    f.setText(u"/ HES-SO und AdG/LA - auch im Winter / Sommer -")

    f._normalizePunctuation(self.allPunctList)
    strResult = f.getText()

    gt = "HES-SO und AdG/LA auch im Winter Sommer"
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(gt, strResult)
def testNormalizePunctuationKeepInWords(self):
    """Check that punctuation inside words survives when new words are kept.

    With ``setKeepNewWords(True)`` the test expects the hyphen in
    "HES-SO" and the slash in "AdG/LA" to be preserved, while the
    free-standing "/" and "-" tokens are removed.
    """
    f = LMPreparationFormula()
    f.setKeepNewWords(True)
    # The literal is pure ASCII, so a u"" literal has the same value and
    # type as the former u"".join("...") on both Python 2 and Python 3.
    f.setText(u"/ HES-SO und AdG/LA - auch im Winter / Sommer -")

    f._normalizePunctuation(self.allPunctList)
    strResult = f.getText()

    gt = "HES-SO und AdG/LA auch im Winter Sommer"
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(gt, strResult)
def testNormalizeUtf8(self):
    """Check that _normalizeUtf8 maps every UTF8MAP match to its substitution.

    All match strings from UTF8MAP are joined with spaces and
    normalized. The result is compared with the space-joined
    substitutions, stripped and passed through SPACEPATTERN.
    """
    testList = [match for match, sub, comment, languageId in UTF8MAP]
    gtList = [sub for match, sub, comment, languageId in UTF8MAP]

    # strip() already removes trailing whitespace, so the former extra
    # rstrip() call was redundant.
    strGt = u" ".join(gtList).strip()
    strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

    f = LMPreparationFormula()
    f.setText(u" ".join(testList))
    f._normalizeUtf8()
    strResult = f.getText()

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(strGt.encode("utf-8"), strResult.encode("utf-8"))
def testNormalizeUtf8(self):
    """Check that _normalizeUtf8 maps every UTF8MAP match to its substitution.

    All match strings from UTF8MAP are joined with spaces and
    normalized. The result is compared with the space-joined
    substitutions, stripped and passed through SPACEPATTERN.
    """
    testList = [match for match, sub, comment, languageId in UTF8MAP]
    gtList = [sub for match, sub, comment, languageId in UTF8MAP]

    # strip() already removes trailing whitespace, so the former extra
    # rstrip() call was redundant.
    strGt = u" ".join(gtList).strip()
    strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

    f = LMPreparationFormula()
    f.setText(u" ".join(testList))
    f._normalizeUtf8()
    strResult = f.getText()

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(strGt.encode('utf-8'), strResult.encode('utf-8'))