Exemplo n.º 1
0
    def testGerman(self):
        """Run ``prepareText`` on German samples (language id 2).

        Accented words are expected to come back unchanged. The hyphenated
        number ``18-jähriger`` is checked twice: spelled out while number
        expansion is enabled, and untouched once it is disabled.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(True)
        formula.setLanguageId(2)

        def check(samples):
            # Both sides are compared in their UTF-8 encoded form.
            for source, expected in samples:
                formula.setText(source)
                prepared = formula.prepareText()
                self.assertEqual(expected.encode('utf-8'),
                                 prepared.encode('utf-8'))

        # Number expansion on: no new words are kept
        check([
            (r"emmaüs", r"emmaüs"),
            ("mein àrbeit", "mein àrbeit"),
            (r"môchten", r"môchten"),
            (r"mädchen", r"mädchen"),
            (r"meîn", r"meîn"),
            (r"meïn", r"meïn"),
            (r"18-jähriger", r"achtzehn jähriger"),
        ])

        # Number expansion off: new words are kept
        formula.setExpandNumberInWords(False)
        check([(r"18-jähriger", r"18-jähriger")])
Exemplo n.º 2
0
    def testNormalizeUtf8(self):
        """Check ``_normalizeUtf8`` against ``UTF8MAP``, one language at a time.

        For each language id in 0..2, the ``match`` strings of the entries
        tagged with that id are joined with spaces and normalized. The result
        must equal the joined ``sub`` strings, stripped and with every
        ``SPACEPATTERN`` match replaced by a single space.
        """
        # Language ids are integers so that they can equal int(languageId).
        # They used to be the strings '0', '1', '2', which never compare
        # equal to an int: every list stayed empty and the test was vacuous.
        languages = [0, 1, 2]

        testList = {lang: [] for lang in languages}
        gtList = {lang: [] for lang in languages}
        for match, sub, comment, languageId in UTF8MAP:
            entryLang = int(languageId)
            if entryLang in testList:
                testList[entryLang].append(match)
                gtList[entryLang].append(sub)

        for lang in languages:
            strGt = u" ".join(gtList[lang]).strip()
            strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

            # NOTE(review): the formula keeps its default language id here.
            # If _normalizeUtf8 depends on the language, setLanguageId(lang)
            # may be needed -- confirm against LMPreparationFormula.
            f = LMPreparationFormula()
            f.setText(u" ".join(testList[lang]))
            f._normalizeUtf8()
            strResult = f.getText()

            self.assertEqual(strGt.encode('utf-8'), strResult.encode('utf-8'))
Exemplo n.º 3
0
    def testFrench(self):
        """Run ``prepareText`` on French samples (language id 1).

        With number expansion enabled, accented words stay as they are,
        ordinals and numbers are spelled out and the hyphen of ``18-age``
        disappears. With expansion disabled, ``18-age`` is returned as is.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(True)
        formula.setLanguageId(1)

        def check(samples):
            # Both sides are compared in their UTF-8 encoded form.
            for source, expected in samples:
                formula.setText(source)
                prepared = formula.prepareText()
                self.assertEqual(expected.encode('utf-8'),
                                 prepared.encode('utf-8'))

        # Number expansion on: no new words are kept, hyphens are removed
        check([
            (r"à plus tard", r"à plus tard"),
            (r"maîtres", r"maîtres"),
            (r"maïs", r"maïs"),
            (r"emmaüs", r"emmaüs"),
            (r"mäman", r"mäman"),
            (r"1er", r"premier"),
            (r"20ème", r"vingtième"),
            (r"18-age", r"dix huit age"),
        ])

        # Keeping new words implies keeping hyphens inside words
        formula.setExpandNumberInWords(False)
        check([(r"18-age", r"18-age")])
Exemplo n.º 4
0
 def testExpandAbbreviations(self):
     """Check every entry of ``ABBREVIATIONS`` for every language.

     For each language id, each abbreviation is written to ``strText``,
     ``_expandAbbreviations`` is run and ``strText`` must then equal the
     expansion stored in the table.
     """
     f = LMPreparationFormula()
     for languageId, v in ABBREVIATIONS.items():
         f.setLanguageId(languageId)
         for abbr, gt in v.items():
             f.strText = abbr
             f._expandAbbreviations()
             # assertEqual: the assertEquals alias is deprecated and was
             # removed in Python 3.12.
             self.assertEqual(gt.encode('utf-8'), f.strText.encode('utf-8'))
Exemplo n.º 5
0
    def testNormalizeCharacters(self):
        """Check UTF-8 normalization followed by punctuation normalization.

        The input holds a fullwidth semicolon (U+FF1B), a comma, a percent
        sign and an ``œ`` ligature. After ``_normalizeUtf8`` and
        ``_normalizePunctuation`` only ``a b c % oe`` may remain.
        """
        # Plain u"" literals: the ur"" prefix is a syntax error on Python 3,
        # and on Python 2 a raw unicode literal decodes \uXXXX anyway, so the
        # text is the same.
        strTest = u"a b c \uff1b , % œ"
        strGt = u"a b c % oe"

        f = LMPreparationFormula()
        f.setText(strTest)
        f._normalizeUtf8()
        f._normalizePunctuation(self.allPunctList)
        self.assertEqual(strGt, f.getText())
Exemplo n.º 6
0
    def testFilterNoiseWords(self):
        """Check that ``_filterNoiseWords`` drops punctuation-only tokens.

        The value returned by ``_filterNoiseWords`` must be the input
        sentence reduced to ``hello how are you``.
        """
        strTest = u"!-?- hello how !!!! are you *-+$"
        strGt = u"hello how are you"

        f = LMPreparationFormula()
        f.setText(strTest)
        # Keep the result in its own name instead of overwriting the input.
        strResult = f._filterNoiseWords()

        self.assertEqual(strGt, strResult)
Exemplo n.º 7
0
    def testNormalizePunctuationKeepInWords(self):
        """Check ``_normalizePunctuation`` when new words are kept.

        With ``setKeepNewWords(True)`` the stand-alone ``/`` and ``-`` tokens
        must vanish while ``HES-SO`` and ``AdG/LA`` keep their inner
        punctuation.
        """
        f = LMPreparationFormula()
        f.setKeepNewWords(True)

        # A unicode literal replaces u"".join(<str>), which merely rebuilt
        # the same text character by character.
        f.setText(u"/ HES-SO und AdG/LA - auch im Winter / Sommer -")
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        gt = "HES-SO und AdG/LA auch im Winter Sommer"
        self.assertEqual(gt, strResult)
Exemplo n.º 8
0
    def testFrench(self):
        """Run ``prepareText`` on French samples (language id 1).

        Every accented sample is expected to come back unchanged.
        """
        # u"" instead of ur"": none of these literals contains a backslash,
        # so the text is identical, and ur"" does not parse on Python 3.
        testList = [(u"à plus tard", u"à plus tard"),
                    (u"maîtres", u"maîtres"), (u"maïs", u"maïs"),
                    (u"emmaüs", u"emmaüs"), (u"mäman", u"mäman")]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))
Exemplo n.º 9
0
    def testGerman(self):
        """Run ``prepareText`` on German samples (language id 2).

        Every accented sample is expected to come back unchanged.
        """
        # u"" instead of ur"": none of these literals contains a backslash,
        # so the text is identical, and ur"" does not parse on Python 3.
        testList = [(u"emmaüs", u"emmaüs"), (u"mein àrbeit", u"mein àrbeit"),
                    (u"môchten", u"môchten"), (u"mädchen", u"mädchen"),
                    (u"meîn", u"meîn"), (u"meïn", u"meïn")]

        f = LMPreparationFormula()
        f.setLanguageId(2)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))
Exemplo n.º 10
0
    def testContractionPrefixes(self):
        """Check that a detached ``' s`` is glued back onto its word.

        Each case carries its own language id; number expansion is disabled
        for the whole test.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(False)

        cases = [
            (r"President' s", r"president's", 3),
            (r"President' s of", r"president's of", 3),
        ]
        for source, expected, languageId in cases:
            formula.setLanguageId(languageId)
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))
Exemplo n.º 11
0
    def testNormalizePunctuation(self):
        """Check ``_normalizePunctuation`` on all ASCII punctuation plus ``‰``.

        The first pass runs with the formula's default language and must
        leave only ``$%&'@‰``. The second pass runs on that result after
        switching to language id 1 and must spell the symbols out in words.
        """
        f = LMPreparationFormula()
        # The concatenation already is the full text; wrapping it in
        # u"".join() only rebuilt the same string.
        f.setText(string.punctuation + u"‰")
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        gt = u"$%&'@\u2030"
        self.assertEqual(gt, strResult)

        f.setLanguageId(1)
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        gt = "dollars pourcent et ' at pour mille"
        self.assertEqual(gt, strResult)
Exemplo n.º 12
0
    def testAll(self):
        """Run the whole ``prepareText`` pipeline on French samples.

        Covers a leading capital ``A``, an enumeration number, letter/digit
        acronyms, an abbreviation and a sentence with dotted initials, all
        with language id 1.
        """
        # u"" instead of ur"": none of these literals contains a backslash,
        # so the text is identical, and ur"" does not parse on Python 3.
        testList = [(u"A dix heures", u"à dix heures"),
                    (u"1. Election", u"premièrement election"),
                    (u"R1", u"r. un"), (u"A1", u"a. un"),
                    (u"P3B", u"p. trois b."),
                    (u"P5B4", u"p. cinq b. quatre"),
                    (u"PPB5", u"p. p. b.  cinq"),
                    (u"rte", u"route"),
                    (u"Constantin, p. l. r., président de",
                     u"constantin p. l. r. président de")]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))
Exemplo n.º 13
0
    def testNormalizePunctuation(self):
        """Check ``_normalizePunctuation`` on all ASCII punctuation plus ``‰``.

        Number expansion is disabled. The first pass, with the default
        language, must leave ``$%&'-/@‰``. The second pass runs on that
        result with language id 1 and must spell most symbols out in words.
        """
        formula = LMPreparationFormula()
        formula.setText(string.punctuation + "‰")
        formula.setExpandNumberInWords(False)

        # First pass: default language
        formula._normalizePunctuation(self.allPunctList)
        self.assertEqual("$%&'-/@‰", formula.getText())

        # Second pass: same text, language id 1
        formula.setLanguageId(1)
        formula._normalizePunctuation(self.allPunctList)
        self.assertEqual("dollars pourcent et '-/ at pour mille",
                         formula.getText())
Exemplo n.º 14
0
    def __init__(self, document, sentenceText):
        """Create a cluster that holds a single sentence.

        document     -- kept as-is on ``self.document``
        sentenceText -- added to the cluster through ``addElement``
        """
        # Each new cluster bumps the class-wide counter; the new value is
        # used, as a string, for the key given to Cluster.__init__
        TextCluster.ID_COUNTER += 1
        clusterKey = str(TextCluster.ID_COUNTER)

        # The language attribute starts at 0, meaning unknown language
        initialAttributes = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]

        Cluster.__init__(self, clusterKey, initialAttributes)

        self.document = document

        # The sentence itself is the cluster's data
        self.addElement(sentenceText)

        # Formula used for LM normalization
        self.lmPreparationFormula = LMPreparationFormula()
Exemplo n.º 15
0
    def testExpandNumberInWords(self):
        """Check ``_expandNumberInWords`` through ``self.verifyEqual``.

        The first batch runs on a freshly built formula. The remaining
        batches run with number expansion disabled, one language id each.
        """
        formula = LMPreparationFormula()

        # Fresh formula, no language set
        mixedCases = [
            (r"A1", r"A. 1"),
            (r"P3B", r"P. 3 B."),
            (r"P5B4", r"P. 5 B. 4"),
            (r"PPB5", r"PPB 5"),
            (r"10jährige", r"10 jährige"),
        ]
        self.verifyEqual(mixedCases, formula, formula._expandNumberInWords)

        # Expansion disabled: one batch per language id, in this order
        formula.setExpandNumberInWords(False)
        casesByLanguage = [
            (1, [(r"1er", r"1er")]),
            (3, [(r"1st", r"1st")]),
            (2, [(r"18-jähriger", r"18 -jähriger")]),
        ]
        for languageId, cases in casesByLanguage:
            formula.setLanguageId(languageId)
            self.verifyEqual(cases, formula, formula._expandNumberInWords)
Exemplo n.º 16
0
    def testNormalizeUtf8(self):
        """Check ``_normalizeUtf8`` against the whole ``UTF8MAP`` table.

        All ``match`` strings are joined with spaces and normalized. The
        result must equal the joined ``sub`` strings, stripped and with every
        ``SPACEPATTERN`` match replaced by a single space.
        """
        # One pass over the table fills both lists.
        testList = []
        gtList = []
        for match, sub, comment, languageId in UTF8MAP:
            testList.append(match)
            gtList.append(sub)

        # strip() alone trims both ends; the former rstrip().strip() chain
        # did the right-hand side twice.
        strGt = u" ".join(gtList).strip()
        strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

        f = LMPreparationFormula()
        f.setText(u" ".join(testList))
        f._normalizeUtf8()
        strResult = f.getText()

        self.assertEqual(strGt.encode('utf-8'), strResult.encode('utf-8'))
Exemplo n.º 17
0
    def testEnglish(self):
        """Run ``prepareText`` on English samples (language id 3).

        With number expansion enabled a digit and an ordinal are spelled
        out. With expansion disabled ``18-year-old`` is returned as is.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(True)
        formula.setLanguageId(3)

        def check(samples):
            # Both sides are compared in their UTF-8 encoded form.
            for source, expected in samples:
                formula.setText(source)
                prepared = formula.prepareText()
                self.assertEqual(expected.encode('utf-8'),
                                 prepared.encode('utf-8'))

        # Number expansion on
        check([
            (r"object 5", r"object five"),
            (r"1st", r"first"),
        ])

        # Number expansion off
        formula.setExpandNumberInWords(False)
        check([(r"18-year-old", r"18-year-old")])
Exemplo n.º 18
0
    def testAll(self):
        """Run the whole ``prepareText`` pipeline with language id 1.

        Each case is ``(input, expected, keepNewWords)``. Number expansion
        is switched on exactly for the cases whose flag is False.
        """
        cases = [
            ("A dix heures", "à dix heures", False),
            ("1. Election", "premièrement election", False),
            ("R1", "r. un", False),
            (r"A1", r"a. un", False),
            (r"P3B", r"p. trois b.", False),
            (r"P5B4", r"p. cinq b. quatre", False),
            (r"PPB5", r"p. p. b. cinq", False),
            (r"rte", r"route", False),
            (r"Constantin, p. l. r., président de",
             r"constantin p. l. r. président de", False),
            (r"/ HES-SO und AdG/LA - auch im Winter / Sommer -",
             r"hes-so und adg/la auch im winter sommer", True),
        ]

        formula = LMPreparationFormula()
        formula.setLanguageId(1)

        for source, expected, keepNewWords in cases:
            formula.setText(source)
            # Expanding numbers is the opposite of keeping new words
            formula.setExpandNumberInWords(not keepNewWords)
            prepared = formula.prepareText()
            self.assertEqual(expected, prepared)
Exemplo n.º 19
0
    def testAll(self):
        """Run the whole ``prepareText`` pipeline with language id 1.

        Each case is ``(input, expected, keepNewWords)``; the flag is handed
        to ``setKeepNewWords`` before the text is prepared.
        """
        # u"" instead of ur"": none of these literals contains a backslash,
        # so the text is identical, and ur"" does not parse on Python 3.
        testList = [(u"A dix heures", u"à dix heures", False),
                    (u"1. Election", u"premièrement election", False),
                    (u"R1", u"r. un", False), (u"A1", u"a. un", False),
                    (u"P3B", u"p. trois b.", False),
                    (u"P5B4", u"p. cinq b. quatre", False),
                    (u"PPB5", u"p. p. b.  cinq", False),
                    (u"rte", u"route", False),
                    (u"Constantin, p. l. r., président de",
                     u"constantin p. l. r. président de", False),
                    (u"/ HES-SO und AdG/LA - auch im Winter / Sommer -",
                     u"hes-so und adg/la auch im winter sommer", True)]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt, knw in testList:
            f.setText(t)
            f.setKeepNewWords(knw)
            r = f.prepareText()
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))
Exemplo n.º 20
0
    def testExpandNumberInWords(self):
        """Check ``_expandNumberInWords`` through ``self.verifyEqual``.

        The cases are letter/digit mixes; the expected strings show digits
        separated from the letters around them.
        """
        # u"" instead of ur"": none of these literals contains a backslash,
        # so the text is identical, and ur"" does not parse on Python 3.
        testList = [(u"A1", u"A. 1"), (u"P3B", u"P. 3 B."),
                    (u"P5B4", u"P. 5 B. 4"), (u"PPB5", u"PPB 5")]

        f = LMPreparationFormula()
        self.verifyEqual(testList, f, f._expandNumberInWords)
Exemplo n.º 21
0
    def testExpandAcronyms(self):
        """Check ``_expandAcronyms`` through ``self.verifyEqual``.

        Upper-case acronyms followed by punctuation are expected to become
        lower-case letters, each followed by a dot.
        """
        formula = LMPreparationFormula()
        acronymCases = [
            (u"PDCB.", u"p. d. c. b."),
            (u"PDC:", u"p. d. c."),
        ]
        self.verifyEqual(acronymCases, formula, formula._expandAcronyms)