Exemplos de LMPreparationFormula.LMPreparationFormula em Python, exemplos de asrt.common.formula.FormulaLMPreparation.LMPreparationFormula.LMPreparationFormula em Python

Exemplo n.º 1

0

Exibir arquivo

    def testGerman(self):
        """Check prepareText() on German samples (language id 2).

        Accented words are expected to come back unchanged. The numeric
        compound is expected to be spelled out while number expansion is
        enabled, and to be returned verbatim once it is disabled.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(True)
        formula.setLanguageId(2)

        # Number expansion on: no new words are kept
        expandedCases = [(r"emmaüs", r"emmaüs"),
                         ("mein àrbeit", "mein àrbeit"),
                         (r"môchten", r"môchten"),
                         (r"mädchen", r"mädchen"),
                         (r"meîn", r"meîn"),
                         (r"meïn", r"meïn"),
                         (r"18-jähriger", r"achtzehn jähriger")]
        for source, expected in expandedCases:
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

        # Number expansion off: new words are kept
        formula.setExpandNumberInWords(False)
        verbatimCases = [(r"18-jähriger", r"18-jähriger")]
        for source, expected in verbatimCases:
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

Exemplo n.º 2

0

Exibir arquivo

    def testNormalizeUtf8(self):
        """Check _normalizeUtf8() against the UTF8MAP table, language by language.

        For each language id, the `match` entries of UTF8MAP registered for
        that id are joined into the input text and the corresponding `sub`
        entries into the expected text. The expected text is stripped and
        has SPACEPATTERN matches collapsed to a single space before the
        comparison.
        """
        languages = ['0', '1', '2']

        # Compare the ids as integers. The previous test
        # `lang == int(languageId)` compared a str with an int, which is
        # never equal, so every list stayed empty and the assertions below
        # only ever compared two empty strings.
        testList = {lang: [] for lang in languages}
        gtList = {lang: [] for lang in languages}
        for match, sub, comment, languageId in UTF8MAP:
            for lang in languages:
                if int(lang) == int(languageId):
                    testList[lang].append(match)
                    gtList[lang].append(sub)

        for lang in languages:
            strGt = u" ".join(gtList[lang]).strip()
            strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

            # NOTE(review): as in the original, the formula keeps its
            # default language id here; confirm whether
            # setLanguageId(int(lang)) is required for per-language maps.
            f = LMPreparationFormula()
            f.setText(u" ".join(testList[lang]))
            f._normalizeUtf8()
            strResult = f.getText()

            # assertEqual: assertEquals is a deprecated alias
            self.assertEqual(strGt.encode('utf-8'), strResult.encode('utf-8'))

Exemplo n.º 3

0

Exibir arquivo

    def testFrench(self):
        """Check prepareText() on French samples (language id 1).

        With number expansion enabled, accented words are expected
        unchanged while ordinals and digits are expected in words. With
        expansion disabled the hyphenated token is expected verbatim.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(True)
        formula.setLanguageId(1)

        # Number expansion on: no new words are kept, hyphens are removed
        expandedCases = [(r"à plus tard", r"à plus tard"),
                         (r"maîtres", r"maîtres"),
                         (r"maïs", r"maïs"),
                         (r"emmaüs", r"emmaüs"),
                         (r"mäman", r"mäman"),
                         (r"1er", r"premier"),
                         (r"20ème", r"vingtième"),
                         (r"18-age", r"dix huit age")]
        for source, expected in expandedCases:
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

        # Number expansion off: keeping new words implies keeping hyphens
        formula.setExpandNumberInWords(False)
        verbatimCases = [(r"18-age", r"18-age")]
        for source, expected in verbatimCases:
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

Exemplo n.º 4

0

Exibir arquivo

Arquivo: FormulaLMPreparationUnitTest.py Projeto: hdubey/asrt

 def testExpandAbbreviations(self):
     """Check _expandAbbreviations() against every ABBREVIATIONS entry.

     ABBREVIATIONS is iterated as a mapping from language id to a mapping
     of abbreviation -> expected expansion.
     """
     f = LMPreparationFormula()
     for languageId, expansions in ABBREVIATIONS.items():
         f.setLanguageId(languageId)
         for abbr, gt in expansions.items():
             # The text attribute is assigned directly (not via setText),
             # exactly as the original test did.
             f.strText = abbr
             f._expandAbbreviations()
             # assertEqual: assertEquals is a deprecated alias
             self.assertEqual(gt.encode('utf-8'), f.strText.encode('utf-8'))

Exemplo n.º 5

0

Exibir arquivo

    def testNormalizeCharacters(self):
        """Check _normalizeUtf8() followed by _normalizePunctuation().

        The input mixes a fullwidth semicolon (U+FF1B), a comma, a percent
        sign and the ligature 'œ'. The expected text keeps only the
        letters, '%' and 'oe'.
        """
        # Plain u"" literals replace ur"": the `ur` prefix is a syntax
        # error on Python 3, and \uff1b denotes the same character in both
        # spellings (Python 2 still resolves \uXXXX inside ur"" literals).
        strTest = u"a b c \uff1b , % œ"
        strGt = u"a b c % oe"

        f = LMPreparationFormula()
        f.setText(strTest)
        f._normalizeUtf8()
        f._normalizePunctuation(self.allPunctList)
        # assertEqual: assertEquals is a deprecated alias
        self.assertEqual(strGt, f.getText())

Exemplo n.º 6

0

Exibir arquivo

    def testFilterNoiseWords(self):
        """Check that _filterNoiseWords() returns the text without the
        punctuation-only tokens of the sample sentence."""
        strTest = u"!-?- hello how !!!! are you *-+$"
        strGt = u"hello how are you"

        f = LMPreparationFormula()
        f.setText(strTest)
        # The filtered text is taken from the return value; use a separate
        # name instead of overwriting the input.
        strResult = f._filterNoiseWords()

        # assertEqual: assertEquals is a deprecated alias
        self.assertEqual(strGt, strResult)

Exemplo n.º 7

0

Exibir arquivo

    def testNormalizePunctuationKeepInWords(self):
        """Check _normalizePunctuation() with setKeepNewWords(True).

        Hyphens and slashes inside tokens ("HES-SO", "AdG/LA") are expected
        to survive, while the free-standing ones are expected to go.
        """
        f = LMPreparationFormula()
        f.setKeepNewWords(True)

        # A unicode literal replaces u"".join(<str>), which only rebuilt
        # the same text character by character.
        f.setText(u"/ HES-SO und AdG/LA - auch im Winter / Sommer -")
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        gt = "HES-SO und AdG/LA auch im Winter Sommer"
        # assertEqual: assertEquals is a deprecated alias
        self.assertEqual(gt, strResult)

Exemplo n.º 8

0

Exibir arquivo

    def testFrench(self):
        """Check that prepareText() leaves accented French samples
        unchanged when the language id is 1."""
        # u"" replaces ur"": none of the literals contains a backslash, so
        # the values are identical, and `ur` is a syntax error on Python 3.
        testList = [(u"à plus tard", u"à plus tard"),
                    (u"maîtres", u"maîtres"),
                    (u"maïs", u"maïs"),
                    (u"emmaüs", u"emmaüs"),
                    (u"mäman", u"mäman")]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            # assertEqual: assertEquals is a deprecated alias
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemplo n.º 9

0

Exibir arquivo

    def testGerman(self):
        """Check that prepareText() leaves accented German samples
        unchanged when the language id is 2."""
        # u"" replaces ur"": none of the literals contains a backslash, so
        # the values are identical, and `ur` is a syntax error on Python 3.
        testList = [(u"emmaüs", u"emmaüs"),
                    (u"mein àrbeit", u"mein àrbeit"),
                    (u"môchten", u"môchten"),
                    (u"mädchen", u"mädchen"),
                    (u"meîn", u"meîn"),
                    (u"meïn", u"meïn")]

        f = LMPreparationFormula()
        f.setLanguageId(2)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            # assertEqual: assertEquals is a deprecated alias
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemplo n.º 10

0

Exibir arquivo

    def testContractionPrefixes(self):
        """Check that prepareText() re-attaches a detached "' s" suffix.

        Each case carries its own language id; number expansion is
        disabled for the whole test.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(False)

        # (input, expected output, language id)
        contractionCases = [
            (r"President' s", r"president's", 3),
            (r"President' s of", r"president's of", 3),
        ]
        for source, expected, languageId in contractionCases:
            formula.setLanguageId(languageId)
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

Exemplo n.º 11

0

Exibir arquivo

    def testNormalizePunctuation(self):
        """Check _normalizePunctuation() on every ASCII punctuation
        character plus the per-mille sign.

        First pass (default language id): only a handful of symbols are
        expected to remain. Second pass (language id 1), applied to the
        text left by the first pass: the remaining symbols are expected to
        be spelled out, except the apostrophe.
        """
        f = LMPreparationFormula()
        # Plain concatenation replaces u"".join(...), which was a no-op
        # over an already complete string.
        f.setText(string.punctuation + u"‰")
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        gt = u"$%&'@\u2030"
        # assertEqual: assertEquals is a deprecated alias
        self.assertEqual(gt, strResult)

        # The formula still holds the text from the first pass
        f.setLanguageId(1)
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        gt = "dollars pourcent et ' at pour mille"
        self.assertEqual(gt, strResult)

Exemplo n.º 12

0

Exibir arquivo

Arquivo: FormulaLMPreparationUnitTest.py Projeto: hdubey/asrt

    def testAll(self):
        """Run prepareText() end to end on mixed samples with language id 1.

        The cases cover a leading capital 'A', an enumeration marker,
        letter/digit codes, an abbreviation and dotted initials.
        """
        # u"" replaces ur"": none of the literals contains a backslash, so
        # the values are identical, and `ur` is a syntax error on Python 3.
        # NOTE(review): the expected value for "PPB5" contains two
        # consecutive spaces, kept exactly as in the original; confirm it
        # is intended.
        testList = [(u"A dix heures", u"à dix heures"),
                    (u"1. Election", u"premièrement election"),
                    (u"R1", u"r. un"),
                    (u"A1", u"a. un"),
                    (u"P3B", u"p. trois b."),
                    (u"P5B4", u"p. cinq b. quatre"),
                    (u"PPB5", u"p. p. b.  cinq"),
                    (u"rte", u"route"),
                    (u"Constantin, p. l. r., président de",
                     u"constantin p. l. r. président de")]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            # assertEqual: assertEquals is a deprecated alias
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemplo n.º 13

0

Exibir arquivo

    def testNormalizePunctuation(self):
        """Check _normalizePunctuation() with number expansion disabled.

        The input is every ASCII punctuation character plus the per-mille
        sign. The second pass (language id 1) runs on the text left by the
        first pass.
        """
        allSymbols = string.punctuation + "‰"

        formula = LMPreparationFormula()
        formula.setText(allSymbols)
        formula.setExpandNumberInWords(False)

        # First pass, default language id
        formula._normalizePunctuation(self.allPunctList)
        self.assertEqual("$%&'-/@‰", formula.getText())

        # Second pass, language id 1, on the already normalized text
        formula.setLanguageId(1)
        formula._normalizePunctuation(self.allPunctList)
        self.assertEqual("dollars pourcent et '-/ at pour mille",
                         formula.getText())

Exemplo n.º 14

0

Exibir arquivo

    def __init__(self, document, sentenceText):
        """Create a cluster that holds one sentence of a document.

        document     -- stored as-is on the instance
        sentenceText -- added to the cluster as its element
        """
        # Bump the class-wide counter; its new value, as a string, is the
        # key handed to the base class
        TextCluster.ID_COUNTER += 1
        clusterKey = str(TextCluster.ID_COUNTER)

        # The language attribute starts at 0, i.e. unknown language
        initialAttributes = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]
        Cluster.__init__(self, clusterKey, initialAttributes)

        self.document = document

        # Actual data of the cluster
        self.addElement(sentenceText)

        # Formula used for LM normalization
        self.lmPreparationFormula = LMPreparationFormula()

Exemplo n.º 15

0

Exibir arquivo

    def testExpandNumberInWords(self):
        """Check _expandNumberInWords() through the verifyEqual helper.

        The first batch runs on a freshly built formula. The remaining
        batches run with setExpandNumberInWords(False), one batch per
        language id.
        """
        formula = LMPreparationFormula()

        defaultCases = [(r"A1", r"A. 1"),
                        (r"P3B", r"P. 3 B."),
                        (r"P5B4", r"P. 5 B. 4"),
                        (r"PPB5", r"PPB 5"),
                        (r"10jährige", r"10 jährige")]
        self.verifyEqual(defaultCases, formula, formula._expandNumberInWords)

        formula.setExpandNumberInWords(False)

        # (language id, cases), checked in this exact order
        perLanguageCases = [
            (1, [(r"1er", r"1er")]),
            (3, [(r"1st", r"1st")]),
            (2, [(r"18-jähriger", r"18 -jähriger")]),
        ]
        for languageId, cases in perLanguageCases:
            formula.setLanguageId(languageId)
            self.verifyEqual(cases, formula, formula._expandNumberInWords)

Exemplo n.º 16

0

Exibir arquivo

Arquivo: FormulaLMPreparationUnitTest.py Projeto: hdubey/asrt

    def testNormalizeUtf8(self):
        """Check _normalizeUtf8() against the whole UTF8MAP table.

        All `match` entries are joined into the input text and all `sub`
        entries into the expected text. The expected text is stripped and
        has SPACEPATTERN matches collapsed to a single space before the
        comparison.
        """
        # One pass over the table instead of two identical append loops
        testList = []
        gtList = []
        for match, sub, comment, languageId in UTF8MAP:
            testList.append(match)
            gtList.append(sub)

        # strip() alone covers the former rstrip().strip() chain
        strGt = u" ".join(gtList).strip()
        strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

        f = LMPreparationFormula()
        f.setText(u" ".join(testList))
        f._normalizeUtf8()
        strResult = f.getText()

        # assertEqual: assertEquals is a deprecated alias
        self.assertEqual(strGt.encode('utf-8'), strResult.encode('utf-8'))

Exemplo n.º 17

0

Exibir arquivo

    def testEnglish(self):
        """Check prepareText() on English samples (language id 3).

        With number expansion enabled, a digit and an ordinal are expected
        in words. With expansion disabled the hyphenated token is expected
        verbatim.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(True)
        formula.setLanguageId(3)

        # Number expansion on
        expandedCases = [(r"object 5", r"object five"),
                         (r"1st", r"first")]
        for source, expected in expandedCases:
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

        # Number expansion off
        formula.setExpandNumberInWords(False)
        verbatimCases = [(r"18-year-old", r"18-year-old")]
        for source, expected in verbatimCases:
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

Exemplo n.º 18

0

Exibir arquivo

    def testAll(self):
        """Run prepareText() end to end on mixed samples with language id 1.

        Each case is (input, expected output, keep-new-words flag); number
        expansion is set to the negation of that flag before preparing the
        text.
        """
        formula = LMPreparationFormula()
        formula.setLanguageId(1)

        endToEndCases = [
            ("A dix heures", "à dix heures", False),
            ("1. Election", "premièrement election", False),
            ("R1", "r. un", False),
            (r"A1", r"a. un", False),
            (r"P3B", r"p. trois b.", False),
            (r"P5B4", r"p. cinq b. quatre", False),
            (r"PPB5", r"p. p. b. cinq", False),
            (r"rte", r"route", False),
            (r"Constantin, p. l. r., président de",
             r"constantin p. l. r. président de", False),
            (r"/ HES-SO und AdG/LA - auch im Winter / Sommer -",
             r"hes-so und adg/la auch im winter sommer", True),
        ]

        for source, expected, keepNewWords in endToEndCases:
            formula.setText(source)
            # Keeping new words means numbers are not expanded
            formula.setExpandNumberInWords(not keepNewWords)
            prepared = formula.prepareText()
            self.assertEqual(expected, prepared)

Exemplo n.º 19

0

Exibir arquivo

    def testAll(self):
        """Run prepareText() end to end on mixed samples with language id 1.

        Each case is (input, expected output, keep-new-words flag); the
        flag is passed to setKeepNewWords() before preparing the text.
        """
        # u"" replaces ur"": none of the literals contains a backslash, so
        # the values are identical, and `ur` is a syntax error on Python 3.
        # NOTE(review): the expected value for "PPB5" contains two
        # consecutive spaces, kept exactly as in the original; confirm it
        # is intended.
        testList = [(u"A dix heures", u"à dix heures", False),
                    (u"1. Election", u"premièrement election", False),
                    (u"R1", u"r. un", False),
                    (u"A1", u"a. un", False),
                    (u"P3B", u"p. trois b.", False),
                    (u"P5B4", u"p. cinq b. quatre", False),
                    (u"PPB5", u"p. p. b.  cinq", False),
                    (u"rte", u"route", False),
                    (u"Constantin, p. l. r., président de",
                     u"constantin p. l. r. président de", False),
                    (u"/ HES-SO und AdG/LA - auch im Winter / Sommer -",
                     u"hes-so und adg/la auch im winter sommer", True)]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt, knw in testList:
            f.setText(t)
            f.setKeepNewWords(knw)
            r = f.prepareText()
            # assertEqual: assertEquals is a deprecated alias
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemplo n.º 20

0

Exibir arquivo

    def testExpandNumberInWords(self):
        """Check _expandNumberInWords() on letter/digit codes through the
        verifyEqual helper, using a freshly built formula."""
        # u"" replaces ur"": none of the literals contains a backslash, so
        # the values are identical, and `ur` is a syntax error on Python 3.
        testList = [(u"A1", u"A. 1"), (u"P3B", u"P. 3 B."),
                    (u"P5B4", u"P. 5 B. 4"), (u"PPB5", u"PPB 5")]

        f = LMPreparationFormula()
        self.verifyEqual(testList, f, f._expandNumberInWords)

Exemplo n.º 21

0

Exibir arquivo

    def testExpandAcronyms(self):
        """Check _expandAcronyms() on upper-case acronyms followed by
        punctuation, through the verifyEqual helper."""
        formula = LMPreparationFormula()

        # (input, expected output)
        acronymCases = [
            (u"PDCB.", u"p. d. c. b."),
            (u"PDC:", u"p. d. c."),
        ]
        self.verifyEqual(acronymCases, formula, formula._expandAcronyms)