Exemplo n.º 1
0
    def test_easy_first(self):
        """Each of these short words must segment into exactly one stem triple."""
        # (word, expected prefix, expected stem); the expected suffix is
        # always the empty Suffix.
        expectations = (
            (u"لي", u"ل", u"ي"),
            (u"لكم", u"ل", u"كم"),
            (u"لكما", u"ل", u"كما"),
        )
        for text, prefix, stem in expectations:
            self.assertEqual(
                Word(text).stems,
                [(Prefix(prefix), stem, Suffix(u""))]
            )
Exemplo n.º 2
0
    def test_prefix_DT(self):
        """Exercise prefix_DT_no_suffix with an empty stem.

        An empty prefix, a DT prefix and a CC+DT prefix, each paired with an
        empty suffix, are expected to pass; a CC+DT prefix paired with a
        suffix tagged PRP|OBJP is expected to fail.
        """
        def verdict(prefix, suffix):
            # Only the affixes vary between cases; the stem is always empty.
            return prefix_DT_no_suffix(prefix, u'', suffix)

        def no_suffix():
            # Fresh object per case, as in a hand-written call.
            return Suffix(u'', classe=u'', desc=u'')

        self.assertTrue(
            verdict(Prefix(u'', classe=u'', desc=u''), no_suffix()))
        self.assertTrue(
            verdict(Prefix(u'وال', classe=u'pN2', desc=u'CC+DT'), no_suffix()))
        self.assertTrue(
            verdict(Prefix(u'ال', classe=u'pN1', desc=u'DT'), no_suffix()))
        self.assertFalse(
            verdict(Prefix(u'وال', classe=u'pN2', desc=u'CC+DT'),
                    Suffix("ي", desc="PRP|OBJP", classe="sC2")))
Exemplo n.º 3
0
    def test_valid_len(self):
        """Exercise has_valid_len on stems of different lengths.

        The empty stem and a one-letter stem are expected to be invalid; the
        six-letter stem and the four-letter stem (the latter with a CC prefix)
        are expected to be valid.
        """
        def blank_prefix():
            return Prefix(u'', classe=u'', desc=u'')

        def blank_suffix():
            return Suffix(u'', classe=u'', desc=u'')

        self.assertFalse(
            has_valid_len(blank_prefix(), u'', blank_suffix()))
        self.assertFalse(
            has_valid_len(blank_prefix(), u'ا', blank_suffix()))
        self.assertTrue(
            has_valid_len(blank_prefix(), u'بحباني', blank_suffix()))
        self.assertTrue(
            has_valid_len(Prefix(u'و', classe=u'pC2', desc=u'CC'),
                          u'يعرب', blank_suffix()))
Exemplo n.º 4
0
    def test_fixer(self):
        """Check fix_stem on (prefix, stem, suffix) triples.

        Covers: triples that come back unchanged, a stem whose trailing
        hamza-on-yeh is expected to become a bare hamza, a double-lam prefix
        expected to be expanded, and both rewrites applied together.
        """
        # Empty triple: expected back unchanged.
        self.assertEqual(
            fix_stem((Prefix(u''), u'', Suffix(u''))),
            (Prefix(u''), u'', Suffix(u'')))

        # Nothing to repair in either prefix or stem.
        self.assertEqual(
            fix_stem((Prefix(u'ال', classe=u'pN1'), u'تلفزيون',
                      Suffix(u'', classe=u'pC1'))),
            (Prefix(u'ال', classe=u'pN1'), u'تلفزيون',
             Suffix(u'', classe=u'pC1')))

        # Only the stem's final letter is expected to change.
        self.assertEqual(
            fix_stem((Prefix(u'', classe=u'pC1'), u'لاجئ',
                      Suffix(u'نا', classe=u'sC3'))),
            (Prefix(u'', classe=u'pC1'), u'لاجء',
             Suffix(u'نا', classe=u'sC3')))

        # Prefix and stem are both expected to change.
        self.assertEqual(
            fix_stem((Prefix(u'لل', classe=u'pC1'), u'لاجئ',
                      Suffix(u'نا', classe=u'Sc3'))),
            (Prefix(u'ل ال', classe=u'pC1'), u'لاجء',
             Suffix(u'نا', classe=u'Sc3')))

        # Only the prefix is expected to change.
        self.assertEqual(
            fix_stem((Prefix(u'لل', classe=u'pN13'), u'تلفزيون',
                      Suffix(u'', classe=u'pC1'))),
            (Prefix(u'ل ال', classe=u'pN13'), u'تلفزيون',
             Suffix(u'', classe=u'pC1')))
Exemplo n.º 5
0
    def test_segment_words(self):
        """Segment words into their candidate prefix/stem/suffix splits.

        An empty word is expected to keep its string and have no stems;
        iterating a non-empty word is expected to yield its candidate
        triples in the listed order.
        """
        blank = Word(u"")
        self.assertEqual(blank.string, u"")
        self.assertEqual(blank.stems, [])

        segmented = Word(u"الله")
        expected_cases = [
            (Prefix(u'', classe=u'pC1'), u'الله', Suffix(u'', classe=u'sC1')),
            (Prefix(u'', classe=u'pC1'), u'الل', Suffix(u'ه', classe=u'sC10')),
            (Prefix(u'ال', classe=u'pN1'), u'له', Suffix(u'', classe=u'sC1')),
        ]
        # The Word object is iterated directly rather than read via .stems.
        self.assertEqual(list(segmented), expected_cases)
Exemplo n.º 6
0
    def test_tag(self):
        """Check tag() output on a Latin word and on a mixed Arabic sentence.

        Plain tokens (number, punctuation, emoticon, Latin word) are expected
        as [token, label] pairs; each Arabic word is expected as a list of
        candidate (prefix, stem, suffix) triples.
        """
        single = tag(u'install')
        self.assertEqual([[u'install', u'NN']], single)

        tokens = tag(u'1- بسمك اللهم، :)')

        self.assertEqual(tokens[0], [u'1', u'CD'])
        self.assertEqual(tokens[1], [u'-', u'PUNC'])

        # First Arabic word: with and without a leading preposition prefix,
        # each with and without the trailing suffix.
        self.assertEqual(tokens[2], [
            (Prefix(u'', classe=u'pC1'),
             u'بسمك',
             Suffix(u'', classe=u'sC1')),
            (Prefix(u'', classe=u'pC1'),
             u'بسم',
             Suffix(u'ك', classe=u'sC4', desc="PRP|OBJP")),
            (Prefix(u'ب', classe=u'pN25', desc="IN"),
             u'سمك',
             Suffix(u'', classe=u'sC1')),
            (Prefix(u'ب', classe=u'pN25', desc="IN"),
             u'سم',
             Suffix(u'ك', classe=u'sC4', desc="PRP|OBJP")),
        ])

        # Second Arabic word: whole word, suffix split, and DT-prefix split.
        self.assertEqual(tokens[3], [
            (Prefix(u'', classe=u'pC1', desc=u''),
             u'اللهم',
             Suffix(u'', classe=u'sC1', desc=u'')),
            (Prefix(u'', classe=u'pC1', desc=u''),
             u'الل',
             Suffix(u'هم', classe=u'sC13', desc=u"PRP|OBJP")),
            (Prefix(u'ال', classe=u'pN1', desc=u"DT"),
             u'لهم',
             Suffix(u'', classe=u'sC1', desc=u'')),
        ])

        self.assertEqual(tokens[4], [u'،', u'PUNC'])
        self.assertEqual(tokens[5], [u':)', u'EMO'])
Exemplo n.º 7
0
    def test_hamza_fix(self):
        """Check hamza_fix on (prefix, stem, suffix) triples.

        A stem ending in hamza-on-yeh is expected to come back ending in a
        bare hamza; the empty triple and a stem without that ending are
        expected back unchanged.
        """
        # Empty triple: nothing to rewrite.
        self.assertEqual(
            hamza_fix((Prefix(u''), u'', Suffix(u''))),
            (Prefix(u''), u'', Suffix(u'')))

        # Final letter of the stem is expected to change; affixes are not.
        self.assertEqual(
            hamza_fix((Prefix(u'', classe=u'pC1'), u'لاجئ',
                       Suffix(u'نا', classe=u'sC3'))),
            (Prefix(u'', classe=u'pC1'), u'لاجء',
             Suffix(u'نا', classe=u'sC3')))

        # Stem without the hamza ending: expected back unchanged.
        self.assertEqual(
            hamza_fix((Prefix(u'ال', classe=u'pN1'), u'تلفزيون',
                       Suffix(u'', classe=u'pC1'))),
            (Prefix(u'ال', classe=u'pN1'), u'تلفزيون',
             Suffix(u'', classe=u'pC1')))
Exemplo n.º 8
0
    def test_lam_lam_fix(self):
        """Check lam_lam_fix on (prefix, stem, suffix) triples.

        A double-lam prefix is expected to be expanded to lam, space,
        alef-lam; the empty triple and a plain alef-lam prefix are expected
        back unchanged.
        """
        # Empty triple: nothing to rewrite.
        self.assertEqual(
            lam_lam_fix((Prefix(u''), u'', Suffix(u''))),
            (Prefix(u''), u'', Suffix(u'')))

        # Double-lam prefix: only the prefix string is expected to change.
        self.assertEqual(
            lam_lam_fix((Prefix(u'لل', classe=u'pN13'), u'تلفزيون',
                         Suffix(u'', classe=u'pC1'))),
            (Prefix(u'ل ال', classe=u'pN13'), u'تلفزيون',
             Suffix(u'', classe=u'pC1')))

        # Plain alef-lam prefix: expected back unchanged.
        self.assertEqual(
            lam_lam_fix((Prefix(u'ال', classe=u'pN1'), u'تلفزيون',
                         Suffix(u'', classe=u'pC1'))),
            (Prefix(u'ال', classe=u'pN1'), u'تلفزيون',
             Suffix(u'', classe=u'pC1')))
Exemplo n.º 9
0
 def test_variations(self):
     """Check every stem variation produced for a single word.

     Six triples are expected: the word with and without a one-letter
     prefix (desc NG), each either unsplit or split before the two-letter
     suffix (desc PRP|OBJP). When the suffix is split off, the stem is
     expected twice: once ending in teh and once ending in teh marbuta.
     """
     # Affix objects are built per entry so each expected triple is independent.
     word = Word(u'مدرستها')
     self.assertEqual(word.stems,
                      [(Prefix(u'', classe=u'', desc=u''), u'مدرستها',
                        Suffix(u'', classe=u'', desc=u'')),
                       (Prefix(u'', classe=u'', desc=u''), u'مدرست',
                        Suffix(u'ها', classe=u'sC11', desc=u'PRP|OBJP')),
                       # desc was previously misspelled with a stray comma;
                       # it now matches the other sC11 entries.
                       (Prefix(u'', classe=u'', desc=u''), u'مدرسة',
                        Suffix(u'ها', classe=u'sC11', desc=u'PRP|OBJP')),
                       (Prefix(u'م', classe=u'pV1', desc=u'NG'), u'درستها',
                        Suffix(u'', classe=u'', desc=u'')),
                       (Prefix(u'م', classe=u'pV1', desc=u'NG'), u'درست',
                        Suffix(u'ها', classe=u'sC11', desc=u'PRP|OBJP')),
                       (Prefix(u'م', classe=u'pV1', desc=u'NG'), u'درسة',
                        Suffix(u'ها', classe=u'sC11', desc=u'PRP|OBJP'))])
Exemplo n.º 10
0
    ":-?": "teasing/playful",
    ":-b": "teasing/playful",
    ":b": "teasing/playful",
    ";)": "wink",
    u"º)": "wink",
    ";-)": "wink",
    ";]": "wink",
    u"^Ü^": "happy",
}

# Alias: special_tokens is bound to the same object as EMOTICONS (no copy is made).
special_tokens = EMOTICONS

from DAPOS.data.variation import Prefix, Suffix

EASY_WORDS = {
    u"ليا":  [(Prefix(u"ل"), u"يا",  Suffix(u""))],
    u"لي":   [(Prefix(u"ل"), u"ي",   Suffix(u""))],
    u"لكم":  [(Prefix(u"ل"), u"كم",  Suffix(u""))],
    u"لكما": [(Prefix(u"ل"), u"كما", Suffix(u""))],
    u"له":   [(Prefix(u"ل"), u"ه",   Suffix(u""))],
    u"لها":  [(Prefix(u"ل"), u"ها",  Suffix(u""))],
    u"لهم":  [(Prefix(u"ل"), u"هم",  Suffix(u""))],
    u"لهما": [(Prefix(u"ل"), u"هما", Suffix(u""))],
    u"لهن":  [(Prefix(u"ل"), u"هم",  Suffix(u""))],
    u"بيا":  [(Prefix(u"ب"), u"يا",  Suffix(u""))],
    u"بي":   [(Prefix(u"ب"), u"ي",   Suffix(u""))],
    u"بك":   [(Prefix(u"ب"), u"ك",   Suffix(u""))],
    u"بكم":  [(Prefix(u"ب"), u"كم",  Suffix(u""))],
    u"بكما": [(Prefix(u"ب"), u"كما", Suffix(u""))],
    u"به":   [(Prefix(u"ب"), u"ه",   Suffix(u""))],
    u"بها":  [(Prefix(u"ب"), u"ها",  Suffix(u""))],