示例#1
0
 def setUp(self):
     """Create the French dictionary-based tokenizer used by each test."""
     # The vocabulary is the "fra.vocab" file of the "vocab" resource folder.
     vocab_path = os.path.join(RESOURCES_PATH, "vocab", "fra.vocab")
     self.tok = DictTok(WordsList(vocab_path), "fra")
示例#2
0
# Automatic Tokenization is here:
# ----------------------------------------------------------------------------

base = os.path.basename( args.vocab )
lang = base[:3]

if args.i:
    p = sppasTok( args.vocab,lang )
    if args.std:
        p.set_std(True)
    p.run( args.i, args.o )

else:

    vocab = WordsList( args.vocab )
    tokenizer = DictTok( vocab,lang )

    try:
        repl = DictRepl(os.path.join(RESOURCES_PATH, "repl", lang + ".repl"), nodump=True)
        tokenizer.set_repl( repl )
    except Exception as e:
        print "[warning] No replacement dictionary: ",str(e)
    try:
        punct = WordsList(os.path.join(RESOURCES_PATH, "vocab", "Punctuations.txt"), nodump=True )
        tokenizer.set_punct( punct )
    except Exception as e:
        print "[warning] No punctuation dictionary: ",str(e)

    for line in sys.stdin:
        print tokenizer.tokenize( line ).encode('utf8')
示例#3
0
class TestDictTok(unittest.TestCase):
    """Unit tests of DictTok, run against the resource files of the project.

    Every test starts from a tokenizer built on the French vocabulary
    ("fra.vocab"); tests that need a replacement dictionary load it
    explicitly.
    """

    def setUp(self):
        """Create a fresh French tokenizer before each test."""
        dictdir  = os.path.join(RESOURCES_PATH, "vocab")
        vocabfile = os.path.join(dictdir, "fra.vocab")
        wds = WordsList(vocabfile)
        self.tok = DictTok(wds, "fra")


    def _set_repl(self, lang):
        """Load the "<lang>.repl" replacement dictionary into the tokenizer.

        @param lang: (str) name of the file, without extension, in the
        "repl" resource folder (e.g. "fra").
        """
        repl = DictRepl( os.path.join(RESOURCES_PATH, "repl", lang + ".repl"), nodump=True )
        self.tok.set_repl( repl )


    def test_num2letter(self):
        """Numbers written with digits are expected to be spelled out."""
        self._set_repl("fra")
        self.tok.set_lang("fra")

        s =  self.tok.tokenize(u"123")
        self.assertEqual(s, u"cent-vingt-trois")

        s =  self.tok.tokenize(u"1,24")
        self.assertEqual(s, u"un virgule vingt-quatre")


    def test_stick(self):
        """A known multi-word expression is expected to be joined with '_'."""
        s =  self.tok.stick([u"123"])
        self.assertEqual(s, [u"123"])
        s =  self.tok.stick([u"au fur et à mesure"])
        self.assertEqual(s, [u"au_fur_et_à_mesure"])


    def test_replace(self):
        """Symbols are expected to be replaced according to the loaded dictionary."""
        self._set_repl("fra")
        s = self.tok.replace([u"un", u"taux", u"de", u"croissance", u"de", u"0,5", u"%"])
        self.assertEqual(s, [u"un", u"taux", u"de", u"croissance", u"de", u"0", u"virgule", u"5", u"pourcents"])

        # The same symbols are then replaced with each language dictionary
        # in turn; only the replacement dictionary changes between checks.
        text = [u"² % °c  km/h  etc   €  ¥ $ "]

        self._set_repl("eng")
        s = self.tok.replace(text)
        self.assertEqual(" ".join(s), u"square percent degrees_Celsius km/h etc euros yens dollars")

        self._set_repl("spa")
        s = self.tok.replace(text)
        self.assertEqual(" ".join(s), u"quadrados por_ciento grados_Celsius km/h etc euros yens dollars")

        self._set_repl("fra")
        s = self.tok.replace(text)
        self.assertEqual(" ".join(s), u"carrés pourcents degrés_celcius kilomètres_heure etcetera euros yens dollars")

        self._set_repl("ita")
        s = self.tok.replace(text)
        self.assertEqual(" ".join(s), u"quadrato percento gradi_Celsius km/h etc euros yens dollars")

        self._set_repl("cmn")
        s = self.tok.replace(text)
        self.assertEqual(" ".join(s), u"的平方 个百分比 摄氏度 公里每小时 etc € ¥ $")


    def test_clean_toe(self):
        """Check the clean-up of annotated transcription text by clean_toe."""
        # Stand-alone parenthesized words are expected to be removed...
        s = self.tok.clean_toe(u'(il) (ne) faut pas rêver')
        self.assertEqual(s, u"faut pas rêver")

        # ... while parentheses attached to a word are expected to be kept.
        s = self.tok.clean_toe(u'i(l) (ne) faut pas réver')
        self.assertEqual(s, u"i(l) faut pas réver")

        # Same input and expectation as the previous check.
        s = self.tok.clean_toe(u'i(l) (ne) faut pas réver')
        self.assertEqual(s, u"i(l) faut pas réver")

        s = self.tok.clean_toe(u' (il) faut pas réver i(l)')
        self.assertEqual(s, u"faut pas réver i(l)")

        # Bracketed "[a, b]" pairs: spaces are expected to be normalized
        # inside the brackets.
        s = self.tok.clean_toe(u' euh [je sais, ché] pas ')
        self.assertEqual(s, u"euh [je_sais,ché] pas")

        s = self.tok.clean_toe(u"  j'[ avais,  avé ] ")
        self.assertEqual(s, u"j' [avais,avé]")

        s = self.tok.clean_toe(u"  [j(e) sais,  ché ] ")
        self.assertEqual(s, u"[je_sais,ché]")

        s = self.tok.clean_toe(u"  [peut-êt(re),  pe êt] ")
        self.assertEqual(s, u"[peut-être,peêt]")

        s = self.tok.clean_toe(u" (pu)tai(n) j'ai")
        self.assertEqual(s, u"(pu)tai(n) j'ai")

        # A "gpd_<digits>" prefix glued to the text is expected to be
        # separated from it.
        s = self.tok.clean_toe(u"gpd_100y en a un  qu(i) est devenu complèt(e)ment  ")
        self.assertEqual(s, u"gpd_100 y en a un qu(i) est devenu complèt(e)ment")

        # "$name, X/$" markup: only the name is expected to remain.
        s = self.tok.clean_toe(u"[$Londre, T/$, Londreu]")
        self.assertEqual(s, u"[Londre,Londreu]")

        s = self.tok.clean_toe(u"t(u) vois [$Isabelle,P /$, isabelleu] $Armelle,P /$ t(out) ça")
        self.assertEqual(s, u"t(u) vois [Isabelle,isabelleu] Armelle t(out) ça")

        s = self.tok.clean_toe(u"gpd_1324ah euh")
        self.assertEqual(s, u"gpd_1324 ah euh")


    def test_sampa(self):
        """Check which side of a "[text,/sampa/]" pair is kept, depending on std."""
        self._set_repl("fra")

        s = self.tok.tokenize(u"[le mot,/lemot/]", std=False)
        self.assertEqual(u"/lemot/", s)
        s = self.tok.tokenize(u"[le mot,/lemot/]", std=True)
        self.assertEqual(u"le_mot", s)


    def test_tokenize(self):
        """Check apostrophe and hyphen splitting on a French utterance."""
        self.tok.set_lang("fra")
        splitfra = self.tok.tokenize(u"l'assiette l'abat-jour et le paris-brest et Paris-Marseille")
        self.assertEqual(splitfra, u"l' assiette l' abat-jour et le paris-brest et paris - marseille")


    def test_code_switching(self):
        """Tokenize mixed French/Chinese text with a merged vocabulary."""
        dictdir  = os.path.join(RESOURCES_PATH, "vocab")
        vocabfra = os.path.join(dictdir, "fra.vocab")
        vocabcmn = os.path.join(dictdir, "cmn.vocab")

        # The Chinese entries are added to the French word list.
        wds = WordsList(vocabfra)
        wds.load_from_ascii( vocabcmn )
        # NOTE(review): this size is tied to the current content of both
        # vocabulary files and has to be updated whenever they change.
        self.assertEqual( wds.get_size(), 434333)

        self.tok.set_vocab( wds )
        splitswitch = self.tok.tokenize(u'et il m\'a dit : "《干脆就把那部蒙人的闲法给废了拉倒!》RT @laoshipukong : 27日"')
        self.assertEqual(splitswitch, u"et il m' a dit 干脆 就 把 那 部 蒙 人 的 闲 法 给 废 了 拉倒 rt @ laoshipukong 二十七 日")