def testNormalize(self): try: from icu import Normalizer2 except ImportError: return self.assertNorm( Normalizer2.getInstance(None, "nfkc_cf", UNormalizationMode2.COMPOSE), u'hi there', "Hi There") self.assertNorm(Normalizer2.getNFCInstance(), u"äßáW", u"äßa\u0301W") self.assertNorm(Normalizer2.getNFDInstance(), u"a\u0308ßa\u0301W", u"äßa\u0301W") self.assertNorm(Normalizer2.getNFKCInstance(), u"äßáW", u"äßa\u0301W") self.assertNorm(Normalizer2.getNFKDInstance(), u"a\u0308ßa\u0301W", u"äßa\u0301W") self.assertNorm(Normalizer2.getNFKCCasefoldInstance(), u"ässáw", u"äßa\u0301W")
def testNormalize(self): try: from icu import Normalizer2 except ImportError: return self.assertNorm(Normalizer2.getInstance(None, "nfkc_cf", UNormalizationMode2.COMPOSE), u'hi there', "Hi There") self.assertNorm(Normalizer2.getNFCInstance(), u"äßáW", u"äßa\u0301W") self.assertNorm(Normalizer2.getNFDInstance(), u"a\u0308ßa\u0301W", u"äßa\u0301W") self.assertNorm(Normalizer2.getNFKCInstance(), u"äßáW", u"äßa\u0301W") self.assertNorm(Normalizer2.getNFKDInstance(), u"a\u0308ßa\u0301W", u"äßa\u0301W") self.assertNorm(Normalizer2.getNFKCCasefoldInstance(), u"ässáw", u"äßa\u0301W")
from icu import Normalizer2

# Module-level normalizers shared by compDecomp: NFC and NFD instances.
composer = Normalizer2.getNFCInstance()
decomposer = Normalizer2.getNFDInstance()


def compDecomp(orig):
    """Print *orig* together with its NFC and NFD normalizations.

    The three values are written on a single line, separated by one space
    each: the original first, then the NFC result, then the NFD result.
    """
    composed, decomposed = composer.normalize(orig), decomposer.normalize(orig)
    print(f"{orig} {composed} {decomposed}")


compDecomp('lội')
def to_latin(string, locale=locale):
    """Transliterate Cyrillic text into Latin letters with custom ICU rules.

    The input is wrapped in a ``UnicodeString``, normalized to NFC and then
    run through a rule-based ``Transliterator`` built from the rules below.

    Args:
        string: Text to transliterate.
        locale: Not used by this function; kept so that existing callers
            which pass it keep working.

    Returns:
        Whatever ``Transliterator.transliterate`` returns for the normalized
        ``UnicodeString`` (presumably the transliterated ``UnicodeString``
        itself -- confirm against PyICU).
    """
    ustring = UnicodeString(string)
    nfc = Normalizer2.getNFCInstance()
    ustring = nfc.normalize(ustring)

    # The context sets mix Cyrillic and Latin letters (e.g. [жшцйjwcy])
    # because, by the time a rule fires, the preceding letter may already
    # have been replaced by its Latin counterpart.
    rules = (
        # $wb: any non-letter character, used as a word-boundary context.
        "$wb = [^[:Letter:]] ;"
        # Cyrillic е
        "$wb { е > ye ;"
        "[ыq] { е } $wb > e ;"
        "[уеёыаоэяиюьъiuoeaq] { е > ye ;"
        "е > e ;"
        # Cyrillic э
        "$wb { э > e ;"
        # The replacement must be Latin "e": rule output is not rescanned
        # within a pass and no later pass maps Cyrillic е, so a Cyrillic
        # homoglyph here would leak into the transliterated result.
        "[жшцйjwcy] { э > e ;"
        "э > qe ;"
        # Cyrillic ы
        "[жшцйjwcy] { ы > i ;"
        "ы > q ;"
        # Cyrillic ё
        "$wb { ё > yo ;"
        "[жшцйjwcy] { ё > o ;"
        "[уеёыаоэяиюьъiuoeaq] { ё > yo ;"
        "ё > ho ;"
        # Cyrillic ю
        "$wb { ю > yu ;"
        "[жшцйjwcy] { ю > u ;"
        "[уеёыаоэяиюьъiuoeaq] { ю > yu ;"
        "ю > hu ;"
        # Cyrillic я
        "$wb { я > ya ;"
        "[жшцйjwcy] { я > a ;"
        "[уеёыаоэяиюьъiuoeaq] { я > ya ;"
        "я > ha ;"
        # The letter combination ьо, found only in loanwords
        "ньо > nyo ;"
        "льо > lyo ;"
        "мьо > myo ;"
        "рьо > ryo ;"
        # Remaining letters
        "а > a ;"
        "б > b ;"
        "в > v ;"
        "г > g ;"
        "д > d ;"
        "ж > j ;"
        "з > z ;"
        "и > i ;"
        "й > y ;"
        "к > k ;"
        "л > l ;"
        "м > m ;"
        "н > n ;"
        "о > o ;"
        "п > p ;"
        "р > r ;"
        "с > s ;"
        "т > t ;"
        "у > u ;"
        "ф > f ;"
        "х > x ;"
        "ц > c ;"
        "ч > ch ;"
        "ш > w ;"
        "щ > wh ;"
        # Start a new pass from the beginning of the text
        ":: Any-Null ;"
        "[nlmr] { ь } y[aueioq] > ;"
        "ь > h ;"
        "[nlmr] { ъ } y[aueioq] > y;"
        "ъ > ;"
        # Start a new pass from the beginning of the text
        ":: Any-Null ;"
        "h+ > h ;"
    )
    trans = Transliterator.createFromRules("", rules)
    ustring = trans.transliterate(ustring)
    return ustring