def __init__(self):
    # Maybe in new versions of PyICU the following
    # (now commented out) shorthand function is defined:
    # self.normalizer_nfc = Normalizer2.getNFCInstance()
    # Since it is not, use the non-shorthand function with the needed parameters.
    self.normalizer_nfc = Normalizer2.getInstance(
        None, 'nfc', UNormalizationMode2.COMPOSE)
    self.normalizer_nfd = Normalizer2.getInstance(
        None, 'nfc', UNormalizationMode2.DECOMPOSE)
    self.normalizer_nfkc = Normalizer2.getInstance(
        None, 'nfkc', UNormalizationMode2.COMPOSE)
    self.normalizer_nfkd = Normalizer2.getInstance(
        None, 'nfkc', UNormalizationMode2.DECOMPOSE)
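Note that ICU exposes NFD and NFKD through the 'nfc' and 'nfkc' data files combined with the DECOMPOSE mode, which is why the data names above repeat. A minimal self-contained sketch of the resulting behavior, assuming only PyICU is installed; the sample strings are illustrative:

from icu import Normalizer2, UNormalizationMode2

nfc = Normalizer2.getInstance(None, 'nfc', UNormalizationMode2.COMPOSE)
nfd = Normalizer2.getInstance(None, 'nfc', UNormalizationMode2.DECOMPOSE)

# U+00E1 (á) composes from 'a' + U+0301 (combining acute), and back again.
assert nfc.normalize(u'a\u0301') == u'\u00e1'
assert nfd.normalize(u'\u00e1') == u'a\u0301'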
def createComponents(_self, fieldName):
    source = WhitespaceTokenizer()
    return Analyzer.TokenStreamComponents(
        source,
        ICUNormalizer2Filter(
            source,
            Normalizer2.getInstance(None, "nfc", UNormalizationMode2.DECOMPOSE)))
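For context, a hedged sketch of driving such an analyzer from PyLucene; the field name and input text are illustrative, and lucene.initVM() is assumed to have been called already:

from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

# analyzer: an instance of the analyzer class containing createComponents above
stream = analyzer.tokenStream("content", "Hi There")
termAtt = stream.addAttribute(CharTermAttribute.class_)
stream.reset()
while stream.incrementToken():
    print(termAtt.toString())
stream.end()
stream.close()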
def __init__(self, input, normalizer=None):
    super(ICUNormalizer2Filter, self).__init__(input)
    self.input = input
    self.termAtt = self.addAttribute(CharTermAttribute.class_)
    if normalizer is None:
        normalizer = Normalizer2.getInstance(
            None, "nfkc_cf", UNormalizationMode2.COMPOSE)
    self.normalizer = normalizer
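The incrementToken half of this filter is not shown in the excerpt; a minimal sketch consistent with the PyLucene token-filter conventions used above, which simply rewrites each token's term text through the normalizer:

def incrementToken(self):
    if self.input.incrementToken():
        text = self.termAtt.toString()
        normalized = self.normalizer.normalize(text)
        if normalized != text:
            # Replace the term buffer contents with the normalized text.
            self.termAtt.setEmpty()
            self.termAtt.append(normalized)
        return True
    return False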
def testNormalize(self):
    try:
        from icu import Normalizer2
    except ImportError:
        return

    normalizer = Normalizer2.getInstance(
        None, "nfkc_cf", UNormalizationMode2.COMPOSE)
    self.assertTrue(normalizer.normalize("Hi There") == u'hi there')

    a = UnicodeString()
    normalizer.normalize("Hi There", a)
    self.assertTrue(a == UnicodeString(u'hi there'))
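The assertNorm helper the next test relies on is not shown in this excerpt; a minimal sketch consistent with its call sites (normalizer, expected result, input data) would be:

def assertNorm(self, normalizer, expected, data):
    # Normalize the input and compare with the expected result.
    self.assertEqual(expected, normalizer.normalize(data))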
def testNormalize(self):
    try:
        from icu import Normalizer2
    except ImportError:
        return

    self.assertNorm(
        Normalizer2.getInstance(None, "nfkc_cf", UNormalizationMode2.COMPOSE),
        u'hi there', "Hi There")
    self.assertNorm(Normalizer2.getNFCInstance(), u"äßáW", u"äßa\u0301W")
    self.assertNorm(Normalizer2.getNFDInstance(), u"a\u0308ßa\u0301W", u"äßa\u0301W")
    self.assertNorm(Normalizer2.getNFKCInstance(), u"äßáW", u"äßa\u0301W")
    self.assertNorm(Normalizer2.getNFKDInstance(), u"a\u0308ßa\u0301W", u"äßa\u0301W")
    self.assertNorm(Normalizer2.getNFKCCasefoldInstance(), u"ässáw", u"äßa\u0301W")
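The shorthand getters used above (getNFCInstance and friends) were added in ICU 49, so they are absent from older PyICU builds; a guarded fallback to the generic factory, matching the first example's comment:

try:
    nfd = Normalizer2.getNFDInstance()
except AttributeError:
    # Older PyICU/ICU: derive NFD from the 'nfc' data in DECOMPOSE mode.
    nfd = Normalizer2.getInstance(None, 'nfc', UNormalizationMode2.DECOMPOSE)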
def tokenStream(_self, fieldName, reader):
    return ICUNormalizer2Filter(
        WhitespaceTokenizer(Version.LUCENE_CURRENT, reader),
        Normalizer2.getInstance(None, "nfc", UNormalizationMode2.DECOMPOSE))
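This tokenStream override is the older Analyzer extension point (hence the Version.LUCENE_CURRENT constructor argument); in current Lucene and PyLucene versions tokenStream is final and createComponents is overridden instead, as in the createComponents examples here.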
def __init__(self, input):
    normalizer = Normalizer2.getInstance(
        "utr30", "utr30", UNormalizationMode2.COMPOSE)
    super(ICUFoldingFilter, self).__init__(input, normalizer)
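Here "utr30" names a custom normalization data file (Lucene's UTR #30 character-folding rules) rather than ICU's built-in data, so constructing the normalizer can fail if that utr30.nrm file is not on ICU's data path. A guarded sketch, assuming the failure surfaces as PyICU's ICUError:

from icu import ICUError, Normalizer2, UNormalizationMode2

try:
    folding = Normalizer2.getInstance("utr30", "utr30", UNormalizationMode2.COMPOSE)
except ICUError:
    folding = None  # utr30.nrm is not installed in this ICU data setup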