def gen_khm_words(text: str) -> str:
    """Yield the consecutive word segments of ``text`` using Khmer rules.

    A word-level ``BreakIterator`` for ``Locale("km")`` is run over
    ``text``; each yielded item is the slice of ``text`` between two
    neighbouring boundaries, in order.

    Note: this is a generator, so calling it returns an iterator of
    strings; the ``-> str`` annotation describes each yielded item.
    """
    khmer_breaker = BreakIterator.createWordInstance(Locale("km"))
    khmer_breaker.setText(text)

    # NOTE(review): presumably the boundaries are UTF-16 offsets; slicing a
    # Python 3 ``str`` with them may drift on non-BMP characters -- confirm.
    left = khmer_breaker.first()
    for right in khmer_breaker:
        segment = text[left:right]
        left = right
        yield segment
def _gen_words(text: str) -> str:
    """Yield the consecutive word segments of ``text`` using Thai rules.

    A word-level ``BreakIterator`` for ``Locale("th")`` is run over
    ``text``; each yielded item is the slice of ``text`` between two
    neighbouring boundaries, in order.

    Note: this is a generator, so calling it returns an iterator of
    strings; the ``-> str`` annotation describes each yielded item.
    """
    thai_breaker = BreakIterator.createWordInstance(Locale("th"))
    thai_breaker.setText(text)

    seg_start = thai_breaker.first()
    for seg_end in thai_breaker:
        piece = text[seg_start:seg_end]
        seg_start = seg_end
        yield piece
def divideIntoWords(txt, locale):
    """Return the word-boundary offsets of ``txt`` for the given locale.

    Args:
        txt: Text to analyse.
        locale: Locale name handed to ``Locale.createFromName``.

    Returns:
        A list holding exactly the boundary values the word
        ``BreakIterator`` produces, in iteration order. Nothing is
        prepended, so the iterator's starting position is not included.
    """
    loc = Locale.createFromName(locale)
    bi = BreakIterator.createWordInstance(loc)
    bi.setText(txt)
    # Drain through the iterator protocol instead of calling ``bi.next()``
    # until StopIteration: same values, and it also works on Python 3,
    # where iterators expose ``__next__`` rather than ``next``.
    return list(bi)
def _compute_icu_segmented(self):
    """
    Compute the ICU-segmented version of the line from the unsegmented one.

    ``self.unsegmented`` must already have been computed. On return:

    * ``self.icu_word_brkpoints`` is ``[0]`` followed by every boundary
      produced by a root-locale word ``BreakIterator`` over the text;
    * ``self.icu_segmented`` is the text with a "|" before the first
      segment and after every segment (just "|" when no boundary is
      produced).
    """
    words_break_iterator = BreakIterator.createWordInstance(Locale.getRoot())
    words_break_iterator.setText(self.unsegmented)

    # Offset 0 is added explicitly; the iterator supplies the rest.
    self.icu_word_brkpoints = [0]
    self.icu_word_brkpoints.extend(words_break_iterator)

    # Pair each breakpoint with its successor to get the segment bounds,
    # then build the string in one join instead of repeated ``+=``.
    bounds = self.icu_word_brkpoints
    pieces = [self.unsegmented[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
    self.icu_segmented = "|" + "".join(piece + "|" for piece in pieces)
def endElement(self, name):
    """Forward an end-element event downstream, flushing word text first.

    When ``name`` is ``u"Unicode"``, the text held in ``self.__uniText``
    is split at word boundaries, every piece is passed through
    ``processToken``, and the concatenated result is sent to the
    downstream handler's ``characters`` method. The end-element event
    itself is then forwarded to the downstream handler.

    Args:
        name: Name of the element being closed.
    """
    if name == u"Unicode":
        self.__isUni = False

        # NOTE(review): "utf-8" is an encoding name, not a locale ID --
        # confirm which locale was intended here.
        loc = Locale.createFromName("utf-8")
        bi = BreakIterator.createWordInstance(loc)
        bi.setText(self.__uniText)

        # Iterate the breaker directly instead of calling ``bi.next()``
        # until StopIteration; this also works on Python 3. All tokens are
        # collected before any is processed, as before.
        tokens = []
        prev = 0
        for ind in bi:
            tokens.append(self.__uniText[prev:ind])
            prev = ind

        text = u"".join([processToken(t) for t in tokens])
        self.__downstream.characters(text)

    self.__downstream.endElement(name)
def main():
    """Run the break-iteration demo on a fixed sample string.

    Prints the sentence boundaries (forward and backward) and the word
    boundaries (forward, first, last, and at character position 10) of the
    sample text, using US-locale ``BreakIterator`` instances and the
    ``printEachForward`` / ``printEachBackward`` / ``printFirst`` /
    ``printLast`` / ``printAt`` helpers.

    Every ``print`` call takes a single string argument so the output is
    the same under Python 2 (print statement) and Python 3 (print function).
    """
    print("ICU Break Iterator Sample Program")
    print("C++ Break Iteration in Python")

    stringToExamine = u"Aaa bbb ccc. Ddd eee fff."
    # Two spaces on purpose: the original ``print a, b`` statement put a
    # separator space after "Examining: ".
    print("Examining:  " + stringToExamine)

    # print each sentence in forward and reverse order
    boundary = BreakIterator.createSentenceInstance(Locale.getUS())
    boundary.setText(stringToExamine)

    print("")
    print("Sentence Boundaries... ")
    print("----- forward: -----------")
    printEachForward(boundary)
    print("----- backward: ----------")
    printEachBackward(boundary)

    # print each word in order
    print("")
    print("Word Boundaries...")
    boundary = BreakIterator.createWordInstance(Locale.getUS())
    boundary.setText(stringToExamine)
    print("----- forward: -----------")
    printEachForward(boundary)

    # print first element
    print("----- first: -------------")
    printFirst(boundary)

    # print last element
    print("----- last: --------------")
    printLast(boundary)

    # print word at charpos 10
    print("----- at pos 10: ---------")
    printAt(boundary, 10)

    print("")
    print("End C++ Break Iteration in Python")
def __init__(self, locale='en'):
    """Set up the tokenizer and its word-level break iterator.

    Args:
        locale: Passed unchanged to the parent initializer; defaults
            to ``'en'``.
    """
    super(WordTokenizer, self).__init__(locale)
    # presumably the parent initializer sets ``self.locale`` -- verify
    # against the base class.
    word_breaker = BreakIterator.createWordInstance(self.locale)
    self.breaker = word_breaker
def __init__(self):
    """Create a word-level break iterator for the Arabic ('ar') locale.

    The iterator is stored on the instance as ``self.BreakIterator``.
    """
    arabic_locale = Locale.createFromName('ar')
    word_breaker = BreakIterator.createWordInstance(arabic_locale)
    self.BreakIterator = word_breaker
def __init__(self):
    """Store the Turkish ("tr") locale and a word break iterator for it.

    Sets ``self.locale`` and then ``self.breakor``, the word-level
    ``BreakIterator`` created from that locale.
    """
    self.locale = Locale("tr")
    word_breaker = BreakIterator.createWordInstance(self.locale)
    self.breakor = word_breaker
def _get_breaker(self, locale):
    """Return a new word-level ``BreakIterator`` for ``locale``.

    Args:
        locale: Locale forwarded unchanged to
            ``BreakIterator.createWordInstance``.
    """
    word_breaker = BreakIterator.createWordInstance(locale)
    return word_breaker