def _compute_char_brkpoints(self):
    """
    Finds the character-level (extended grapheme cluster) boundaries of
    ``self.unsegmented`` with an ICU character BreakIterator created for the
    root locale, and stores them in ``self.char_brkpoints``.

    The stored list always starts with 0, followed by every boundary the
    iterator reports, in the order it reports them.
    """
    grapheme_iterator = BreakIterator.createCharacterInstance(Locale.getRoot())
    grapheme_iterator.setText(self.unsegmented)
    # NOTE(review): ICU presumably reports these offsets in UTF-16 code units,
    # which differ from Python str indices once the text contains non-BMP
    # characters -- confirm that consumers of char_brkpoints account for this.
    self.char_brkpoints = [0] + list(grapheme_iterator)
def character_tokenize(self, word):
    """
    Returns the tokenization in character level.

    The word is split at the boundaries reported by an ICU character
    BreakIterator, so each returned item is one user-perceived character
    (extended grapheme cluster) rather than one code point.

    Arguments:
        word {string} -- word to be tokenized in character level.

    Returns:
        [list] -- list of characters, in order; empty if the iterator
        reports no boundaries (e.g. for an empty word).
    """
    char_iterator = BreakIterator.createCharacterInstance(Locale())
    char_iterator.setText(word)

    # ICU reports boundaries as UTF-16 code unit offsets, whereas Python str
    # is indexed by code point. Slicing `word` directly with those offsets
    # goes wrong as soon as the word contains a non-BMP character (emoji,
    # supplementary CJK, ...), so slice a UTF-16 view of the word instead:
    # every code unit is exactly two bytes in UTF-16-LE.
    # "surrogatepass" keeps lone surrogates from raising during the round trip.
    utf16_word = word.encode("utf-16-le", "surrogatepass")

    chars = []
    start = 0
    for end in char_iterator:
        piece = utf16_word[2 * start:2 * end]
        chars.append(piece.decode("utf-16-le", "surrogatepass"))
        start = end
    return chars