def __call__(self,
             string,
             column=Profile.GRAPHEME_COL,
             form=None,
             ipa=False,
             segment_separator=' ',
             separator=' # ',
             errors='replace'):
    """
    The main task of a Tokenizer is tokenizing! This is what happens when called.

    This method decides how to tokenize based on which resources are
    available: an orthography profile, tokenization rules, both, or neither.

    Parameters
    ----------
    string : str
        The input string to be tokenized.
    column : str (default = "graphemes")
        The column label for the transformation, if specified.
    form : None or unicode normalization form
        Normalize return value if form is not None.
    ipa : bool
        Tokenize IPA (work in progress).
    segment_separator : str (default = " ")
        String used to separate the segments within a word.
    separator : str (default = " # ")
        String used to mark word boundaries.
    errors : str (default = "replace")
        Name of the error handler to use when a profile transformation
        encounters a grapheme that is missing from the profile.

    Returns
    -------
    result : str
        Result of the tokenization.
    """
    res = []
    for word in string.split():
        if ipa:
            # IPA mode: cluster graphemes, then re-attach modifier letters
            # and diacritics to their base segments.
            res.append(self.combine_modifiers(self.grapheme_clusters(nfd(word))))
        elif self.op:
            # An orthography profile is loaded: transform the word via the
            # profile, using the requested column and error handler.
            res.append(self.transform(word, column=column, error=self._errors[errors]))
        else:
            # No profile: fall back to plain Unicode grapheme clustering.
            res.append(self.grapheme_clusters(nfd(word)))

    def pp(word):
        # Join the segments of one word, apply any tokenization rules,
        # and normalize the result if a normalization form was requested.
        res = segment_separator.join(word).strip()
        res = self._rules.apply(res) if self._rules else res
        return unicodedata.normalize(form, res) if form else res

    return separator.join(pp(word) for word in res)
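# A minimal usage sketch for __call__ (illustrative, not from the source: it
# assumes a Tokenizer constructed without an orthography profile, so calls
# fall through to plain Unicode grapheme clustering):
#
#     t = Tokenizer()
#     t('hello world')
#     # -> 'h e l l o # w o r l d'
#     t('hello world', segment_separator='.', separator=' // ')
#     # -> 'h.e.l.l.o // w.o.r.l.d'
#     t('hello', form='NFC')  # re-compose the NFD output before returning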
def characters(self, string, segment_separator=' ', separator=' # '):
    """
    Given a string as input, return a space-delimited string of Unicode
    characters (code points rendered as glyphs).

    Parameters
    ----------
    string : str
        A Unicode string to be tokenized into graphemes.
    segment_separator : str (default = " ")
        String used to separate the characters within a word.
    separator : str (default = " # ")
        String used to mark word boundaries.

    Returns
    -------
    result : str
        String returned is space-delimited on Unicode characters and
        contains "#" to mark word boundaries. The string is in NFD.

    Notes
    -----
    Input is first normalized according to Unicode Normalization Form D
    (canonical decomposition).
    """
    return separator.join(
        segment_separator.join(word) for word in nfd(string).split())
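# A hedged example of characters() (illustrative input; note that NFD may
# split an accented letter into a base character plus a combining mark,
# each emitted as its own code point):
#
#     t = Tokenizer()
#     t.characters('hello world')
#     # -> 'h e l l o # w o r l d'
#     t.characters('cháto')
#     # -> 'c h a ́ t o'  (the acute accent, U+0301, is a separate code point)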