Example No. 1
    def __call__(self,
                 string,
                 column=Profile.GRAPHEME_COL,
                 form=None,
                 ipa=False,
                 segment_separator=' ',
                 separator=' # ',
                 errors='replace'):
        """
        The main task of a Tokenizer is tokenizing! This is what happens when called.

        This method decides what to do for each combination of inputs:
        with or without an orthography profile, and with or without rules.

        Parameters
        ----------
        string : str
            The input string to be tokenized.

        column : str (default = "graphemes")
            The column label for the transformation, if specified.

        form : str or None
            If not None, the Unicode normalization form ("NFC", "NFD",
            "NFKC" or "NFKD") applied to the return value.

        ipa : bool
            Tokenize IPA (work in progress).

        segment_separator : str (default = " ")
            String used to join the segments of a word.

        separator : str (default = " # ")
            String used to join words, i.e. the word boundary marker.

        errors : str (default = "replace")
            Name of the error handler used when a grapheme is not found in
            the orthography profile.

        Returns
        -------
        result : str
            Result of the tokenization.

        """
        res = []
        for word in string.split():
            if ipa:
                # IPA mode: NFD-normalize, build grapheme clusters, then
                # merge modifier characters into their hosts.
                res.append(
                    self.combine_modifiers(self.grapheme_clusters(nfd(word))))
            else:
                if self.op:
                    # An orthography profile is loaded: transform the word
                    # through it, with the selected error handler.
                    res.append(
                        self.transform(word,
                                       column=column,
                                       error=self._errors[errors]))
                else:
                    # No profile: fall back to Unicode grapheme clusters.
                    res.append(self.grapheme_clusters(nfd(word)))

        def pp(word):
            # Post-process one word: join its segments, apply any rules,
            # and normalize if a form was requested.
            out = segment_separator.join(word).strip()
            out = self._rules.apply(out) if self._rules else out
            return unicodedata.normalize(form, out) if form else out

        return separator.join(pp(word) for word in res)
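
A self-contained sketch of the same per-word pipeline, using only the standard library: split on whitespace, segment each word, post-process, and join with the word-boundary separator. Two assumptions to flag: nfd() here is a hypothetical helper wrapping unicodedata.normalize('NFD', ...), and one segment per code point stands in for the class's grapheme_clusters(); the orthography-profile and rules branches are omitted.

import unicodedata

def nfd(s):
    # Assumption: the module-level nfd() helper is a thin wrapper like this.
    return unicodedata.normalize('NFD', s)

def tokenize(string, form=None, segment_separator=' ', separator=' # '):
    res = []
    for word in string.split():
        # Stand-in for grapheme_clusters(): one segment per code point.
        res.append(list(nfd(word)))

    def pp(word):
        # Join segments; rule application from the original is omitted here.
        out = segment_separator.join(word).strip()
        return unicodedata.normalize(form, out) if form else out

    return separator.join(pp(word) for word in res)

print(tokenize('noël bär'))  # n o e ̈ l # b a ̈ r

In the real class, grapheme_clusters() keeps a base character together with its combining marks, so the decomposed 'ë' would surface as a single segment rather than two.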
Example No. 2
 def characters(
     self,
     string,
     segment_separator=' ',
     separator=' # ',
 ):
     """
     Given a string as input, return a space-delimited string of Unicode characters
     (code points rendered as glyphs).
     Parameters
     ----------
     string : str
         A Unicode string to be tokenized into graphemes.
     Returns
     -------
     result : str
         String returned is space-delimited on Unicode characters and contains "#" to
         mark word boundaries.
         The string is in NFD.
     Notes
     -----
     Input is first normalized according to Normalization Ford D(ecomposition).
     String returned contains "#" to mark word boundaries.
     """
     return separator.join(
         segment_separator.join(word) for word in nfd(string).split())
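
Since characters() is a one-liner over nfd(), an equivalent standalone version is easy to verify. The sketch below assumes nfd() is a thin wrapper around unicodedata.normalize('NFD', ...):

import unicodedata

def characters(string, segment_separator=' ', separator=' # '):
    # NFD-normalize, put a space between every code point, and ' # '
    # between whitespace-delimited words.
    decomposed = unicodedata.normalize('NFD', string)
    return separator.join(
        segment_separator.join(word) for word in decomposed.split())

print(characters('noël bär'))  # n o e ̈ l # b a ̈ r

Because the input is decomposed first, an accented letter comes out as two space-separated "characters": the base letter and its combining mark.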