Example #1
# -*- coding: utf-8 -*-
import re

# NB: this class also relies on helpers provided elsewhere in the package
# (DictReplUTF8, DictRepl, WordsList, sppasNum, rutils, character_based);
# they are assumed to be imported by the surrounding module.
class DictTok:
    """
    @authors: Brigitte Bigi, Tatsuya Watanabe
    @contact: [email protected]
    @license: GPL, v3
    @summary: Tokenization automatic annotation.

    The creation of text corpora requires a sequence of processing steps in
    order to constitute and normalize them before a given application can
    exploit them directly. This class implements a generic approach to text
    normalization that concentrates on the aspects of methodology and
    linguistic engineering which serve to develop a multi-purpose multilingual
    text corpus. The approach consists of splitting the text normalization
    problem into a set of smaller sub-problems that are as
    language-independent as possible.

    From the manual Enriched Orthographic Transcription, two derived
    orthographic transcriptions are generated automatically by the tokenizer:
    the "standard" transcription (the list of orthographic tokens) and the
    "faked spelling", a specific transcription whose tokens are used as input
    by the phonetization system.

    The following illustrates an utterance text normalization in French:

    - Transcription:   j'ai on a j'ai p- (en)fin j'ai trouvé l(e) meilleur moyen c'était d(e) [loger,locher] chez  des amis
    (English translation: I've we've I've - well I found the best way was to live at some friends' place)

    - Resulting Standard tokens:  j' ai on a j' ai p- enfin j' ai trouvé le meilleur moyen c'était de loger  chez  des amis
    - Resulting Faked tokens:     j' ai on a j' ai p-   fin j' ai trouvé l  meilleur moyen c'était d  loche  chez  des amis

    The complete description of the algorithm is available in the following reference:
        Brigitte Bigi (2011).
        A Multilingual Text Normalization Approach.
        2nd Less-Resourced Languages workshop,
        5th Language & Technology Conference, Poznan (Poland).

    """

    # ------------------------------------------------------------------

    def __init__(self, vocab=None, lang="und"):
        """
        Create a new DictTok instance.

        @param vocab (WordsList)
        @param lang is the language code in iso639-3.

        """
        # resources
        self.dicoutf = DictReplUTF8()
        self.repl = DictRepl(None)
        self.punct = WordsList(None)
        self.vocab = vocab
        self.speech = True  # transcribed speech (and not written text) is to be tokenized
        if vocab is None:
            self.vocab = WordsList(None)

        # members
        self.lang = lang
        self.num2letter = sppasNum(lang)
        self.delimiter = u" "

    # End __init__
    # ------------------------------------------------------------------

    # ------------------------------------------------------------------
    # Options
    # ------------------------------------------------------------------

    def set_delim(self, delim):
        """
        Set the delimiter, used to separate tokens.

        @param delim is a unicode character.

        """
        self.delimiter = delim

    # End set_delim
    # -------------------------------------------------------------------------

    def set_vocab(self, vocab):
        """
        Set the lexicon.

        @param vocab is a WordsList().

        """
        self.vocab = vocab

    # -------------------------------------------------------------------------

    def set_repl(self, repl):
        """
        Set the dictionary of replacements.

        @param repl (ReplDict)

        """
        self.repl = repl

    # -------------------------------------------------------------------------

    def set_punct(self, punct):
        """
        Set the list of punctuation.

        @param punct (WordsList)

        """
        self.punct = punct

    # -------------------------------------------------------------------------

    def set_lang(self, lang):
        """
        Set the language.

        @param lang is the language code in iso639-3 (fra, eng, vie, cmn...).

        """
        self.lang = lang

    # -------------------------------------------------------------------------

    # -------------------------------------------------------------------------
    # Language independent modules
    # -------------------------------------------------------------------------

    def split_characters(self, utt):
        """
        Split an utterance by characters.

        @param utt is the utterance (a transcription, a sentence, ...) in utf-8
        @return A string (split character by character, using spaces)
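
        Illustration (hypothetical input for a character-based language):
        u"我愛你ABC" roughly becomes u"我 愛 你 ABC": each CJK character is
        isolated, while runs of Latin letters and digits are kept together.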

        """
        try:
            y = unicode(utt, "utf-8")
        except Exception:
            y = utt
        tmp = " ".join(y)

        # split all characters except numbers and ascii characters
        sstr = re.sub(
            u"([０-９0-9a-zA-Zａ-ｚＡ-Ｚ\s]+\.?[０-９0-9a-zA-Zａ-ｚＡ-Ｚ\s]+)",
            lambda o: u" %s " % o.group(0).replace(" ", ""),
            tmp,
        )
        # and dates...
        if not self.speech:
            sstr = re.sub(u"([0-9０-９\s]+\.?[月年日\s]+)", lambda o: u" %s " % o.group(0).replace(" ", ""), sstr)
        # and ・
        sstr = re.sub(u"[\s]*・[\s]*", u"・", sstr)
        return sstr

    def split(self, utt, std=False):
        """
        Split an utterance using spaces or split each character, depending
        on the language.

        @param utt (string): the utterance (a transcription, a sentence, ...)
        @param std (Boolean)

        @return A list (array of string)
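
        Example (hypothetical token, non character-based language, std=False):
        u"abc123" becomes u"abc 123" (digits are detached from letters),
        while a phonetized entry such as u"/abc123/" is left untouched.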

        """

        s = utt
        if character_based(self.lang):
            s = self.split_characters(s)

        toks = []
        for t in s.split():
            # if not a phonetized entry
            if not t.startswith("/") and not t.endswith("/"):
                if std is False:
                    if not character_based(self.lang):
                        # Split numbers stuck to letters
                        # attention: do not replace [a-zA-Z] with [\w] (because \w includes digits)
                        # and do not do it for Asian languages: a digit can be a tone!
                        t = re.sub(u"([0-9])([a-zA-Z])", ur"\1 \2", t)
                        t = re.sub(u"([a-zA-Z])([0-9])", ur"\1 \2", t)

                # Split some punctuation
                t = re.sub(u"\\[\\]", ur"\\] \\[", t)

                # Split dots stuck to a word
                t = re.sub(u" \.([\w-])", ur". \1", t)
                t = re.sub(u"^\.([\w-])", ur". \1", t)

                # Split replacement characters
                for r in self.repl.get_keys():
                    if t.endswith(r):
                        t = t[: -len(r)]
                        t = t + " " + r
            toks.append(t.strip())

        s = " ".join(toks)

        # Then split each time there is a space and return result
        s = rutils.ToStrip(s)

        return s.split()

    # End split
    # ------------------------------------------------------------------

    def __stick_longest(self, utt, attachement="_"):
        """ Longest matching algorithm. """
        tabtoks = utt.split(" ")
        i = len(tabtoks)
        while i > 0:
            # try to stick all tokens
            _token = attachement.join(tabtoks)
            if self.vocab.is_unk(_token) is False:
                return (i, _token)
            tabtoks.pop()
            i -= 1
        return (1, utt.split(" ")[0])

    def stick(self, utt, attachement="_"):
        """
        Stick tokens of an utterance using '_'.
        Language independent.

        @param utt (list) the utterance (a transcription, a sentence, ...)
        @param attachement (string) the character used to join the tokens
        @return A list of strings
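
        Example (hypothetical vocabulary): if the vocabulary contains the
        entry u"pomme_de_terre", the tokens [u"pomme", u"de", u"terre"]
        are joined into the single token u"pomme_de_terre".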

        """
        _utt = []
        t1 = 0
        while t1 < len(utt):
            sl = utt[t1]  # longest string ... in theory!
            lmax = t1 + 7
            if lmax > len(utt):
                lmax = len(utt)
            for t2 in range(t1 + 1, lmax):
                sl = sl + " " + utt[t2]
            (i, tok) = self.__stick_longest(rutils.ToStrip(sl), attachement)  # real longest string!
            t1 += i
            _utt.append(rutils.ToStrip(tok))

        return _utt

    # End stick
    # ------------------------------------------------------------------

    def replace(self, utt):
        """
        Examine tokens and perform some replacements.
        A replacements dictionary provides the mappings to apply.

        This method also includes language-specific replacements.
        Supported languages are: fra, cmn, jpn, yue, eng, ita, spa, khm, cat, pol.

        @param utt (list) the utterance

        @return A list of strings
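
        Example: [u"3.14"] becomes [u"3", u"NUMBER_SEP_POINT", u"14"];
        NUMBER_SEP_POINT is then expected to be an entry of the
        replacements dictionary (a hypothetical mapping to u"point"
        in French, for instance).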

        """
        # Specific case of float numbers
        sent = " ".join(utt)
        sent = re.sub(u"([0-9])\.([0-9])", ur"\1 NUMBER_SEP_POINT \2", sent)
        sent = re.sub(u"([0-9])\,([0-9])", ur"\1 NUMBER_SEP \2", sent)
        sent = rutils.ToStrip(sent)
        _utt = sent.split()

        # Other generic replacements
        _result = []
        for s in _utt:
            if self.repl.is_key(s):
                s = self.repl.replace(s)
            _result.append(rutils.ToStrip(s))

        return _result

    # End replace
    # -----------------------------------------------------------------------

    def compound(self, utt):
        """
        Examine tokens containing dashes, apostrophes or dots and split them
        depending on rules. Language independent.

        @param utt (list) the utterance
        @return A list of strings
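
        Example (hypothetical vocabulary): if u"porte-clés" is unknown but
        u"porte" and u"clés" are known, the token is split on the special
        characters and re-assembled greedily, yielding the longest pieces
        found in the vocabulary (here: u"porte", u"-", u"clés").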

        """

        _utt = []
        for tok in utt:
            # a missing compound word?
            #   --> an unknown token
            #   --> containing a special character
            #   --> that is not a truncated word!
            if (
                self.vocab.is_unk(tok.lower().strip()) is True
                and ("-" in tok or "'" in tok or "." in tok)
                and not tok.endswith("-")
            ):

                # Split the unknown token into a list
                # KEEP special chars ('-.) in the array!
                _tabtoks = re.split("([-'.])", tok)

                # Explore the list from left to right
                t1 = 0
                while t1 < len(_tabtoks):
                    i = len(_tabtoks)
                    i_ok = 0
                    # Find the longest string in the dict
                    while i >= t1 and i_ok == 0:
                        _token = _tabtoks[t1]
                        if i > (t1 + 1):
                            for j in range(t1 + 1, i):
                                _token += _tabtoks[j]
                            if self.vocab.is_unk(_token) is False:
                                i_ok = j + 1
                        else:
                            i_ok = 1
                            _token = _tabtoks[t1]
                        i -= 1
                    t1 += i_ok
                    t2 = rutils.ToStrip(_token)
                    if len(t2) > 0:
                        _utt.append(t2)

            else:
                _utt.append(rutils.ToStrip(tok))

        return _utt

    # End compound
    # ------------------------------------------------------------------

    def lower(self, utt):
        """
        Lowercase a list of strings.

        @param utt (list)
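
        Example: [u"Bonjour", u"/A/"] becomes [u"bonjour", u"/A/"]:
        phonetized entries (containing "/") are preserved as-is.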

        """
        _utt = []
        for tok in utt:
            if "/" not in tok:
                _utt.append(rutils.ToLower(tok))
            else:
                _utt.append(tok)

        return _utt

    # End lower
    # ------------------------------------------------------------------

    def remove(self, utt, wlist):
        """
        Remove tokens of the utterance if they occur in the given word list.
        Only used to remove punctuation.

        @param utt (list)
        @param wlist (WordsList)
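
        Example: with a punctuation word list, [u"bonjour", u",", u"ipu_3"]
        becomes [u"bonjour"]: known punctuation and gpd_/ipu_ numbering
        markers are dropped.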

        """

        _utt = []
        for tok in utt:
            if wlist.is_unk(tok) is True and "gpd_" not in tok and "ipu_" not in tok:
                _utt.append(tok)

        return _utt

    # End remove
    # ------------------------------------------------------------------

    # ------------------------------------------------------------------
    # EOT specific modules
    # ------------------------------------------------------------------

    def __repl(self, obj):
        """
        Callback for clean_toe.

        @param obj (MatchObject)
        @return string
        """
        # Left part
        # Remove parentheses
        left = obj.group(1).replace("(", "")
        left = left.replace(")", "")
        # Replace spaces with underscores
        left = "_".join(left.split())

        # Right part
        # Remove spaces
        right = obj.group(2)
        right = "".join(right.split())
        return " [%s,%s]" % (left, right)

    def clean_toe(self, entry):
        """
        Clean Enriched Orthographic Transcription.
        The convention includes information that must be removed.

        @param entry (string)
        @return string
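
        Example (convention-dependent): u"ipu_1 $ Jean ,P\$ dort" becomes
        u"Jean dort": IPU numbering and the proper-name markup are removed.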

        """
        # Proper names: $ name ,P\$
        # (the flag is passed as a keyword: re.sub's 4th positional
        # argument is 'count', not 'flags')
        entry = re.sub(u",\s?[PTS]+\s?[\\/\\\]+\s?\\$", ur"", entry, flags=re.UNICODE)
        entry = re.sub(ur"\$", ur"", entry, flags=re.UNICODE)

        entry = re.sub(u"(gpd_[0-9]+)", ur" ", entry, flags=re.UNICODE)
        entry = re.sub(u"(gpf_[0-9]+)", ur" ", entry, flags=re.UNICODE)
        entry = re.sub(u"(ipu_[0-9]+)", ur" ", entry, flags=re.UNICODE)

        # Remove invalid parenthesis content
        entry = re.sub(ur"\s+\([\w\xaa-\xff]+\)\s+", " ", entry, flags=re.UNICODE)
        entry = re.sub(ur"^\([\w\xaa-\xff]+\)\s+", " ", entry, flags=re.UNICODE)
        entry = re.sub(ur"\s+\([\w\xaa-\xff]+\)$", " ", entry, flags=re.UNICODE)

        entry = re.sub(ur"\s*\[([^,]+),([^,]+)\]", self.__repl, entry, flags=re.UNICODE)
        return " ".join(entry.split())
        return " ".join(entry.split())

    # End clean_toe and __repl
    # ------------------------------------------------------------------

    def toe_spelling(self, entry, std=False):
        """
        Create a specific spelling from an Enriched Orthographic Transcription.

        @param entry (string): the EOT string
        @param std (Boolean): standard or faked spelling
        @return a string.
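
        Example (from the class-level French utterance): with std=False, the
        faked spelling keeps the right part of u"[loger,locher]" and removes
        the parenthesized elision in u"(en)fin" (giving u"fin"); with
        std=True, the left part u"loger" and the full form u"enfin" are kept.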

        DevNote: Python’s regular expression engine supports Unicode.
        It can apply the same pattern to either 8-bit (encoded) or
        Unicode strings. To create a regular expression pattern that
        uses Unicode character classes for \w (and \s, and \b), use
        the “(?u)” flag prefix, or the re.UNICODE flag.
        """
        # Ensure all regexp will work!
        _fentry = " " + unicode(entry) + " "

        if std is False:
            # Stick irregular liaisons to the previous token
            _fentry = re.sub(u" =([\w]+)=", ur"-\1", _fentry, flags=re.UNICODE)
        else:
            # Remove liaisons
            _fentry = re.sub(u" =([\w]+)=", ur" ", _fentry, flags=re.UNICODE)

        # Laughing sequences
        _fentry = re.sub(u"\s?@\s?@\s?", u" ", _fentry, flags=re.UNICODE)

        # Laughing
        _fentry = re.sub(u"([\w\xaa-\xff]+)@", ur"\1 @", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"@([\w\xaa-\xff]+)", ur"@ \1", _fentry, flags=re.UNICODE)

        # Noises
        _fentry = re.sub(u"([\w\xaa-\xff]+)\*", ur"\1 *", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"\*([\w\xaa-\xff]+)", ur"* \1", _fentry, flags=re.UNICODE)

        # Transcriber comments: {comment}
        _fentry = re.sub(u"\\{[\s\w\xaa-\xff\-:]+\\}", ur"", _fentry, flags=re.UNICODE)
        # Transcriber comments: [comment]
        _fentry = re.sub(u"\\[[\s\w\xaa-\xff\-:]+\\]", ur"", _fentry, flags=re.UNICODE)
        # Transcriber comments: (comment)
        # _fentry = re.sub(u' \\([\s\w\xaa-\xff\-:]+\\) ', ur'', _fentry, flags=re.UNICODE)  # .... warning!

        if std is False:
            # Special elisions (remove parenthesis content)
            _fentry = re.sub(u"\\([\s\w\xaa-\xff\-']+\\)", ur"", _fentry, flags=re.UNICODE)
        else:
            # Special elisions (keep parenthesis content)
            _fentry = re.sub(u"\\(([\s\w\xaa-\xff\-]+)\\)", ur"\1", _fentry, flags=re.UNICODE)

        # Morphological variants are ignored for phonetization (same pronunciation!)
        _fentry = re.sub(u"\s+\\<([\-'\s\w\xaa-\xff]+),[\-'\s\w\xaa-\xff]+\\>", ur" \1", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"\s+\\{([\-'\s\w\xaa-\xff]+),[\-'\s\w\xaa-\xff]+\\}", ur" \1", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"\s+\\/([\-'\s\w0-9\xaa-\xff]+),[\-'\s\w0-9\xaa-\xff]+\\/", ur" \1", _fentry, flags=re.UNICODE)

        if std is False:
            # Special pronunciations (keep right part)
            _fentry = re.sub(u"\s+\\[([\s\w\xaa-\xff/-]+),([\s\w\xaa-\xff/]+)\\]", ur" \2", _fentry, flags=re.UNICODE)
        else:
            # Special pronunciations (keep left part)
            _fentry = re.sub(u"\s+\\[([\s\w\xaa-\xff\\/-]+),[\s\w\xaa-\xff\\/]+\\]", ur" \1", _fentry, flags=re.UNICODE)

        # Proper names: $ name ,P\$
        _fentry = re.sub(u",\s?[PTS]+\s?[\\/\\\]+\s?\\$", ur"", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"\\$", ur"", _fentry, flags=re.UNICODE)

        # Add a space if some punctuation is stuck to a word
        # TODO: do the same with the whole list of punctuations (in rutils).
        #        _fentry = re.sub(u'([:+^@}\(\){~|=]+)([\xaa-\xff]+)', ur'\1 \2', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"([\w\xaa-\xff]+),", ur"\1 ,", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"([\w\xaa-\xff]+)\+", ur"\1 +", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"([\w\xaa-\xff]+);", ur"\1 ;", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"([\w\xaa-\xff]+):", ur"\1 :", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"([\w\xaa-\xff]+)\(", ur"\1 (", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"([\w\xaa-\xff]+)\)", ur"\1 )", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"([\w\xaa-\xff]+)\{", ur"\1 {", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"([\w\xaa-\xff]+)\}", ur"\1 }", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"([\w\xaa-\xff]+)=", ur"\1 =", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"([\w\xaa-\xff]+)\?", ur"\1 ?", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"([\w\xaa-\xff]+)\!", ur"\1 !", _fentry, flags=re.UNICODE)
        # _fentry = re.sub(u'([\w\xaa-\xff]+)\/', ur'\1 !', _fentry, flags=re.UNICODE)  # no: if sampa in special pron.
        _fentry = re.sub(u"\s(?=,[0-9]+)", "", _fentry, flags=re.UNICODE)

        # Correction of errors
        s = ""
        inpron = False
        for c in _fentry:
            if c == "/":
                inpron = not inpron
            else:
                if c == " " and inpron is True:
                    continue
            s += c
        return rutils.ToStrip(s)

    # End toe_spelling
    # ------------------------------------------------------------------

    # ------------------------------------------------------------------
    # The main tokenize is HERE!
    # ------------------------------------------------------------------

    def tokenize_list(self, utt, std=False):
        """
        Tokenize from a list of entries.
        """
        # Step 2: replace
        try:
            utt = self.replace(utt)
        except IOError:
            # repl file not found
            pass
        except Exception as e:
            raise Exception(" *in replace* " + str(e) + "\n")

        # Step 3: compound
        try:
            utt = self.compound(utt)
        except Exception as e:
            raise Exception(" *in compound* " + str(e) + "\n")

        # Step 4: stick (using the dictionary)
        try:
            attachement = "_"
            if character_based(self.lang):
                attachement = ""
            utt = self.stick(utt, attachement)
        except Exception as e:
            raise Exception(" *in stick* " + str(e) + "\n")

        # Step 5: num2letter
        try:
            _utt = []
            for i in utt:
                # do not convert numbers inside a phonetized entry (/.../)
                if "/" not in i:
                    _utt.append(self.num2letter.convert(i))
                else:
                    _utt.append(i)
            utt = _utt
        except Exception:
            # conversion failed (e.g. unsupported language): keep tokens as-is
            pass

        # Step 6: lower
        try:
            utt = self.lower(utt)
        except Exception as e:
            raise Exception(" *in lower* " + str(e) + "\n")

        # Step 7: remove (punctuation)
        try:
            utt = self.remove(utt, self.punct)
        except Exception as e:
            raise Exception(" *in remove* " + str(e) + "\n")

        # Finally, prepare the result
        strres = ""
        for s in utt:
            s = rutils.ToStrip(s)
            strres = strres + u" " + s.replace(u" ", u"_")

        strres = rutils.ToStrip(strres)
        if len(strres) == 0:
            return ""  # Nothing valid!

        return strres.replace(u" ", self.delimiter)

    def tokenize(self, entry, std=False):
        """
        Tokenize an utterance.

        @param entry (UTF8-String) is the utterance (the transcription)
        @param std (Boolean) In case of an enriched transcription, std selects
        the standard or the faked spelling for the output

        @return A string (the tokenized transcription)

        **TODO: disable TOE_CLEAN for written text**

        """

        # THE ENTRY (a transcription, a text...) IS A UTF8-STRING
        # -------------------------------------------------------
        _str = rutils.ToStrip(entry)

        # Remove UTF-8 specific characters that are not in our dictionaries!
        try:
            for key in self.dicoutf.get_keys():
                _str = _str.replace(key, self.dicoutf.replace(key))
        except Exception as e:
            raise UnicodeError("Error during cleaning: %s" % str(e))

        # Enriched Orthographic Transcription
        # Create a faked spelling (default) or a standard spelling
        _str = self.clean_toe(_str)
        _str = self.toe_spelling(_str, std)

        # Step 1: split using spaces (or characters for asian languages)
        try:
            utt = self.split(_str, std)
        except Exception as e:
            raise Exception(" *in split* " + str(e))

        # THE ENTRY IS NOW A LIST OF STRINGS.
        # ---------------------------------------------------
        return self.tokenize_list(utt, std)
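

# ----------------------------------------------------------------------
# A minimal usage sketch (hypothetical resource file names and loading
# conventions; real projects would build the WordsList / DictRepl
# resources from their own language data):
#
#     vocab = WordsList("vocab-fra.txt")       # hypothetical constructor arg
#     tok = DictTok(vocab, lang="fra")
#     tok.set_repl(DictRepl("repl-fra.txt"))   # hypothetical constructor arg
#     tok.set_punct(WordsList("punct.txt"))
#     print tok.tokenize(u"j'ai [loger,locher] chez des amis")
# ----------------------------------------------------------------------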