예제 #1
0
    def __init__(self, prondict):
        """
        Constructor: keep the dictionary and create the DAGPhon helper.

        @param prondict (dict) Set of couples with token=key and phon=value.

        """
        # The helper is created with variants=4.
        self.dagphon = DAGPhon(variants=4)
        self.prondict = prondict
예제 #2
0
    def __init__(self, pdict, maptable=None):
        """
        Constructor: register the dictionary, the mapping table and a DAGPhon.

        @param pdict (DictPron) The pronunciations dictionary.
        @param maptable (Mapping) A mapping table for phones (default: None).

        """
        # Both setters are called before the DAGPhon helper is created.
        self.set_dict(pdict)
        self.set_maptable(maptable)
        self._dagphon = DAGPhon()
예제 #3
0
class DictPhon:
    """
    @author:       Brigitte Bigi
    @organization: Laboratoire Parole et Langage, Aix-en-Provence, France
    @contact:      [email protected]
    @license:      GPL, v3
    @copyright:    Copyright (C) 2011-2016  Brigitte Bigi
    @summary:      Dictionary-based automatic phonetization.

    Grapheme-to-phoneme conversion is a complex task, for which a number of
    diverse solutions have been proposed. It is a structure prediction task;
    both the input and output are structured, consisting of sequences of
    letters and phonemes, respectively.

    The phonetization system is entirely designed to handle multiple
    languages and/or tasks with the same algorithms and the same tools.
    Only resources are language-specific, and the approach is based on the
    simplest resources possible: this annotation uses a dictionary-based
    approach.

    The dictionary can contain words with a set of pronunciations (the
    canonical one, and optionally some common reductions, etc).
    In this approach, it is then assumed that most of the words of the speech
    transcription and their phonetic variants are mentioned in
    the pronunciation dictionary. If a word is missing, our system is based
    on the idea that given enough examples it should be possible to predict
    the pronunciation of unseen words purely by analogy.

    See the whole description in the following reference:

        > Brigitte Bigi (2013).
        > A phonetization approach for the forced-alignment task.
        > 3rd Less-Resourced Languages workshop,
        > 6th Language & Technology Conference, Poznan (Poland).

    DictPhon is using the following convention:
        - minus separate phones,
        - pipes separate pronunciation variants.

    """
    def __init__(self, pdict, maptable=None):
        """
        Constructor.

        @param pdict (DictPron) is the pronunciations dictionary.
        @param maptable (Mapping) is a mapping table for phones.
        @raise TypeError if `pdict` or `maptable` has an unexpected type.

        """
        self.set_dict(pdict)
        self.set_maptable(maptable)
        self._dagphon = DAGPhon()

    # -----------------------------------------------------------------------

    def set_dict(self, pdict):
        """
        Set the pronunciation dictionary.
        The phonetizer of unknown entries is re-created from this dictionary.

        @param pdict (DictPron) The pronunciation dictionary.
        @raise TypeError if `pdict` is not a DictPron instance.

        """
        if not isinstance(pdict, DictPron):
            raise TypeError('Expected a DictPron instance.')

        self._pdict = pdict
        self._phonunk = PhonUnk(self._pdict.get_dict())

    # -----------------------------------------------------------------------

    def set_maptable(self, maptable):
        """
        Set the mapping table for phones.
        If no table is given, a new (default-constructed) Mapping is used.
        In both cases, the "keep missing" option of the table is disabled.

        @param maptable (Mapping) The mapping table, or None.
        @raise TypeError if `maptable` is neither None nor a Mapping instance.

        """
        if maptable is None:
            maptable = Mapping()
        elif not isinstance(maptable, Mapping):
            raise TypeError('Expected a Mapping instance.')

        self._maptable = maptable
        self._maptable.set_keepmiss(False)

    # -----------------------------------------------------------------------

    def get_phon_entry(self, entry):
        """
        Return the phonetization of an entry.
        Unknown entries are not automatically phonetized.
        This is a pure dictionary-based method.

        @param `entry` (str) The token to phonetize.
        @return A string with the phonetization of `entry` (mapped with the
        mapping table), an empty string for entries that must be ignored,
        or the unknown symbol of the dictionary.

        """
        entry = ToStrip(entry)

        # Specific strings... for the italian transcription...
        # For the participation at the CLIPS-Evalita 2011 campaign.
        if entry.startswith(u"<") and entry.endswith(u">"):
            entry = entry[1:-1]

        # No entry! Nothing to do.
        if len(entry) == 0:
            return ""

        # Specific strings used in the CID transcription...
        # CID is Corpus of Interactional Data, http://sldr.org/sldr000720
        if entry.startswith((u"gpd_", u"gpf_")):
            return ""

        # Specific strings used in SPPAS IPU segmentation...
        if u"ipu_" in entry:
            return ""

        # Find entry in the dict as it is given
        _strphon = self._pdict.get_pron(entry)

        # The entry is missing in the dictionary.
        if _strphon == self._pdict.unkstamp:
            return self._pdict.unkstamp

        # OK, the entry is properly phonetized.
        return self._map_phonentry(_strphon)

    # -----------------------------------------------------------------------

    def get_phon_tokens(self, tokens, phonunk=True):
        """
        Return the phonetization of a list of tokens, with the status.
        Unknown entries are automatically phonetized if `phonunk` is set to True.

        The status of a token is:
            - OK_ID if the token is given in SAMPA or found in the dictionary;
            - WARNING_ID if the phonetization was built from the parts of a
              compound or by the phonetizer of unknown entries;
            - ERROR_ID if no phonetization was found.

        @param `tokens` (list) is the list of tokens to phonetize.
        @param `phonunk` (bool) Phonetize unknown words (or not).
        @todo EOT is not fully supported.

        @return A list with the tuple (token, phon, status).

        """
        tab = []

        for entry in tokens:
            status = OK_ID

            # Enriched Orthographic Transcription Convention:
            # entry can be already in SAMPA.
            if entry.startswith("/") and entry.endswith("/"):
                # Must use SAMPA (including minus to separate phones)
                phon = entry.strip("/")

            else:
                phon = self.get_phon_entry(entry)

                if phon == self._pdict.unkstamp:
                    status = ERROR_ID

                    # A missing compound word?
                    if "-" in entry or "'" in entry or "_" in entry:
                        _tabpron = [self.get_phon_entry(w) for w in re.split(u"[-'_]", entry)]

                        # OK, finally the entry is in the dictionary?
                        if self._pdict.unkstamp not in _tabpron:
                            # ATTENTION: each part can have variants! must be decomposed.
                            self._dagphon.variants = 4
                            phon = ToStrip(self._dagphon.decompose(" ".join(_tabpron)))
                            status = WARNING_ID

                    if phon == self._pdict.unkstamp and phonunk is True:
                        try:
                            phon = self._phonunk.get_phon(entry)
                            status = WARNING_ID
                        except Exception:
                            # Best effort: the token keeps the unknown symbol
                            # and the ERROR_ID status assigned above.
                            pass

            tab.append((entry, phon, status))

        return tab

    # -----------------------------------------------------------------------

    def phonetize(self, utterance, phonunk=True, delimiter=" "):
        """
        Return the phonetization of an utterance.

        @param `utterance` (str) is the utterance to phonetize.
        @param `phonunk` (bool) Phonetize unknown words (or not).
        @param `delimiter` (char) The character to use as tokens separator in `utterance`.

        @return A string with the phonetization of `utterance`.
        @raise TypeError if `delimiter` is longer than one character.

        """
        if len(delimiter) > 1:
            raise TypeError('Delimiter must be a character.')

        tab = self.get_phon_tokens(utterance.split(delimiter), phonunk)

        # Only the phonetization (2nd item of each tuple) is kept.
        return delimiter.join([t[1] for t in tab]).strip()

    # -----------------------------------------------------------------------
    # Private
    # -----------------------------------------------------------------------

    @staticmethod
    def _unique_in_order(items):
        """
        Return the items without duplicates, in order of first occurrence.

        @param items (iterable) Hashable items.
        @return A list.

        """
        seen = set()
        result = []
        for item in items:
            if item not in seen:
                seen.add(item)
                result.append(item)
        return result

    # -----------------------------------------------------------------------

    def _map_phonentry(self, phonentry):
        """
        Map phonemes of a phonetized entry.

        @param phonentry (str) Phonetization of an entry: pipes separate
        the variants.
        @return `phonentry` unchanged if the mapping table is empty,
        otherwise the pipe-separated mapped variants.

        """
        if self._maptable.is_empty() is True:
            return phonentry

        return "|".join([self._map_variant(v) for v in phonentry.split("|")])

    # -----------------------------------------------------------------------

    def _map_variant(self, phonvariant):
        """
        Map phonemes of only one variant of a phonetized entry.

        @param phonvariant (str) One phonetization variant of an entry.
        @return A string with the pipe-separated variants, without
        duplicates, in their order of first occurrence.

        """
        subs = []
        # Each phone sequence is proposed as it is and, if the table
        # returns something for it, with its mapping as alternative.
        for p in self._map_split_variant(phonvariant):
            mapped = self._maptable.map_entry(p)
            if len(mapped) > 0:
                subs.append(p + "|" + mapped)
            else:
                subs.append(p)

        self._dagphon.variants = 0
        phon = ToStrip(self._dagphon.decompose(" ".join(subs)))

        # Remove un-pronounced phonemes!!!
        # By convention, they are represented by an underscore in the
        # mapping table.
        tmp = []
        for p in phon.split('|'):
            tmp.append("-".join([x for x in p.split("-") if x != "_"]))

        # De-duplicate while keeping a stable order (a set has no order).
        return "|".join(self._unique_in_order(tmp))

    # -----------------------------------------------------------------------

    def _map_split_variant(self, phonvariant):
        """
        Return a list of the longest phone sequences.

        @param phonvariant (str) One variant: minus separate phones.
        @return A list of strings: each one is the longest sequence of
        phones that is a key of the mapping table, or a single phone.

        """
        phones = phonvariant.split("-")
        if len(phones) == 1:
            return phones

        tab = []
        idx = 0
        maxidx = len(phones)

        while idx < maxidx:
            # Find the index of the longest phone sequence that can be mapped
            leftindex = self.__longestlr(phones[idx:maxidx])
            # Append such a longest sequence in tab
            tab.append("-".join(phones[idx:idx + leftindex]))
            idx = idx + leftindex

        return tab

    # -----------------------------------------------------------------------

    def __longestlr(self, tabentry):
        """
        Select the longest map of an entry.

        @param tabentry (list) List of phones.
        @return The number of leading phones of the longest sequence that is
        a key of the mapping table, or 1 if there is none.

        """
        i = len(tabentry)
        while i > 0:
            # Find in the map table a substring from 0 to i
            entry = "-".join(tabentry[:i])
            if self._maptable.is_key(entry):
                return i
            i = i - 1

        # Did not find any map for this entry! Return the shortest.
        return 1
예제 #4
0
class PhonUnk:
    """
    @author:       Brigitte Bigi
    @organization: Laboratoire Parole et Langage, Aix-en-Provence, France
    @contact:      [email protected]
    @license:      GPL, v3
    @copyright:    Copyright (C) 2011-2016  Brigitte Bigi
    @summary:      Perform a G2P conversion for unknown entries.

    Phonetization, also called grapheme-phoneme conversion, is the process of
    representing sounds with phonetic signs. This class implements a
    language-independent algorithm to phonetize unknown tokens.
    The algorithm is based on the idea that given enough examples it should be
    possible to predict the pronunciation of unseen tokens purely by analogy.

    At this stage, it consists in exploring the unknown token from left to
    right, then from right to left, and to find the longest strings in the
    dictionary. Since this algorithm uses the dictionary, the quality of
    such a phonetization strongly depends on this resource.

    For details, see the following reference:

        > Brigitte Bigi (2013).
        > A phonetization approach for the forced-alignment task,
        > 3rd Less-Resourced Languages workshop, 6th Language & Technology
        > Conference, Poznan (Poland).

    Example of use:

        >>> d = { 'a':'a|aa', 'b':'b', 'c':'c|cc', 'abb':'abb', 'bac':'bac' }
        >>> p = PhonUnk(d)

    """
    def __init__(self, prondict):
        """
        Constructor.

        @param prondict (dict) with a set of couples: token=key, phon=value.

        """
        self.prondict = prondict
        self.dagphon = DAGPhon(variants=4)

    # ------------------------------------------------------------------
    # Getters and Setters
    # ------------------------------------------------------------------

    def set_variants(self, v):
        """
        Fix the maximum number of variants.

        @param v (int) If v is set to 0, all variants will be returned.
        @raise ValueError if v is lower than 0 or greater than 20.

        """
        if v < 0 or v > 20:
            raise ValueError('Unexpected value for the number of variants.')
        self.dagphon.variants = v

    # -----------------------------------------------------------------------

    def get_phon(self, entry):
        """
        Return the phonetization of an unknown entry.

        The entry is stripped and lowered, one non-alphanumeric character is
        removed at its end and at its beginning, then it is split at
        minus, apostrophe, underscore and whitespace. Each segment is
        phonetized from left to right and from right to left, and both
        results are given to the DAGPhon to create the output.

        @param entry is the string to phonetize
        @return a string with the proposed phonetization, or an empty
        string if nothing remains of the entry after its normalization
        @raise Exception if the word can NOT be phonetized

        """
        _str = rutils.ToStrip(entry)
        _str = rutils.ToLower(_str)
        if len(_str) > 0 and _str[-1].isalnum() is False:
            _str = _str[:-1]
        if len(_str) > 0 and _str[0].isalnum() is False:
            _str = _str[1:]
        if len(_str) == 0:
            return ""

        # The size limit is tested on the entry as it was given.
        if len(entry) > LIMIT_SIZE:
            raise Exception('Unable to phonetize the unknown token (too long): '+entry)

        # Find all pronunciations of segments with a longest matching algo.
        # Raw string: "\s" must reach the regex engine as it is.
        partslr = []
        partsrl = []
        for s in re.split(r"[-'_\s]", _str):
            plr = self.__recurslr(s).strip()
            if len(plr) > 0:
                partslr.append(plr)

            prl = self.__recursrl(s).strip()
            if len(prl) > 0:
                partsrl.append(prl)

        pronlr = " ".join(partslr)
        pronrl = " ".join(partsrl)

        # Create the output
        pron = ""
        if len(pronlr) > 0 and len(pronrl) > 0:
            pron = self.dagphon.decompose(pronlr, pronrl)
        elif len(pronlr) > 0:
            pron = self.dagphon.decompose(pronlr)
        elif len(pronrl) > 0:
            pron = self.dagphon.decompose(pronrl)

        if len(pron) > 0:
            return pron

        raise Exception('Unable to phonetize the unknown token: '+entry)

    # -----------------------------------------------------------------------
    # Private
    # -----------------------------------------------------------------------

    def __longestlr(self, entry):
        """
        Select the longest beginning of an entry that is in the dictionary.
        Candidates are tested from the whole entry down to its first character.

        @param entry (str)
        @return (int) The length of the longest beginning of `entry` that is
        a key of the dictionary, or 0 if there is none.

        """
        i = len(entry)
        while i > 0:
            # Find in the dictionary a substring from 0 to i
            # ("in" replaces dict.has_key(), which Python 3 removed).
            if entry[:i] in self.prondict:
                # Return index for the longest string
                return i
            i = i - 1

        # Did not find any pronunciation for this entry!
        return 0

    # -----------------------------------------------------------------------

    def __recurslr(self, entry):
        """
        Recursive method to find a phonetization of a supposed unknown entry,
        exploring it from left to right.

        @param entry (str)
        @return A string with the proposed phonetization.
        Spaces separate segments.

        """
        if len(entry) == 0:
            return ""

        # LEFT:
        # ###########
        # Find the index of the longest left string that can be phonetized
        leftindex = self.__longestlr(entry)
        if leftindex == 0:
            # Nothing can be phonetized at the left part!
            # The first character is skipped.
            _phonleft = ""
            leftindex = 1
        else:
            # left is from the first to the leftindex character in entry.
            # It is a key of the dictionary: __longestlr found it.
            _phonleft = self.prondict.get(entry[:leftindex])
            # The entire entry can be phonetized (nothing to do at right)
            if leftindex == len(entry):
                return _phonleft

        # RIGHT:
        # ###########
        right = entry[leftindex:]
        if len(right) == 0:
            return _phonleft
        if right in self.prondict:
            _phonright = self.prondict.get(right)
        else:
            # If right part of the entry is unknown...
            # Use recursivity to phonetize
            _phonright = self.__recurslr(right)

        if len(_phonleft) > 0 and len(_phonright) > 0:
            return _phonleft + " " + _phonright

        # NOTE(review): if the right part gives nothing, the result is empty
        # even when the left part was phonetized. Kept as it is.
        return _phonright

    # -----------------------------------------------------------------------

    def __longestrl(self, entry):
        """
        Select the longest ending of an entry that is in the dictionary.
        Candidates are tested from the whole entry down to its last character.

        @param entry (str)
        @return (int) The index at which the longest ending of `entry` that
        is a key of the dictionary starts, or the length of `entry` if there
        is none.

        """
        i = 0
        while i < len(entry):
            # Find in the dictionary a substring from i to the entry-length
            if entry[i:] in self.prondict:
                # Return index for the longest string
                return i
            i = i + 1

        # Did not find any pronunciation for this entry!
        return len(entry)

    # -----------------------------------------------------------------------

    def __recursrl(self, entry):
        """
        Recursive method to find a phonetization of a supposed unknown entry,
        exploring it from right to left.

        @param entry (str)
        @return A string with the proposed phonetization.
        Spaces separate segments.

        """
        if len(entry) == 0:
            return ""

        # RIGHT:
        # ###########
        # Find the index of the longest right string that can be phonetized
        rightindex = self.__longestrl(entry)
        if rightindex == len(entry):
            # Nothing can be phonetized at the right part!
            # The last character is skipped.
            _phonright = ""
            rightindex = len(entry) - 1
        else:
            # right is from the rightindex character to the end of entry.
            # It is a key of the dictionary: __longestrl found it.
            _phonright = self.prondict.get(entry[rightindex:])
            # The entire entry can be phonetized (nothing to do at left)
            if rightindex == 0:
                return _phonright

        # LEFT:
        # ###########
        left = entry[:rightindex]
        if len(left) == 0:
            return _phonright
        if left in self.prondict:
            _phonleft = self.prondict.get(left)
        else:
            # If left part of the entry is unknown...
            # Use recursivity to phonetize
            _phonleft = self.__recursrl(left)

        if len(_phonleft) > 0 and len(_phonright) > 0:
            return _phonleft + " " + _phonright

        # NOTE(review): if the left part gives nothing, the result is empty
        # even when the right part was phonetized. Kept as it is.
        return _phonleft