Example #1
    def test_code_switching(self):
        dictdir  = os.path.join(RESOURCES_PATH, "vocab")
        vocabfra = os.path.join(dictdir, "fra.vocab")
        vocabcmn = os.path.join(dictdir, "cmn.vocab")

        wds = WordsList(vocabfra)
        wds.load_from_ascii( vocabcmn )
        self.assertEqual( wds.get_size(), 458002)
Example #2
 def test_all(self):
     l = WordsList( VOCAB )
     self.assertEqual(l.get_size(), 20 )
     self.assertTrue( l.is_unk('toto') )
     self.assertFalse( l.is_unk('normale') )
     self.assertFalse( l.is_unk("isn't") )
     self.assertFalse( l.is_unk(u"đ") )
     l.add(u"être")
     self.assertTrue( l.is_in(u"être") )
     self.assertTrue( l.is_unk("être") )
Example #3
    def test_code_switching(self):
        dictdir  = os.path.join(RESOURCES_PATH, "vocab")
        vocabfra = os.path.join(dictdir, "fra.vocab")
        vocabcmn = os.path.join(dictdir, "cmn.vocab")

        wds = WordsList(vocabfra)
        wds.load_from_ascii( vocabcmn )
        self.assertEqual( wds.get_size(), 434333)

        self.tok.set_vocab( wds )
        splitswitch = self.tok.tokenize(u'et il m\'a dit : "《干脆就把那部蒙人的闲法给废了拉倒!》RT @laoshipukong : 27日"')
        self.assertEqual(splitswitch, u"et il m' a dit 干脆 就 把 那 部 蒙 人 的 闲 法 给 废 了 拉倒 rt @ laoshipukong 二十七 日")
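
A side note on the code-switching test above: mixed French/Chinese input works because Chinese text is split character by character while Latin words and digit sequences are kept whole (see DictTok.split_characters in Example #10). A rough standalone sketch of that idea, using only the standard library; the function name and the regular expression are illustrative, not the SPPAS implementation:

# -*- coding: utf-8 -*-
import re

def split_cjk_characters(utt):
    """Separate every character with a space, then re-glue runs of
    ASCII letters and digits so Latin tokens stay whole (sketch only)."""
    spaced = u" ".join(utt)
    return re.sub(u"([0-9a-zA-Z](?: [0-9a-zA-Z])*)",
                  lambda m: m.group(0).replace(u" ", u""),
                  spaced)

# split_cjk_characters(u"rt laoshipukong 27日")
#   -> spaced u"r t   l a o ..." is re-glued to u"rt   laoshipukong   27 日"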
Example #4
    def __init__(self, filename=None):
        """
        Constructor.
        Add events to the list: laughter, dummy, noise, silence.

        @param filename (str) is the phoneset file name, i.e. a file with 1 column.

        """
        WordsList.__init__(self, filename, nodump=True, casesensitive=True)
        self.add("@@")
        self.add("dummy")
        self.add("gb")
        self.add("sil")
Example #5
    def __init__(self, resourcefile, logfile=None):
        """
        Create a new sppasRepetition instance.

        @param resourcefile is either the lemma dictionary or the list of stop-words.

        Attention: the extension of the resource file name matters: it must be
        ".stp" for stop-words and ".lem" for lemmas (case-sensitive)!

        """

        # Members
        self._merge         = False  # Merge input in the output
        self._use_lemmatize = True   # Lemmatize the input
        self._use_stopwords = True   # Add specific stopwords of the input
        self._empan  = 5             # Detection length (nb of IPUs; 1=current IPU)
        self._alpha  = 0.5           # Specific stop-words threshold coefficient
        self.logfile = logfile
        self.lemmatizer = None
        self.stopwords  = None

        # Create the lemmatizer instance
        try:
            lemmafile = resourcefile.replace(".stp", ".lem")
            self.lemmatizer = LemmaDict(lemmafile)
        except Exception:
            self._use_lemmatize = False

        if (self._use_lemmatize is True and self.lemmatizer.get_size() == 0) or self._use_lemmatize is False:
            if logfile is not None:
                logfile.print_message("Lemmatization disabled.",indent=2,status=3)
            else:
                print " ... ... [ INFO ] Lemmatization disabled."
            self._use_lemmatize = False

        # Create the list of stop words (list of non-relevant words)
        try:
            stopfile = resourcefile.replace(".lem", ".stp")
            self.stopwords = WordsList(filename=stopfile, nodump=True)
            if self.stopwords.get_size() == 0:
                self._use_stopwords = False
        except Exception:
            self.stopwords = WordsList()

        #if (self._use_stopwords is True and self.stopwords.get_size() == 0) or self._use_stopwords is False:
        if self._use_stopwords is False:
            if logfile is not None:
                logfile.print_message("StopWords disabled.",indent=2,status=3)
            else:
                print " ... ... [ INFO ] StopWords disabled."
Example #6
 def test_save(self):
     l = WordsList( VOCAB )
     l.save( VOCAB2 )
     l2 = WordsList( VOCAB2 )
     self.assertEqual(l.get_size(), l2.get_size())
     for w in l.get_list():
         self.assertTrue(l2.is_in(w))
Example #7
 def test_save(self):
     l = WordsList( VOCAB, nodump=True )
     l.save( VOCAB_TEST )
     l2 = WordsList( VOCAB_TEST, nodump=True )
     self.assertEqual(l.get_size(), l2.get_size())
     for w in l.get_list():
         self.assertTrue(l2.is_in(w))
Example #8
    def testVocab(self):
        wds = WordsList()
        wds.add("a")
        wds.add("b")
        wds.add("c")
        ngramcounter = NgramCounter(1,wds)
        ngramcounter.count( self.corpusfile )

        self.assertEqual(ngramcounter.get_count('a'), 15)
        self.assertEqual(ngramcounter.get_count('b'), 10)
        self.assertEqual(ngramcounter.get_count('c'), 4)
        self.assertEqual(ngramcounter.get_count('d'), 0)
        self.assertEqual(ngramcounter.get_count(UNKSTAMP), 3)
        self.assertEqual(ngramcounter.get_count(START_SENT_SYMBOL), 0)
        self.assertEqual(ngramcounter.get_count(END_SENT_SYMBOL), 3)
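
The expected counts above follow a restricted-vocabulary scheme: tokens outside the vocabulary {a, b, c} are mapped to the unknown symbol, and one end-of-sentence symbol is counted per sentence. A minimal standalone sketch of that counting scheme, independent of the NgramCounter class (the symbol names below are placeholders, not the SPPAS constants):

from collections import defaultdict

UNK = "<UNK>"   # placeholder for UNKSTAMP
END = "</s>"    # placeholder for END_SENT_SYMBOL

def count_unigrams(sentences, vocab):
    """Count unigrams, mapping out-of-vocabulary tokens to UNK and
    adding one END symbol per sentence."""
    counts = defaultdict(int)
    for sent in sentences:
        for tok in sent.split():
            counts[tok if tok in vocab else UNK] += 1
        counts[END] += 1
    return counts

counts = count_unigrams(["a b a", "a c d"], vocab={"a", "b", "c"})
# counts["a"] == 3, counts[UNK] == 1, counts[END] == 2, counts["d"] == 0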
Example #9
    def __init__(self, vocab=None, lang="und"):
        """
        Create a new DictTok instance.

        @param vocab (WordsList)
        @param lang is the language code in iso639-3.

        """
        # resources
        self.dicoutf = DictReplUTF8()
        self.repl    = DictRepl(None)
        self.punct   = WordsList(None)
        self.vocab   = vocab
        self.speech  = True   # transcribed speech (and not written text) is to be tokenized 
        if vocab is None:
            self.vocab = WordsList(None)

        # members
        self.lang = lang
        self.num2letter = sppasNum( lang )
        self.delimiter = u' '
Example #10
class DictTok:
    """
    @authors: Brigitte Bigi, Tatsuya Watanabe
    @contact: [email protected]
    @license: GPL, v3
    @summary: Tokenization automatic annotation.

    The creation of text corpora requires a sequence of processing steps to
    constitute and normalize the corpus before a given application can exploit
    it directly. This class implements a generic approach to text normalization
    and concentrates on the aspects of methodology and linguistic engineering
    that serve to develop a multi-purpose multilingual text corpus.
    The approach consists of splitting the text normalization problem into a
    set of minor sub-problems that are as language-independent as possible.

    From the manual Enriched Orthographic Transcription, the tokenizer
    automatically generates two derived orthographic transcriptions: the
    "standard" transcription (the list of orthographic tokens) and the "faked
    spelling", a specific transcription whose tokens are used by the
    phonetization system.

    The following illustrates an utterance text normalization in French:

    - Transcription:   j'ai on a j'ai p- (en)fin j'ai trouvé l(e) meilleur moyen c'était d(e) [loger,locher] chez  des amis
    (English translation: I've we've I've - well I found the best way was to live in friends' apartment)

    - Resulting Standard tokens:  j' ai on a j' ai p- enfin j' ai trouvé le meilleur moyen c'était de loger  chez  des amis
    - Resulting Faked tokens:     j' ai on a j' ai p-   fin j' ai trouvé l  meilleur moyen c'était d  loche  chez  des amis

    See the whole description of the algorithm in the following reference:
        Brigitte Bigi (2011).
        A Multilingual Text Normalization Approach.
        2nd Less-Resourced Languages workshop,
        5th Language & Technology Conference, Poznan (Poland).

    """

    # ------------------------------------------------------------------


    def __init__(self, vocab=None, lang="und"):
        """
        Create a new DictTok instance.

        @param vocab (WordsList)
        @param lang is the language code in iso639-3.

        """
        # resources
        self.dicoutf = DictReplUTF8()
        self.repl    = DictRepl(None)
        self.punct   = WordsList(None)
        self.vocab   = vocab
        self.speech  = True   # transcribed speech (and not written text) is to be tokenized 
        if vocab is None:
            self.vocab = WordsList(None)

        # members
        self.lang = lang
        self.num2letter = sppasNum( lang )
        self.delimiter = u' '

    # End __init__
    # ------------------------------------------------------------------


    # ------------------------------------------------------------------
    # Options
    # ------------------------------------------------------------------

    def set_delim(self, delim):
        """
        Set the delimiter, used to separate tokens.

        @param delim is a unicode character.

        """
        self.delimiter = delim

    # End set_delim
    # -------------------------------------------------------------------------


    def set_vocab(self,vocab):
        """
        Set the lexicon.

        @param vocab is a WordsList().

        """
        self.vocab = vocab

    # -------------------------------------------------------------------------


    def set_repl(self,repl):
        """
        Set the dictionary of replacements.

        @param repl (ReplDict)

        """
        self.repl = repl

    # -------------------------------------------------------------------------


    def set_punct(self,punct):
        """
        Set the list of punctuation.

        @param punct (WordsList)

        """
        self.punct = punct

    # -------------------------------------------------------------------------


    def set_lang(self,lang):
        """
        Set the language.

        @param lang is the language code in iso639-3 (fra, eng, vie, cmn...).

        """
        self.lang = lang

    # -------------------------------------------------------------------------


    # -------------------------------------------------------------------------
    # Language independent modules
    # -------------------------------------------------------------------------

    def split_characters(self,utt):
        """
        Split an utterance by characters.

        @param utt is the utterance (a transcription, a sentence, ...) in utf-8
        @return A string (split character by character, using spaces)

        """
        try:
            y = unicode(utt, 'utf-8')
        except Exception:
            y = utt
        tmp =  " ".join( y )

        # split all characters except numbers and ascii characters
        sstr = re.sub(u"([0-90-9a-zA-ZA-T\s]+\.?[0-90-9a-zA-ZA-T\s]+)", lambda o: u" %s " % o.group(0).replace(" ",""), tmp)
        # and dates... 
        if not self.speech:
            sstr = re.sub(u"([0-90-9\s]+\.?[月年日\s]+)", lambda o: u" %s " % o.group(0).replace(" ",""), sstr)
        # and ・
        sstr = re.sub(u'[\s]*・[\s]*', u"・", sstr)
        return sstr


    def split(self, utt, std=False):
        """
        Split an utterance using spaces or split each character, depending
        on the language.

        @param utt (string): the utterance (a transcription, a sentence, ...)
        @param std (Boolean)

        @return A list (array of string)

        """

        s = utt
        if self.lang == "cmn" or self.lang == "jpn" or self.lang == "yue":
            s = self.split_characters( s )

        toks = s.split()
        s = ""
        for t in toks:
            if not "/" in t: # if not a phonetized entry
                if std is False:
                    if self.lang != "cmn" and self.lang != "jpn" and self.lang != "yue":
                        # Split numbers if stuck to characters
                        # attention: do not replace [a-zA-Z] by [\w] (because \w includes numbers)
                        # and not on Asian languages: it can be a tone!
                        t = re.sub(u'([0-9])([a-zA-Z])', ur'\1 \2', t)
                        t = re.sub(u'([a-zA-Z])([0-9])', ur'\1 \2', t)

                # Split some punctuation
                t = re.sub(u'\\[\\]', ur'\\] \\[', t)

                # Split dots if stuck to a word
                t = re.sub(u' \.([\w-])', ur'. \1', t)
                t = re.sub(u'^\.([\w-])', ur'. \1', t)

            s = s + " " + t

        # Then split each time there is a space and return result
        s = rutils.ToStrip( s )

        return s.split()

    # End split
    # ------------------------------------------------------------------


    def __stick_longest(self, utt, attachement = "_"):
        """ Longest matching algorithm. """
        tabtoks = utt.split(" ")
        i = len(tabtoks)
        while i>0:
            # try to stick all tokens
            _token = attachement.join(tabtoks)
            if self.vocab.is_unk(_token) is False:
                return (i,_token)
            tabtoks.pop()
            i -= 1
        return (1,utt.split(" ")[0])


    def stick(self, utt, attachement = "_"):
        """
        Concatenate sequences of tokens that form a known entry of the
        vocabulary, using the attachment symbol (default: '_').
        Language independent.

        @param utt (list) the utterance (a transcription, a sentence, ...)
        @param attachement (string) the symbol used to join the tokens
        @return A list of strings

        """
        _utt = []
        t1 = 0
        while t1<len(utt):
            sl = utt[t1] # longest string ... in theory!
            lmax = t1+7
            if lmax>len(utt):
                lmax = len(utt)
            for t2 in range(t1+1,lmax):
                sl = sl + " " + utt[t2]
            (i,tok) = self.__stick_longest( rutils.ToStrip( sl ), attachement) # real longest string!
            t1 += i
            _utt.append( rutils.ToStrip( tok ) )

        return _utt

    # End stick
    # ------------------------------------------------------------------


    def replace(self, utt):
        """
        Examine tokens and perform some replacements.
        A dictionary of symbols defines the replacements to operate.

        This method also includes language specific replacements.
        Supported languages are: fra, cmn, jpn, yue, eng, ita, spa, khm, cat, pol.

        @param utt (list) the utterance

        @return A list of strings

        """
        # Specific case of float numbers
        sent = ' '.join(utt)
        sent = re.sub(u'([0-9])\.([0-9])', ur'\1 NUMBER_SEP_POINT \2', sent)
        sent = re.sub(u'([0-9])\,([0-9])', ur'\1 NUMBER_SEP \2', sent)
        sent = rutils.ToStrip( sent )
        _utt = sent.split()

        # Other generic replacements
        _result = []
        for s in _utt:
            if self.repl.is_key( s ):
                s = self.repl.replace(s)
            _result.append(rutils.ToStrip( s ))

        return _result

    # End replace
    # -----------------------------------------------------------------------


    def compound(self, utt):
        """
        Examine unknown tokens containing '-', "'" or '.' and try to split
        them into known entries, depending on the vocabulary.
        Language independent.

        @param utt (list) the utterance
        @return A list of strings

        """
        _utt = []
        for tok in utt:
            # a missing compound word?
            #   --> an unknown token
            #   --> containing a special character
            #   --> that is not a truncated word!
            if self.vocab.is_unk(tok.lower().strip()) is True and (tok.find("-")>-1 or tok.find("'")>-1 or tok.find(".")>-1) and not tok.endswith('-'):
                # Split the unknown token into a list
                # KEEP special chars ('-.) in the array!
                _tabtoks = re.split("([-'.])",tok)

                # Explore the list from left to right
                t1 = 0
                while t1<len(_tabtoks):
                    i = len(_tabtoks)
                    i_ok = 0
                    # Find the longest string in the dict
                    while i>=t1 and i_ok==0:
                        _token = _tabtoks[t1]
                        if i > (t1+1):
                            for j in range(t1+1,i):
                                _token += _tabtoks[j]
                            if self.vocab.is_unk(_token) is False:
                                i_ok = j+1
                        else:
                            i_ok = 1
                            _token = _tabtoks[t1]
                        i -= 1
                    t1 += i_ok
                    _utt.append( rutils.ToStrip( _token ))

            else:
                _utt.append( rutils.ToStrip( tok ))

        return _utt

    # End compound
    # ------------------------------------------------------------------


    def lower(self, utt ):
        """
        Lowercase a list of strings (phonetized entries, containing '/', are left unchanged).

        @param utt (list)

        """
        _utt = []
        for tok in utt:
            if "/" not in tok:
                _utt.append( rutils.ToLower( tok ))
            else:
                _utt.append( tok )

        return _utt

    # End lower
    # ------------------------------------------------------------------


    def remove(self, utt, wlist):
        """
        Remove the tokens of an utterance that are listed in the given
        word list. Only used to remove punctuation.

        @param utt (list)
        @param wlist (WordsList)

        """

        _utt = []
        for tok in utt:
            if wlist.is_unk(tok) is True and "gpd_" not in tok and "ipu_" not in tok:
                _utt.append( tok )

        return _utt

    # End remove
    # ------------------------------------------------------------------


    # ------------------------------------------------------------------
    # EOT specific modules
    # ------------------------------------------------------------------

    def __repl(self, obj):
        """
        Callback for clean_toe.

        @param obj (MatchObject)
        @return string
        """
        # Left part
        # Remove parentheses
        left = obj.group(1).replace('(', '')
        left = left.replace(')', '')
        # Replace spaces with underscores
        left = "_".join(left.split())

        # Right part
        # Remove spaces
        right = obj.group(2)
        right = "".join(right.split())
        return " [%s,%s]" % (left, right)


    def clean_toe(self, entry):
        """
        Clean Enriched Orthographic Transcription.
        The convention includes information that must be removed.

        @param entry (string)
        @return string

        """
        # Proper names: $ name ,P\$
        entry = re.sub(u',\s?[PTS]+\s?[\\/\\\]+\s?\\$', ur'', entry, re.UNICODE)
        entry = re.sub(ur'\$', ur'', entry, re.UNICODE)

        entry = re.sub(u'(gpd_[0-9]+)', ur"\1 ", entry, re.UNICODE)
        entry = re.sub(u'(ipu_[0-9]+)', ur"\1 ", entry, re.UNICODE)

        # Remove invalid parenthesis content
        entry = re.sub(ur'\s+\([\w\xaa-\xff]+\)\s+', ' ', entry, re.UNICODE)
        entry = re.sub(ur'^\([\w\xaa-\xff]+\)\s+', ' ', entry, re.UNICODE)
        entry = re.sub(ur'\s+\([\w\xaa-\xff]+\)$', ' ', entry, re.UNICODE)

        entry = re.sub(ur'\s*\[([^,]+),([^,]+)\]', self.__repl, entry, re.UNICODE)
        return " ".join(entry.split())

    # End clean_toe and __repl
    # ------------------------------------------------------------------


    def toe_spelling(self, entry, std=False):
        """
        Create a specific spelling from an Enriched Orthographic Transcription.

        @param entry (string): the EOT string
        @return a string.

        DevNote: Python’s regular expression engine supports Unicode.
        It can apply the same pattern to either 8-bit (encoded) or
        Unicode strings. To create a regular expression pattern that
        uses Unicode character classes for \w (and \s, and \b), use
        the “(?u)” flag prefix, or the re.UNICODE flag.
        """
        # Ensure all regexp will work!
        _fentry = " " + unicode(entry) + " "

        if std is False:
            # Attach irregular liaisons to the previous token
            _fentry = re.sub(u' =([\w]+)=', ur'-\1', _fentry, re.UNICODE)
        else:
            # Remove Liaisons
            _fentry = re.sub(u' =([\w]+)=', ur' ', _fentry, re.UNICODE)

        # Laughing sequences
        _fentry = re.sub(u"\s?@\s?@\s?", u" ", _fentry, re.UNICODE)

        # Laughing
        _fentry = re.sub(u"([\w\xaa-\xff]+)@", ur"\1 @", _fentry, re.UNICODE)
        _fentry = re.sub(u"@([\w\xaa-\xff]+)", ur"@ \1", _fentry, re.UNICODE)

        # Noises
        _fentry = re.sub(u"([\w\xaa-\xff]+)\*", ur"\1 *", _fentry, re.UNICODE)
        _fentry = re.sub(u"\*([\w\xaa-\xff]+)", ur"* \1", _fentry, re.UNICODE)

        # Transcriptor comments: {comment}
        _fentry = re.sub(u'\\{[\s\w\xaa-\xff\-:]+\\}', ur'', _fentry, re.UNICODE)
        # Transcriptor comments: [comment]
        _fentry = re.sub(u'\\[[\s\w\xaa-\xff\-:]+\\]', ur'', _fentry, re.UNICODE)
        # Transcription comments: (comment)
        _fentry = re.sub(u' \\([\s\w\xaa-\xff\-:]+\\) ', ur'', _fentry, re.UNICODE) # .... warning!

        if std is False:
            # Special elisions (remove parenthesis content)
            _fentry = re.sub(u'\\([\s\w\xaa-\xff\-\']+\\)', ur'', _fentry, re.UNICODE)
        else:
            # Special elisions (keep parenthesis content)
            _fentry = re.sub(u'\\(([\s\w\xaa-\xff\-]+)\\)', ur'\1', _fentry, re.UNICODE)

        # Morphological variants are ignored for phonetization (same pronunciation!)
        _fentry = re.sub(u'\s+\\<([\-\'\s\w\xaa-\xff]+),[\-\'\s\w\xaa-\xff]+\\>', ur' \1', _fentry, re.UNICODE)
        _fentry = re.sub(u'\s+\\{([\-\'\s\w\xaa-\xff]+),[\-\'\s\w\xaa-\xff]+\\}', ur' \1', _fentry, re.UNICODE)
        _fentry = re.sub(u'\s+\\/([\-\'\s\w0-9\xaa-\xff]+),[\-\'\s\w0-9\xaa-\xff]+\\/', ur' \1', _fentry, re.UNICODE)

        if std is False:
            # Special pronunciations (keep right part)
            _fentry = re.sub(u'\s+\\[([\s\w\xaa-\xff/-]+),([\s\w\xaa-\xff/]+)\\]', ur' \2', _fentry, re.UNICODE)
        else:
            # Special pronunciations (keep left part)
            _fentry = re.sub(u'\s+\\[([\s\w\xaa-\xff\\/-]+),[\s\w\xaa-\xff\\/]+\\]', ur' \1', _fentry, re.UNICODE)

        # Proper names: $ name ,P\$
        _fentry = re.sub(u',\s?[PTS]+\s?[\\/\\\]+\s?\\$', ur'', _fentry, re.UNICODE)
        _fentry = re.sub(u'\\$', ur'', _fentry, re.UNICODE)

        # Add a space when some punctuation is stuck to a word
        # TODO: do the same with the whole list of punctuations (in rutils).
#        _fentry = re.sub(u'([:+^@}\(\){~|=]+)([\xaa-\xff]+)', ur'\1 \2', _fentry, re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+),', ur'\1 ,', _fentry, re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\+', ur'\1 +', _fentry, re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+);', ur'\1 ,', _fentry, re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+):', ur'\1 :', _fentry, re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\(', ur'\1 (', _fentry, re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\)', ur'\1 )', _fentry, re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\{', ur'\1 {', _fentry, re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\}', ur'\1 }', _fentry, re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)=', ur'\1 =', _fentry, re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\?', ur'\1 ?', _fentry, re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\!', ur'\1 !', _fentry, re.UNICODE)
        #_fentry = re.sub(u'([\w\xaa-\xff]+)\/', ur'\1 !', _fentry, re.UNICODE) # no: if sampa in special pron.
        _fentry = re.sub(u"\s(?=,[0-9]+)", "" , _fentry, re.UNICODE)

        # Correction of errors
        s = ""
        inpron=False
        for c in _fentry:
            if c == "/":
                inpron = not inpron
            else:
                if c == " " and inpron is True:
                    continue
            s += c
        return rutils.ToStrip(s)

    # End toe_spelling
    # ------------------------------------------------------------------


    # ------------------------------------------------------------------
    # The main tokenize is HERE!
    # ------------------------------------------------------------------

    def tokenize_list(self, utt, std=False):
        """
        Tokenize from a list of entries.
        """
        # Step 2: replace
        try:
            utt = self.replace( utt )
        except IOError:
            # repl file not found
            pass
        except Exception as e:
            raise Exception(" *in replace* "+str(e)+'\n')

        # Step 3: compound
        try:
            utt = self.compound( utt )
        except Exception as e:
            raise Exception(" *in compound* "+str(e)+'\n')

        # Step 4: stick (using the dictionary)
        try:
            attachement = "_"
            if (self.lang=="cmn" or self.lang == "jpn" or self.lang == "yue"):
                attachement = ""
            utt = self.stick( utt,attachement )
        except Exception as e:
            raise Exception(" *in stick* "+str(e)+'\n')


        # Step 5: num2letter
        try:
            _utt = []
            for i in utt:
                if not "/" in utt:
                    _utt.append( self.num2letter.convert( i ) )
                else:
                    _utt.append( i )
            utt = _utt
        except Exception as e:
            pass

        # Step 6: lower
        try:
            utt = self.lower( utt )
        except Exception as e:
            raise Exception(" *in lower* "+str(e)+'\n')

        # Step 7: remove (punctuation)
        try:
            utt = self.remove( utt,self.punct )
        except Exception as e:
            raise Exception(" *in remove* "+str(e)+'\n')

        # Finally, prepare the result
        strres = ""
        for s in utt:
            s = rutils.ToStrip( s )
            strres = strres + u" " + s.replace(u" ",u"_")

        strres = rutils.ToStrip(strres)
        if len(strres)==0:
            return "#"   # or "dummy" ???

        return strres.replace(u" ", self.delimiter)



         
    def tokenize(self, entry, std=False):
        """
        Tokenize an utterance.

        @param entry (UTF8-String) is the utterance (the transcription)
        @param std (Boolean) In case of an enriched transcription, std selects
        whether the output is the standard or the faked spelling

        @return A string (the tokenized transcription)

        **TODO: disable TOE_CLEAN for written text**

        """

        # THE ENTRY (a transcription, a text...) IS A UTF8-STRING
        # -------------------------------------------------------
        _str = rutils.ToStrip( entry )

        # Remove UTF-8 specific characters that are not in our dictionaries!
        try:
            for key in self.dicoutf.get_keys():
                _str = _str.replace( key, self.dicoutf.replace(key) )
        except Exception as e:
            raise UnicodeError('Error during cleaning: %s'%str(e))

        # Enriched Orthographic Transcription
        # Create a faked spelling (default) or a standard spelling
        _str = self.clean_toe(_str)
        _str = self.toe_spelling(_str, std)

        # Step 1: split using spaces (or characters for asian languages)
        try:
            utt = self.split( _str, std )
        except Exception as e:
            raise Exception(" *in split* "+str(e))

        # THE ENTRY IS NOW A LIST OF STRINGS.
        # ---------------------------------------------------
        return self.tokenize_list(utt, std)
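
To make the pipeline above concrete, here is a minimal usage sketch of DictTok: a vocabulary is loaded, the tokenizer is configured, and the same enriched transcription is normalized twice to obtain the faked and the standard spellings. The resource file names are assumptions for illustration; the calls themselves (set_punct, tokenize) are the methods defined above.

# -*- coding: utf-8 -*-
# Illustrative only: resource file names are assumed, not part of SPPAS.
vocab = WordsList("resources/vocab/fra.vocab")
punct = WordsList("resources/vocab/punctuation.txt")

tok = DictTok(vocab, lang="fra")
tok.set_punct(punct)

eot = u"j'ai p- (en)fin j'ai trouvé l(e) meilleur moyen c'était d(e) [loger,locher] chez des amis"

faked    = tok.tokenize(eot, std=False)  # keeps the right part of [x,y], drops elided letters
standard = tok.tokenize(eot, std=True)   # keeps the left part of [x,y] and the elided letters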
Example #11
class sppasRepetition( ):
    """
    SPPAS Automatic Repetition Detection
    (either self-repetitions or other-repetitions).

    This annotation is performed on the basis of aligned-tokens.
    The tokens can be lemmatized from a dictionary.

    The output is made of 2 tiers with intervals for
    sources and echoes.

    How to use sppasRepetition?

    >>> p = sppasRepetition( dictpath, lang )
    >>> p.run(inputtrsname, outputfilename)

    """


    def __init__(self, resourcefile, logfile=None):
        """
        Create a new sppasRepetition instance.

        @param resourcefile is either the lemma dictionary or the list of stop-words.

        Attention: the extension of the resource file name matters: it must be
        ".stp" for stop-words and ".lem" for lemmas (case-sensitive)!

        """

        # Members
        self._merge         = False  # Merge input in the output
        self._use_lemmatize = True   # Lemmatize the input
        self._use_stopwords = True   # Add specific stopwords of the input
        self._empan  = 5             # Detection length (nb of IPUs; 1=current IPU)
        self._alpha  = 0.5           # Specific stop-words threshold coefficient
        self.logfile = logfile
        self.lemmatizer = None
        self.stopwords  = None

        # Create the lemmatizer instance
        try:
            lemmafile = resourcefile.replace(".stp", ".lem")
            self.lemmatizer = LemmaDict(lemmafile)
        except Exception:
            self._use_lemmatize = False

        if (self._use_lemmatize is True and self.lemmatizer.get_size() == 0) or self._use_lemmatize is False:
            if logfile is not None:
                logfile.print_message("Lemmatization disabled.",indent=2,status=3)
            else:
                print " ... ... [ INFO ] Lemmatization disabled."
            self._use_lemmatize = False

        # Create the list of stop words (list of non-relevant words)
        try:
            stopfile = resourcefile.replace(".lem", ".stp")
            self.stopwords = WordsList(filename=stopfile, nodump=True)
            if self.stopwords.get_size() == 0:
                self._use_stopwords = False
        except Exception:
            self.stopwords = WordsList()

        #if (self._use_stopwords is True and self.stopwords.get_size() == 0) or self._use_stopwords is False:
        if self._use_stopwords is False:
            if logfile is not None:
                logfile.print_message("StopWords disabled.",indent=2,status=3)
            else:
                print " ... ... [ INFO ] StopWords disabled."
            #self._use_stopwords = False

    # End __init__
    # ------------------------------------------------------------------


    def fix_options(self, options):
        for opt in options:
            if "merge" == opt.get_key():
                self.set_merge( opt.get_value() )
            elif "stopwords" == opt.get_key():
                self.set_use_stopwords( opt.get_value() )
            elif "lemmatize" == opt.get_key():
                self.set_use_lemmatize( opt.get_value() )
            elif "empan" == opt.get_key():
                self.set_empan( opt.get_value() )
            elif "alpha" == opt.get_key():
                self.set_alpha( opt.get_value() )

    # End fix_options
    # ------------------------------------------------------------------


    # ###################################################################### #
    # Getters and Setters                                                    #
    # ###################################################################### #


    def set_merge(self, merge):
        """
        Fix the merge option.
        If merge is set to True, sppasRepetition() will save the input tiers
        in the output file.

        @param merge (Boolean)

        """
        self._merge = merge

    # End set_merge
    # ----------------------------------------------------------------------


    def set_use_lemmatize(self, use_lemmatize):
        """
        Fix the use_lemmatize option.

        If use_lemmatize is set to True, sppasRepetition() will lemmatize the
        input before the repetition automatic detection.

        @param use_lemmatize (Boolean)

        """
        self._use_lemmatize = use_lemmatize

    # End set_use_lemmatize
    # ----------------------------------------------------------------------


    def set_use_stopwords(self, use_stopwords):
        """
        Fix the use_stopwords option.

        If use_stopwords is set to True, sppasRepetition() will add specific
        stopwords to the stopwords list (deducted from the input text).

        @param use_stopwords (Boolean)

        """
        self._use_stopwords = use_stopwords

    # End set_use_stopwords
    # ----------------------------------------------------------------------


    def set_empan(self, empan):
        """
        Fix the empan option.

        @param empan (int)

        """
        self._empan = empan

    # End set_empan
    # ----------------------------------------------------------------------


    def set_alpha(self, alpha):
        """
        Fix the alpha option.

        @param alpha (int or float)

        """
        self._alpha = alpha

    # End set_alpha
    # ----------------------------------------------------------------------


    # ###################################################################### #
    # Automatic Detection search parameters                                  #
    # ###################################################################### #


    def lemmatize(self, inputtier):
        """
        Lemmatize a tier.

        @param inputtier (Tier)

        """
        if self._use_lemmatize is False:
            return

        lemmatier = inputtier.Copy()

        for i in range(lemmatier.GetSize()):
            lem = self.lemmatizer.get_lem( lemmatier[i].GetLabel().GetValue() )
            lemmatier[i].GetLabel().SetValue( lem )

        return lemmatier

    # ------------------------------------------------------------------


    def relevancy(self, inputtier):
        """
        Add very frequent tokens to a copy of the stopwords list and
        return it as a WordsList instance.

        The relevance of each term is estimated from its number of
        occurrences in the input tier; when its relative frequency exceeds
        a threshold (depending on the 'alpha' coefficient), the term is
        added to the stopwords list.

        @param inputtier (Tier)

        """
        l = self.stopwords.copy()

        # Create the Unigram and put data
        u = Unigram()
        for a in inputtier:
            if a.GetLabel().IsSpeech() is True:
                u.add( a.GetLabel().GetValue() )

        # Estimate if a token is relevant, put in the stoplist
        for token in u.get_tokens():
            freq  = u.get_value(token)
            proba = float(freq) / float(u.get_sum())
            relevant = 1.0 / (float(u.get_size())*float(self._alpha))
            if proba > relevant:
                l.add( token )
                if self.logfile is not None:
                    self.logfile.print_message('Add in the stoplist: '+token, indent=3)
                elif DEBUG is True:
                    print(' ... ... ... Add in the stoplist: '+token.encode('utf8'))

        return l

    # End relevancy
    # ------------------------------------------------------------------


    def find_next_break (self, inputtier, start, empan):
        """
        Return the index of the next interval representing a break,
        i.e. the index of the 'empan'-th silence after the start position.

        @param inputtier (Tier)
        @param start (int) is the position of the token where the search will start
        @param empan (int) is the number of silences to reach

        """
        nbbreaks = 0
        for i in range (start, inputtier.GetSize()):
            if inputtier[i].GetLabel().IsSilence():
                nbbreaks = nbbreaks + 1
                if nbbreaks == empan:
                    return i
        return inputtier.GetSize() - 1

    # End find_next_break
    # ------------------------------------------------------------------


    # ###################################################################### #
    # Automatic Detection search                                             #
    # ###################################################################### #


    def _addrepetition(self, repeatobj, nbrepeats, inputtier1, inputtier2, tokstartsrc, tokstartrep, srctier, reptier):
        """
        Add sources and repetitions from repeatobj to the tiers.
        """

        n = 0
        for i in range(repeatobj.get_repeats_size()):

            # Source
            s,e = repeatobj.get_repeat_source(i)
            srcbegin = inputtier1[tokstartsrc+s].GetLocation().GetBegin()
            srcend   = inputtier1[tokstartsrc+e].GetLocation().GetEnd()
            time = TimeInterval(srcbegin.Copy(), srcend.Copy())
            srcann = Annotation(time, Label("S"+str(nbrepeats+n)))
            try:
                srctier.Add(srcann)
                if DEBUG:
                    print "src annotation added: ",srcann
            except Exception:
                continue

            # Repetition
            rep = repeatobj.get_repeat_repetition(i)
            for r in rep:
                (s,e) = r
                repbegin = inputtier2[tokstartrep+s].GetLocation().GetBegin()
                repend   = inputtier2[tokstartrep+e].GetLocation().GetEnd()
                r = reptier.Lindex(repbegin) #time)
                l = reptier.Rindex(repend) #time)

                # all other cases (no repetition, overlap)
                time = TimeInterval( repbegin.Copy(), repend.Copy() )
                repann = Annotation(time, Label("R"+str(nbrepeats+n)))
                reptier.Add(repann)
                if DEBUG:
                    print "rep annotation added: ",repann

            n = n + 1
        # end for

        return n



    def selfdetection(self, inputtier1):
        """
        Self-Repetition detection.

        @param inputtier1 (Tier)

        """
        # Verifications: is there any data?
        if inputtier1.IsEmpty() is True:
            raise Exception("Repetition. Empty input tokens tier.\n")

        # Update the stoplist
        if self._use_stopwords is True:
            stpw = self.relevancy( inputtier1 )
        else:
            stpw = self.stopwords

        # Create repeat objects
        repeatobj = Repetitions( )

        # Create output data
        srctier = Tier("Sources")
        reptier = Tier("Repetitions")
        nbrepeats = 1

        # Initialization of tokstart and tokend
        tokstart = 0
        if inputtier1[0].GetLabel().IsDummy():
            tokstart = 1
        toksearch = self.find_next_break( inputtier1, tokstart+1 , empan=1)
        tokend    = self.find_next_break( inputtier1, tokstart+1 , empan=self._empan)

        # Detection is here:
        while tokstart < tokend:

            # Build an array with the tokens
            tokens1 = list()
            for i in range(tokstart, tokend+1):
                tokens1.append( inputtier1[i].GetLabel().GetValue() )
            speaker1 = DataSpeaker( tokens1, stpw )

            # Detect repeats in these data
            repeatobj.detect( speaker1, toksearch-tokstart, None )

            # Save repeats
            if repeatobj.get_repeats_size()>0:
                n = self._addrepetition(repeatobj, nbrepeats, inputtier1, inputtier1, tokstart, tokstart, srctier, reptier)
                nbrepeats = nbrepeats + n

            # Prepare next search
            tokstart  = toksearch
            toksearch = self.find_next_break( inputtier1 , tokstart+1 , empan=1 )
            tokend    = self.find_next_break( inputtier1 , tokstart+1 , empan=self._empan )

        return (srctier,reptier)

    # End selfdetection
    # ------------------------------------------------------------------------


    def otherdetection(self, inputtier1, inputtier2):
        """
        Other-Repetition detection.

        @param inputtier (Tier)

        """
        # Verifications: is there any data?
        if inputtier1.IsEmpty() is True:
            raise Exception("Repetition. Empty input tokens tier.\n")

        # Update the stoplist
        if self._use_stopwords is True:
            # other-repetition: relevance of the echoing-speaker
            stpw = self.relevancy( inputtier2 )
        else:
            stpw = self.stopwords

        # Create repeat objects
        repeatobj = Repetitions( )

        # Create output data
        srctier = Tier("OR-Source")
        reptier = Tier("OR-Repetition")

        nbrepeats = 1

        # Initialization of tokstart, and tokend
        tokstartsrc = 0
        if inputtier1[0].GetLabel().IsDummy():
            tokstartsrc = 1
        tokendsrc = min(20, inputtier1.GetSize()-1)

        # Detection is here:
        # detect() is applied word by word, from tokstartsrc to tokendsrc
        while tokstartsrc < tokendsrc:

            # Build an array with the tokens
            tokens1 = list()
            for i in range(tokstartsrc, tokendsrc):
                tokens1.append( inputtier1[i].GetLabel().GetValue() )
            speaker1 = DataSpeaker( tokens1, stpw )

            # Create speaker2
            tokens2 = list()
            nbbreaks = 0
            tokstartrep = -1
            a = inputtier1[tokstartsrc]

            for (r,b) in enumerate(inputtier2):
                if b.GetLocation().GetBeginMidpoint() >= a.GetLocation().GetBeginMidpoint():
                    if tokstartrep == -1:
                        tokstartrep = r
                    if b.GetLabel().IsSilence():
                        nbbreaks = nbbreaks + 1
                    if nbbreaks == self._empan:
                        break
                    tokens2.append( b.GetLabel().GetValue() )
            speaker2 = DataSpeaker( tokens2, stpw )

            if DEBUG is True:
                print "SRC : ",speaker1
                print "ECHO: ",speaker2

            # Detect repeats in these data: search if the first token of spk1
            # is the first token of a source.
            repeatobj.detect( speaker1, 1, speaker2 )

            # Save repeats
            shift = 1
            if repeatobj.get_repeats_size()>0:
                if DEBUG is True:
                    print " ----> found : "
                    repeatobj.get_repeat(0).print_echo()
                s,e = repeatobj.get_repeat_source(0)
                n = self._addrepetition(repeatobj, nbrepeats, inputtier1, inputtier2, tokstartsrc, tokstartrep, srctier, reptier)
                if n > 0:
                    nbrepeats = nbrepeats + n
                shift = e + 1


            while speaker1.is_token(speaker1.get_token(shift)) is False and shift < 20:
                shift = shift + 1

            tokstartsrc = tokstartsrc + shift
            tokstartsrc = min(tokstartsrc, inputtier1.GetSize()-1)
            tokendsrc   = min(tokstartsrc + 20, inputtier1.GetSize()-1)

        return (srctier,reptier)

    # End otherdetection
    # ------------------------------------------------------------------------


    # ###################################################################### #
    # Run
    # ###################################################################### #


    def run(self, inputfilename1, inputfilename2=None, outputfilename=None):
        """
        Run the Repetition Automatic Detection annotation.

        @param inputfilename1 is the input file with the aligned tokens of the main speaker
        @param inputfilename2 (optional) is the input file of the echoing speaker
        @param outputfilename (optional) is the output file name

        """
        tokentier1 = None  # First tier
        tokentier2 = -1    # No echoing speaker
        try:
            # Find the token tier
            trsinput1 = annotationdata.io.read( inputfilename1 )
            for i in range( trsinput1.GetSize() ):
                if "token" in trsinput1[i].GetName().lower() and "align" in trsinput1[i].GetName().lower():
                    tokentier1 = i
                    break
            if inputfilename2 is not None:
                #find the token tier
                trsinput2 = annotationdata.io.read( inputfilename2 )
                for i in range( trsinput2.GetSize() ):
                    if "token" in trsinput2[i].GetName().lower() and "align" in trsinput2[i].GetName().lower():
                        tokentier2 = i
                        break
        except Exception as e:
            raise Exception('Repetitions. '+str(e))

        if tokentier1 is None:
            raise Exception('Repetitions. No valid input tier (expected: TokensAlign).')

        # Lemmatize input?
        if self._use_lemmatize is True and self.lemmatizer:
            tier1 = self.lemmatize( trsinput1[tokentier1] )
            if tokentier2 > -1:
                tier2 = self.lemmatize( trsinput2[tokentier2] )
        else:
            tier1 = trsinput1[tokentier1]
            if tokentier2 > -1:
                tier2 = trsinput2[tokentier2]

        if self.logfile is not None:
            self.logfile.print_message("Empan = "+str(self._empan), indent=3)
            self.logfile.print_message("Alpha = "+str(self._alpha), indent=3)

        # Repetition Automatic Detection
        if tokentier2 == -1:
            (srctier,reptier) = self.selfdetection( tier1 )
        else:
            (srctier,reptier) = self.otherdetection( tier1 , tier2 )

        # Manage results:
        # An output file name is given
        if outputfilename:
            trsoutput = Transcription("Repetitions")
            if self._merge is True:
                for i in range( trsinput1.GetSize() ):
                    trsoutput.Add( trsinput1[i] )
        # the repeat tier is added to the input transcription
        else:
            outputfilename = inputfilename1
            trsoutput = annotationdata.io.read( inputfilename1 )

        # Add repeats to this trsoutput
        trsoutput.Append( srctier )
        trsoutput.Append( reptier )

        trsoutput.SetMinTime( trsinput1.GetMinTime() )
        trsoutput.SetMaxTime( trsinput1.GetMaxTime() ) # hum, in case of OR... not sure! to be verified.

        # Save
        annotationdata.io.write( outputfilename, trsoutput )
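
As a numeric illustration of the relevancy() rule above: a token is added to the stop-list when its relative frequency freq/sum exceeds 1/(size*alpha), where size is the number of distinct tokens and alpha is the coefficient set by set_alpha(). A small self-contained check of that rule (the function name and values are made up for illustration):

def is_stopword_candidate(freq, total, nb_distinct, alpha=0.5):
    """Return True when a token is frequent enough to be added to the
    stop-list, following the rule used in relevancy()."""
    proba = float(freq) / float(total)
    threshold = 1.0 / (float(nb_distinct) * float(alpha))
    return proba > threshold

# With 40 distinct tokens and alpha=0.5, the threshold is 1/20 = 0.05:
print is_stopword_candidate(12, 200, 40)   # 0.06 > 0.05 -> True
print is_stopword_candidate( 8, 200, 40)   # 0.04 < 0.05 -> False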
Example #12
 def test_ita(self):
     l = WordsList( ITA )
     self.assertTrue( l.is_unk('toto') )
     self.assertFalse( l.is_unk(u'perché') )