def gen_alignment(self, phonetization, tokenization, phoneslist, phonesdur, outputalign=None): """ Write an alignment in an output file. @param phonetization (list) phonetization of each token @param tokenization (list) each token @param phoneslist (list) each phone @param phonesdur (int) the duration of each phone in centi-seconds @param outputalign (str) the output file name """ timeval = 0 alignments = [] for phon in phoneslist: tv1 = timeval tv2 = timeval + phonesdur - 1 alignments.append( (tv1, tv2, phon) ) timeval = tv2 + 1 if len(alignments) == 0: alignments = [(0, int(phonesdur), "")] if outputalign is not None: outputalign = outputalign + "." + self._outext alignio = AlignerIO() alignio.write_palign(phonetization, tokenization, alignments, outputalign) return alignments
def __init__(self, model): """ """ self._aligner = aligners.instantiate(model,"julius") self._aligner.set_infersp(False) self._aligner.set_outext("walign") # Map phoneme names from SAMPA model-specific mappingfilename = os.path.join( model, "monophones.repl") if os.path.isfile( mappingfilename ): try: self._mapping = Mapping( mappingfilename ) except Exception: self._mapping = Mapping() else: self._mapping = Mapping() self._alignerio = AlignerIO() self._spkrate = SpeakerRate() self._radius = 0.005 self.N = 5 # initial N-gram to search the anchors self.NMIN = 3 # minimum N-gram to search the anchors self.W = 6. # initial windows delay to search the anchors self.WMIN = 3. # minimum windows delay to search the anchors self.NBT = 12 # maximum number of tokens for chunks (and holes) self.ANCHORS = True # Append anchors into the result self.SILENCES = False # Search automatically silences before anchors
def __init__(self, name="NoName", mintime=0., maxtime=0.): """ Creates a new SegmentsIn instance. """ Transcription.__init__(self, name, mintime, maxtime) self.alignerio = AlignerIO() self._radius = 0.005 self._tracknames = TrackNamesGenerator()
class TestTracksAlign( unittest.TestCase ): def setUp(self): self._alignerio = AlignerIO() def test_matching(self): filename = os.path.join(DATA, "track_000000") wordalign = self._alignerio.read_aligned( filename )[1] self.assertEqual(len(wordalign), 21) ref = [u"好", u"感", u"啦 @", u"好似", u"梁", u"安", u"琪", u"咁", u"係", u"啦", u"係", u"我", u"以前", u"都", u"聽", u"過", u"佢", u"", u"節目", u"喀", u"覺得", u"佢", u"講", u"", u"非常之", u"膚", u"淺", u"及", u"幼稚", u"呀", u"哦", u"即", u"khut6", u"嘩", u"咁"] hyp = [(token,score) for (start,end,token,score) in wordalign] pattern = Patterns() pattern.set_ngram(3) m3 = pattern.ngram_matchings( ref,hyp ) # Search for the lowest value in ref: minr = min( [ v[0] for v in m3 ] ) maxr = max( [ v[0] for v in m3 ] ) newref = ref[minr:maxr+1] minh = min( [ v[1] for v in m3 ] ) maxh = max( [ v[1] for v in m3 ] ) newhyp = hyp[minh:maxh+1] pattern = Patterns() pattern.set_ngram(3) newm3 = pattern.ngram_alignments( newref,newhyp ) newm3 = [ (v[0]+minr,v[1]+minh) for v in newm3 ] self.assertEqual(newm3,[(5, 5), (6, 6), (7, 7), (17, 16), (18, 17), (19, 18)]) pattern = Patterns() pattern.set_score(0.6) pattern.set_ngram(1) m1 = pattern.ngram_alignments( newref,newhyp ) newm1 = [ (v[0]+minr,v[1]+minh) for v in m1 ] self.assertEqual(newm1,[(6, 6)])
def setUp(self): self._alignerio = AlignerIO()
class Chunks( object ): """ @author: Brigitte Bigi @organization: Laboratoire Parole et Langage, Aix-en-Provence, France @contact: [email protected] @license: GPL, v3 @copyright: Copyright (C) 2011-2016 Brigitte Bigi @summary: Write tokenized-phonetized segments from Tiers. """ def __init__(self, model): """ """ self._aligner = aligners.instantiate(model,"julius") self._aligner.set_infersp(False) self._aligner.set_outext("walign") # Map phoneme names from SAMPA model-specific mappingfilename = os.path.join( model, "monophones.repl") if os.path.isfile( mappingfilename ): try: self._mapping = Mapping( mappingfilename ) except Exception: self._mapping = Mapping() else: self._mapping = Mapping() self._alignerio = AlignerIO() self._spkrate = SpeakerRate() self._radius = 0.005 self.N = 5 # initial N-gram to search the anchors self.NMIN = 3 # minimum N-gram to search the anchors self.W = 6. # initial windows delay to search the anchors self.WMIN = 3. # minimum windows delay to search the anchors self.NBT = 12 # maximum number of tokens for chunks (and holes) self.ANCHORS = True # Append anchors into the result self.SILENCES = False # Search automatically silences before anchors # ------------------------------------------------------------------------ # Getters # ------------------------------------------------------------------------ def get_ngram_init(self): return self.N def get_ngram_min(self): return self.NMIN def get_windelay_init(self): return self.W def get_windelay_min(self): return self.WMIN def get_chunk_maxsize(self): return self.NBT def get_anchors(self): return self.ANCHORS def get_silences(self): return self.SILENCES # ------------------------------------------------------------------------ # Setters # ------------------------------------------------------------------------ def set_ngram_init(self, N): if N < 2 or N > 20: raise ValueError('Bad value for N-gram.') self.N = N def set_ngram_min(self, N): if N < 2 or N > 20: raise ValueError('Bad value for N-gram.') self.NMIN = N def set_windelay_init(self, W): if W < 1. and W > 60.: raise ValueError('Bad value for a window delay.') self.W = W def set_windelay_min(self, W): if W < 1. and W > 60.: raise ValueError('Bad value for a window delay.') self.WMIN = W def set_chunk_maxsize(self, T): if T < 2 and T > 200: raise ValueError('Bad value for a chunk size.') self.NBT = T def set_anchors(self, value): self.ANCHORS = bool(value) def set_silences(self, value): self.SILENCES = bool(value) # ------------------------------------------------------------------------ # Workers # ------------------------------------------------------------------------ def create_chunks(self, inputaudio, phontier, toktier, diralign): """ Create time-aligned tiers from raw intput tiers. @param phontier (Tier - IN) the tier with phonetization @param toktier (Tier - IN) the tier with tokenization to split @param diralign (str - IN) the directory to work. """ trsoutput = Transcription("Chunks") # Extract the audio channel channel = autils.extract_audio_channel( inputaudio,0 ) channel = autils.format_channel( channel,16000,2 ) # Extract the lists of tokens and their corresponding pronunciations pronlist = self._tier2raw( phontier,map=True ).split() toklist = self._tier2raw( toktier, map=False ).split() if len(pronlist) != len(toklist): raise IOError("Inconsistency between the number of items in phonetization %d and tokenization %d."%(len(pronlist),len(toklist))) # At a first stage, we'll find anchors. anchortier = AnchorTier() anchortier.set_duration( channel.get_duration() ) anchortier.set_extdelay(1.) anchortier.set_outdelay(0.5) # Search silences and use them as anchors. if self.SILENCES is True: anchortier.append_silences( channel ) # Estimates the speaking rate (amount of tokens/sec. in average) self._spkrate.eval_from_duration( channel.get_duration(), len(toklist) ) # Multi-pass ASR to find anchors A = -1 # number of anchors in the preceding pass N = self.N # decreasing N-gram value W = self.W # decreasing window length while A != anchortier.GetSize() and anchortier.check_holes_ntokens( self.NBT ) is False: anchortier.set_windelay( W ) A = anchortier.GetSize() logging.debug(" =========================================================== ") logging.debug(" Number of anchors: %d"%A) logging.debug(" N: %d"%N) logging.debug(" W: %d"%W) # perform ASR and append new anchors in the anchor tier (if any) self._asr(toklist, pronlist, anchortier, channel, diralign, N) # append the anchor tier as intermediate result if self.ANCHORS is True and A != anchortier.GetSize(): self._append_tier(anchortier,trsoutput) annotationdata.io.write( os.path.join(diralign,"ANCHORS-%d.xra"%anchortier.GetSize()),trsoutput ) # prepare next pass W = max(W-1., self.WMIN) N = max(N-1, self.NMIN) # Then, anchors are exported as tracks. tiert = anchortier.export(toklist) tiert.SetName("Chunks-Tokenized") tierp = anchortier.export(pronlist) tierp.SetName("Chunks-Phonetized") trsoutput.Append(tiert) trsoutput.Append(tierp) return trsoutput # ------------------------------------------------------------------------ # Private # ------------------------------------------------------------------------ def _append_tier(self, tier, trs): """ Append a copy of Tier in self. """ copytier = Tier("Anchors-"+str(tier.GetSize())) for a in tier: ac = a.Copy() try: copytier.Append( ac ) except Exception: logging.debug("Append of annotation in tier failed: %s"%ac) trs.Append(copytier) # ------------------------------------------------------------------------ def _asr(self, toklist, pronlist, anchortier, channel, diralign, N): """ Windowing on the audio to perform ASR and find anchors. """ # init fromtime = 0. totime = float(N) fails = 0 maxtime = channel.get_duration() # windowing logging.debug(" ... Windowing the audio file:") while fromtime < maxtime and fails < 3: try: # Fix exact time range of this window... (fromtime,totime) = anchortier.fix_window(fromtime) if totime > maxtime: totime = maxtime if fromtime >= totime: logging.debug('Stop windowing: %f %f.'%(fromtime,totime)) break # Fix token range of this window... (fromtoken,totoken) = self._fix_trans_interval(fromtime,totime,toklist,anchortier) if fromtoken >= len(toklist): break logging.debug(" ... ... window: ") logging.debug("... ... ... time from %.4f to %.4f."%(fromtime,totime)) logging.debug("... ... ... token from %d to %d."%(fromtoken,totoken)) logging.debug("... ... ... REF: %s"%(" ".join( toklist[fromtoken:totoken] ))) logging.debug("... ... ... HYP: ") # Fix anchors of this window anchors = self._fix_window_asr(fromtime, totime, fromtoken, totoken, channel, pronlist, toklist, diralign, N) if len(anchors) > 0: self._add_anchors( anchors, anchortier ) # Prepare next window fromtime = anchors[-1][1] fails = 0 else: fromtime = totime logging.debug("... ... ... No anchor found.") except Exception: # try your luck in the next window... import traceback logging.info("%s"%str(traceback.format_exc())) fromtime = totime fails = fails + 1 # ------------------------------------------------------------------------ def _fix_trans_interval(self, fromtime, totime, toklist, anchortier): """ Fix the window on the transcript. """ # previous and following anchors af = anchortier.near_indexed_anchor( fromtime, -1 ) at = anchortier.near_indexed_anchor( totime, 1 ) fexact = False wdelay = totime-fromtime ntokens = self._spkrate.ntokens(wdelay)+1 # an anchor is not too far away before if af is not None and af.GetLocation().GetEnd() >= (fromtime-wdelay): # we have an exact position in the token list fromtoken = af.GetLabel().GetTypedValue() + 1 fexact = True else: # we approximate with the speaking rate fromtoken = max(0, self._spkrate.ntokens( fromtime ) - ntokens) # an anchor is not too far away after if at is not None and at.GetLocation().GetBegin() <= (totime+wdelay): # we have an exact position in the token list totoken = at.GetLabel().GetTypedValue() - 1 else: # we approximate with the speaking rate if fexact is True: totoken = min(len(toklist), fromtoken + int(1.5*ntokens)) else: totoken = min(len(toklist), self._spkrate.ntokens( totime ) + ntokens) # ... if fromtoken >= totoken: fromtoken = max(0, fromtoken-1) totoken = min(len(toklist), fromtoken+2) return (fromtoken,totoken) # ------------------------------------------------------------------------ def _fix_window_asr(self, fromtime, totime, fromtoken, totoken, channel, pronlist, toklist, diralign, N): """ Fix asr result in a window. Return the list of anchors the ASR found in that window. """ # create audio file fnw = os.path.join(diralign, "asr") fna = os.path.join(diralign, "asr.wav") trackchannel = autils.extract_channel_fragment( channel, fromtime, totime, 0.2) autils.write_channel( fna, trackchannel ) # call the ASR engine to recognize tokens of this track self._aligner.set_phones( " ".join( pronlist[fromtoken:totoken] ) ) self._aligner.set_tokens( " ".join( toklist[fromtoken:totoken] ) ) self._aligner.run_alignment(fna, fnw, N) # get the tokens time-aligned by the ASR engine wordalign = self._alignerio.read_aligned(fnw)[1] # (starttime,endtime,label,score) wordalign = self._adjust_asr_result(wordalign, fromtime, 0.2) # ignore the last word: we can't know if the word is entire or was cut if len(wordalign) > 3: wordalign.pop() # list of tokens the ASR automatically time-aligned tman = [token for token in toklist[fromtoken:totoken]] # list of tokens manually transcribed tasr = [(token,score) for (start,end,token,score) in wordalign] # Find matching tokens: the anchors matchingslist = self._fix_matchings_list( tman,tasr,N ) anchors = [] for match in matchingslist: i = match[0] # ref hi = match[1] # hyp s = wordalign[hi][0] e = wordalign[hi][1] anchors.append( (s,e,fromtoken+i) ) return anchors # ------------------------------------------------------------------------ def _adjust_asr_result(self, wordsalign, fromtime, silence): """ Adapt wordsalign: remove <s> and </s> then adjust time values. """ # shift all time values of "silence" delta for i in range(len(wordsalign)): wordsalign[i][0] = wordsalign[i][0] - silence wordsalign[i][1] = wordsalign[i][1] - silence # remove the first <s> and the last </s> wordsalign.pop(0) wordsalign.pop() if len(wordsalign) == 0: raise IOError("The ASR failed to find tokens.") # the first token was recognized during the silence we prepended!!!! if wordsalign[0][1] < 0: wordsalign.pop(0) wordsalign[0][0] = 0. # the first token was starting during the silence we prepended. wordsalign[0][0] = max( 0. , wordsalign[0][0] ) # shift all local time values to the real time values in the audioinput for i in range(len(wordsalign)): wordsalign[i][0] = wordsalign[i][0] + fromtime wordsalign[i][1] = wordsalign[i][1] + fromtime logging.debug("... ... ... ... %s - %f"%(wordsalign[i][2],wordsalign[i][3])) return wordsalign # ------------------------------------------------------------------------ def _fix_matchings_list(self, ref, hyp, N ): """ Create the list of matches between ref and hyp. """ pattern = Patterns() pattern.set_ngram( N ) m3 = pattern.ngram_matchings( ref,hyp ) if len( m3 ) == 0: return [] # Search for the lowest/highest values in ref/hyp: listref = [ v[0] for v in m3 ] listhyp = [ v[1] for v in m3 ] minr = min( listref ) maxr = max( listref ) newref = ref[minr:maxr+1] minh = min( listhyp ) maxh = max( listhyp ) newhyp = hyp[minh:maxh+1] # Do some hypothesis were found several times in the reference? if len(listhyp) > len(list(set(listhyp))): pattern.set_ngram( N+2 ) newm3 = pattern.ngram_matchings( ref,hyp ) listref = [ v[0] for v in m3 ] listhyp = [ v[1] for v in m3 ] if len(listhyp) > len(list(set(listhyp))): newm3 = [] else: newm3 = m3 m1 = [] if len(hyp) < N: pattern = Patterns() pattern.set_score(0.9) pattern.set_ngram(1) pattern.set_gap(1) m1 = pattern.ngram_alignments( newref,newhyp ) return sorted(list(set(m1+newm3))) # ------------------------------------------------------------------------ def _add_anchors(self, anchorlist, anchortier): """ Add anchors in the anchor tier """ if len(anchorlist) == 0: return logging.debug('... ... ... Anchors:') for (s,e,i) in anchorlist: # provide overlaps with a previous anchor previ = anchortier.Near( s,-1 ) if previ != -1: prevann = anchortier[previ] if prevann.GetLocation().GetEnd().GetMidpoint() > s: if prevann.GetLabel().IsSilence(): prevann.GetLocation().GetEnd().SetMidpoint(s) if prevann.GetLocation().GetEnd() < prevann.GetLocation().GetBegin(): anchortier.Pop(previ) else: s = prevann.GetLocation().GetEnd().SetMidpoint(s) # provide overlaps with a following anchor nexti = anchortier.Near( e,1 ) if nexti != -1: nextann = anchortier[nexti] if nextann.GetLocation().GetBegin().GetMidpoint() < e: if nextann.GetLabel().IsSilence(): nextann.GetLocation().GetBegin().SetMidpoint(e) if nextann.GetLocation().GetEnd() < nextann.GetLocation().GetBegin(): anchortier.Pop(nexti) else: e = nextann.GetLocation().GetBegin().SetMidpoint(e) valid = True # previous index must be lesser p = anchortier.near_indexed_anchor( s,-1 ) if p is not None: pidx = p.GetLabel().GetTypedValue() if i <= pidx: valid = False else: # solve a small amount of issues... # duration between the previous and the one we want to add deltatime = s-p.GetLocation().GetEnd().GetMidpoint() if deltatime < 0.2: if (i-10) > pidx: valid=False # next index must be higher n = anchortier.near_indexed_anchor(e, 1) if n is not None and i >= n.GetLabel().GetTypedValue(): valid = False # add the anchor if valid is True: time = TimeInterval(TimePoint(s),TimePoint(e)) label = Label( Text(i,data_type="int") ) anchortier.Add( Annotation(time,label) ) logging.debug("... ... ... ... Add: %f %f %d"%(s,e,i)) else: logging.debug("... ... ... ... ... Ignore: %f %f %d"%(s,e,i)) # Then, fill the very evident holes anchortier.fill_evident_holes() # ------------------------------------------------------------------------ def _tier2raw( self,tier,map=False ): """ Return all interval contents into a single string. """ # Map phonemes from SAMPA to the expected ones. self._mapping.set_keepmiss( True ) self._mapping.set_reverse( True ) raw = "" for i,ann in enumerate(tier): if ann.GetLabel().IsEmpty() is True: logging.info("WARNING: Found an empty annotation label at index %d"%i) raw = raw + " sil" else: #if ann.GetLabel().IsSilence() is False: besttext = ann.GetLabel().GetValue() if map is True: besttext = self._mapping.map( besttext ) if UNKSTAMP in besttext: besttext = besttext.replace(UNKSTAMP, "sil") raw = raw + " " + besttext return ToStrip(raw)
class TracksReader( Transcription ): """ @author: Brigitte Bigi @organization: Laboratoire Parole et Langage, Aix-en-Provence, France @contact: [email protected] @license: GPL, v3 @copyright: Copyright (C) 2011-2016 Brigitte Bigi @summary: Read time-aligned segments, convert into Tier. """ def __init__(self, name="NoName", mintime=0., maxtime=0.): """ Creates a new SegmentsIn instance. """ Transcription.__init__(self, name, mintime, maxtime) self.alignerio = AlignerIO() self._radius = 0.005 self._tracknames = TrackNamesGenerator() # ------------------------------------------------------------------------ def set_tracksnames( self, tracknames ): self._tracknames = tracknames # ------------------------------------------------------------------------ def read(self, dirname, units): """ Read a set of alignment files and set as tiers. @param dirname (str - IN) the input directory containing a set of unit @param units (list - IN) List of units with start/end times """ # Verify if the directory exists if os.path.exists( dirname ) is False: raise IOError("Missing directory: " + dirname+".\n") # Create new tiers itemp = self.NewTier("PhonAlign") itemw = self.NewTier("TokensAlign") # Explore each unit to get alignments for track in range(len(units)): # Get real start and end time values of this unit. unitstart,unitend = units[track] basename = self._tracknames.alignfilename(dirname, track+1) (_phonannots,_wordannots) = self.alignerio.read_aligned(basename) # Append alignments in tiers self._append_tuples(itemp, _phonannots, unitstart, unitend) self._append_tuples(itemw, _wordannots, unitstart, unitend) # Adjust Radius if itemp.GetSize()>1: itemp[-1].GetLocation().SetEndRadius(0.) if itemw.GetSize()>1: itemw[-1].GetLocation().SetEndRadius(0.) try: self._hierarchy.addLink('TimeAlignment', itemp, itemw) except Exception: pass # ------------------------------------------------------------------------ def _append_tuples(self, tier, tdata, delta, unitend): """ Append a list of (start,end,text,score) into the tier. Shift start/end of a delta value and set the last end value. """ try: for i,t in enumerate(tdata): (loc_s,loc_e,lab,scr) = t loc_s += delta loc_e += delta if i==(len(tdata)-1): loc_e = unitend # prepare the code in case we'll find a solution with # alternatives phonetizations/tokenization.... #lab = [lab] #scr = [scr] #label = Label() #for l,s in zip(lab,scr): # label.AddValue(Text(l,s)) label = Label( Text(lab,scr) ) annotationw = Annotation(TimeInterval(TimePoint(loc_s,self._radius), TimePoint(loc_e,self._radius)), label) tier.Append(annotationw) except Exception: pass