Пример #1
    def write_audio_tracks(self, inputaudio, units, diralign, silence=0.):
        """
        Write the first channel of an audio file into separated track files.
        Re-sample to 16000 Hz, 16 bits.

        One file is written per entry of units; file names are produced by
        self._tracknames.audiofilename() with a track number starting at 1.

        @param inputaudio (str - IN) File name of the audio file.
        @param units     (list - IN) List of tuples (start-time,end-time) of tracks.
        @param diralign   (str - IN) Directory to write audio tracks.
        @param silence  (float - IN) Duration of a silence to surround the tracks.

        """
        # Only the first channel (index 0) is used; it is converted once and
        # then shared by every track extraction below.
        channel = autils.extract_audio_channel(inputaudio, 0)
        channel = autils.format_channel(channel, 16000, 2)

        # Tracks are numbered from 1, following the order of units.
        for number, (start, end) in enumerate(units, 1):
            fragment = autils.extract_channel_fragment(channel, start, end, silence)
            trackname = self._tracknames.audiofilename(diralign, number)
            autils.write_channel(trackname, fragment)
Пример #2
    def create_chunks(self, inputaudio, phontier, toktier, diralign):
        """
        Create time-aligned tiers from raw input tiers.

        Anchors are searched in the first channel of the audio file with a
        multi-pass ASR: each pass uses a smaller N-gram value and a shorter
        window, until a pass adds no new anchor or check_holes_ntokens()
        stops returning False.

        @param inputaudio (str - IN) File name of the audio file.
        @param phontier (Tier - IN) the tier with phonetization
        @param toktier  (Tier - IN) the tier with tokenization to split
        @param diralign (str - IN) the directory to work.
        @return the Transcription named "Chunks"
        @raise IOError if the number of phonetized items differs from the
               number of tokens.

        """
        trsoutput = Transcription("Chunks")

        # Extract the audio channel
        channel = autils.extract_audio_channel(inputaudio, 0)
        channel = autils.format_channel(channel, 16000, 2)

        # Extract the lists of tokens and their corresponding pronunciations
        pronlist = self._tier2raw(phontier, map=True).split()
        toklist = self._tier2raw(toktier, map=False).split()
        if len(pronlist) != len(toklist):
            raise IOError("Inconsistency between the number of items in phonetization %d and tokenization %d."%(len(pronlist),len(toklist)))

        # At a first stage, we'll find anchors.
        anchortier = AnchorTier()
        anchortier.set_duration(channel.get_duration())

        # Search silences and use them as anchors.
        if self.SILENCES is True:
            anchortier.append_silences(channel)

        # Estimates the speaking rate (amount of tokens/sec. in average)
        self._spkrate.eval_from_duration(channel.get_duration(), len(toklist))

        # Multi-pass ASR to find anchors
        A = -1      # number of anchors in the preceding pass
        N = self.N  # decreasing N-gram value
        W = self.W  # decreasing window length

        # Stop as soon as a pass did not add any anchor (A unchanged), or when
        # check_holes_ntokens(self.NBT) no longer returns False.
        while A != anchortier.GetSize() and anchortier.check_holes_ntokens(self.NBT) is False:

            anchortier.set_windelay(W)
            A = anchortier.GetSize()

            # Arguments are passed to logging so that the messages are only
            # formatted when the debug level is enabled.
            logging.debug(" =========================================================== ")
            logging.debug(" Number of anchors: %d", A)
            logging.debug(" N: %d", N)
            logging.debug(" W: %d", W)

            # perform ASR and append new anchors in the anchor tier (if any)
            self._asr(toklist, pronlist, anchortier, channel, diralign, N)

            # save an intermediate result, only if this pass found new anchors
            # NOTE(review): nothing visible in this method adds the anchor tier
            # to trsoutput before it is written -- confirm this is intended.
            if self.ANCHORS is True and A != anchortier.GetSize():
                annotationdata.io.write(os.path.join(diralign, "ANCHORS-%d.xra" % anchortier.GetSize()), trsoutput)

            # prepare next pass: both values decrease down to their minimum
            W = max(W-1., self.WMIN)
            N = max(N-1,  self.NMIN)

        # Then, anchors are exported as tracks.
        # NOTE(review): tiert and tierp are not used after being created, and
        # trsoutput is returned without them -- presumably they should be
        # added to trsoutput; verify against the Transcription API.
        tiert = anchortier.export(toklist)
        tierp = anchortier.export(pronlist)

        return trsoutput