示例#1
0
文件: ffnn.py 项目: henrypig/alex-1
class FFNNVADGeneral(object):
    """ This is implementation of a FFNN based voice activity detector.

    It only implements decisions whether input frame is speech of non speech.
    It returns the posterior probability of speech for N last input frames.
    """
    def __init__(self, model, filter_length, sample_rate, framesize,
                 frameshift, usehamming, preemcoef, numchans, ceplifter,
                 numceps, enormalise, zmeansource, usepower, usec0, usecmn,
                 usedelta, useacc, n_last_frames, n_prev_frames, lofreq,
                 hifreq, mel_banks_only):
        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(model)

        self.log_probs_speech = deque(maxlen=filter_length)
        self.log_probs_sil = deque(maxlen=filter_length)

        self.last_decision = 0.0

        self.front_end = MFCCFrontEnd(sample_rate, framesize, usehamming,
                                      preemcoef, numchans, ceplifter, numceps,
                                      enormalise, zmeansource, usepower, usec0,
                                      usecmn, usedelta, useacc,
                                      n_last_frames + n_prev_frames, lofreq,
                                      hifreq, mel_banks_only)

        self.framesize = framesize
        self.frameshift = frameshift

    def decide(self, data):
        """Processes the input frame whether the input segment is speech or non speech.

        The returned values can be in range from 0.0 to 1.0.
        It returns 1.0 for 100% speech segment and 0.0 for 100% non speech segment.
        """

        data = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(data)

        while len(self.audio_recorded_in) > self.framesize:
            frame = self.audio_recorded_in[:self.framesize]
            self.audio_recorded_in = self.audio_recorded_in[self.frameshift:]

            mfcc = self.front_end.param(frame)

            prob_sil, prob_speech = self.ffnn.predict_normalise(
                mfcc.reshape(1, len(mfcc)))[0]

            # print prob_sil, prob_speech

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            log_prob_speech_avg = 0.0
            for log_prob_speech, log_prob_sil in zip(self.log_probs_speech,
                                                     self.log_probs_sil):
                log_prob_speech_avg += log_prob_speech - logsumexp(
                    [log_prob_speech, log_prob_sil])
            log_prob_speech_avg /= len(self.log_probs_speech)

            prob_speech_avg = np.exp(log_prob_speech_avg)

            # print 'prob_speech_avg: %5.3f' % prob_speech_avg

            self.last_decision = prob_speech_avg

        # returns a speech / non-speech decisions
        return self.last_decision
示例#2
0
class FFNNVAD():
    """ This is implementation of a FFNN based voice activity detector.

    It only implements decisions whether input frame is speech of non speech.
    It returns the posterior probability of speech for N last input frames.
    """
    def __init__(self, cfg):
        self.cfg = cfg

        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(self.cfg['VAD']['ffnn']['model'])

        self.log_probs_speech = deque(
            maxlen=self.cfg['VAD']['ffnn']['filter_length'])
        self.log_probs_sil = deque(
            maxlen=self.cfg['VAD']['ffnn']['filter_length'])

        self.last_decision = 0.0

        if self.cfg['VAD']['ffnn']['frontend'] == 'MFCC':
            self.front_end = MFCCFrontEnd(
                self.cfg['Audio']['sample_rate'],
                self.cfg['VAD']['ffnn']['framesize'],
                self.cfg['VAD']['ffnn']['usehamming'],
                self.cfg['VAD']['ffnn']['preemcoef'],
                self.cfg['VAD']['ffnn']['numchans'],
                self.cfg['VAD']['ffnn']['ceplifter'],
                self.cfg['VAD']['ffnn']['numceps'],
                self.cfg['VAD']['ffnn']['enormalise'],
                self.cfg['VAD']['ffnn']['zmeansource'],
                self.cfg['VAD']['ffnn']['usepower'],
                self.cfg['VAD']['ffnn']['usec0'],
                self.cfg['VAD']['ffnn']['usecmn'],
                self.cfg['VAD']['ffnn']['usedelta'],
                self.cfg['VAD']['ffnn']['useacc'],
                self.cfg['VAD']['ffnn']['n_last_frames'] +
                self.cfg['VAD']['ffnn']['n_prev_frames'],
                self.cfg['VAD']['ffnn']['lofreq'],
                self.cfg['VAD']['ffnn']['hifreq'],
                self.cfg['VAD']['ffnn']['mel_banks_only'])
        else:
            raise ASRException('Unsupported frontend: %s' %
                               (self.cfg['VAD']['ffnn']['frontend'], ))

    def decide(self, data):
        """Processes the input frame whether the input segment is speech or non speech.

        The returned values can be in range from 0.0 to 1.0.
        It returns 1.0 for 100% speech segment and 0.0 for 100% non speech segment.
        """

        data = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(data)

        while len(
                self.audio_recorded_in) > self.cfg['VAD']['ffnn']['framesize']:
            frame = self.audio_recorded_in[:self.
                                           cfg['VAD']['ffnn']['framesize']]
            self.audio_recorded_in = self.audio_recorded_in[
                self.cfg['VAD']['ffnn']['frameshift']:]

            mfcc = self.front_end.param(frame)

            prob_sil, prob_speech = self.ffnn.predict_normalise(
                mfcc.reshape(1, len(mfcc)))[0]

            # print prob_sil, prob_speech

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            log_prob_speech_avg = 0.0
            for log_prob_speech, log_prob_sil in zip(self.log_probs_speech,
                                                     self.log_probs_sil):
                log_prob_speech_avg += log_prob_speech - logsumexp(
                    [log_prob_speech, log_prob_sil])
            log_prob_speech_avg /= len(self.log_probs_speech)

            prob_speech_avg = np.exp(log_prob_speech_avg)

            # print 'prob_speech_avg: %5.3f' % prob_speech_avg

            self.last_decision = prob_speech_avg

        # returns a speech / non-speech decisions
        return self.last_decision
示例#3
0
文件: ffnn.py 项目: UFAL-DSG/alex
class FFNNVADGeneral(object):
    """ This is implementation of a FFNN based voice activity detector.

    It only implements decisions whether input frame is speech of non speech.
    It returns the posterior probability of speech for N last input frames.
    """
    def __init__(self, model, filter_length, sample_rate, framesize, frameshift,
                 usehamming, preemcoef, numchans,  ceplifter, numceps,
                 enormalise, zmeansource, usepower, usec0, usecmn, usedelta,
                 useacc, n_last_frames, n_prev_frames, lofreq, hifreq,
                 mel_banks_only):
        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(model)

        self.log_probs_speech = deque(maxlen=filter_length)
        self.log_probs_sil = deque(maxlen=filter_length)

        self.last_decision = 0.0


        self.front_end = MFCCFrontEnd(
            sample_rate, framesize,
            usehamming, preemcoef,
            numchans, ceplifter,
            numceps, enormalise,
            zmeansource, usepower,
            usec0, usecmn,
            usedelta, useacc,
            n_last_frames + n_prev_frames,
            lofreq, hifreq,
            mel_banks_only)

        self.framesize = framesize
        self.frameshift = frameshift

    def decide(self, data):
        """Processes the input frame whether the input segment is speech or non speech.

        The returned values can be in range from 0.0 to 1.0.
        It returns 1.0 for 100% speech segment and 0.0 for 100% non speech segment.
        """

        data = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(data)

        while len(self.audio_recorded_in) > self.framesize:
            frame = self.audio_recorded_in[:self.framesize]
            self.audio_recorded_in = self.audio_recorded_in[self.frameshift:]

            mfcc = self.front_end.param(frame)

            prob_sil, prob_speech = self.ffnn.predict_normalise(mfcc.reshape(1,len(mfcc)))[0]

            # print prob_sil, prob_speech

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            log_prob_speech_avg = 0.0
            for log_prob_speech, log_prob_sil in zip(self.log_probs_speech, self.log_probs_sil):
                log_prob_speech_avg += log_prob_speech - logsumexp([log_prob_speech, log_prob_sil])
            log_prob_speech_avg /= len(self.log_probs_speech)

            prob_speech_avg = np.exp(log_prob_speech_avg)

            # print 'prob_speech_avg: %5.3f' % prob_speech_avg

            self.last_decision = prob_speech_avg

        # returns a speech / non-speech decisions
        return self.last_decision
示例#4
0
文件: ffnn.py 项目: beka-evature/alex
class FFNNVAD():
    """ This is implementation of a FFNN based voice activity detector.

    It only implements decisions whether input frame is speech of non speech.
    It returns the posterior probability of speech for N last input frames.
    """
    def __init__(self, cfg):
        self.cfg = cfg

        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        self.ffnn.load(self.cfg['VAD']['ffnn']['model'])

        self.log_probs_speech = deque(maxlen=self.cfg['VAD']['ffnn']['filter_length'])
        self.log_probs_sil = deque(maxlen=self.cfg['VAD']['ffnn']['filter_length'])

        self.last_decision = 0.0

        if self.cfg['VAD']['ffnn']['frontend'] == 'MFCC':
            self.front_end = MFCCFrontEnd(
                self.cfg['Audio']['sample_rate'], self.cfg['VAD']['ffnn']['framesize'],
                self.cfg['VAD']['ffnn']['usehamming'], self.cfg['VAD']['ffnn']['preemcoef'],
                self.cfg['VAD']['ffnn']['numchans'], self.cfg['VAD']['ffnn']['ceplifter'],
                self.cfg['VAD']['ffnn']['numceps'], self.cfg['VAD']['ffnn']['enormalise'],
                self.cfg['VAD']['ffnn']['zmeansource'], self.cfg['VAD']['ffnn']['usepower'],
                self.cfg['VAD']['ffnn']['usec0'], self.cfg['VAD']['ffnn']['usecmn'],
                self.cfg['VAD']['ffnn']['usedelta'], self.cfg['VAD']['ffnn']['useacc'],
                self.cfg['VAD']['ffnn']['n_last_frames']+self.cfg['VAD']['ffnn']['n_prev_frames'],
                self.cfg['VAD']['ffnn']['lofreq'], self.cfg['VAD']['ffnn']['hifreq'],
                self.cfg['VAD']['ffnn']['mel_banks_only'])
        else:
            raise ASRException('Unsupported frontend: %s' % (self.cfg['VAD']['ffnn']['frontend'], ))

    def decide(self, data):
        """Processes the input frame whether the input segment is speech or non speech.

        The returned values can be in range from 0.0 to 1.0.
        It returns 1.0 for 100% speech segment and 0.0 for 100% non speech segment.
        """

        data = struct.unpack('%dh' % (len(data) / 2, ), data)
        self.audio_recorded_in.extend(data)

        while len(self.audio_recorded_in) > self.cfg['VAD']['ffnn']['framesize']:
            frame = self.audio_recorded_in[:self.cfg['VAD']['ffnn']['framesize']]
            self.audio_recorded_in = self.audio_recorded_in[self.cfg['VAD']['ffnn']['frameshift']:]

            mfcc = self.front_end.param(frame)

            prob_sil, prob_speech = self.ffnn.predict_normalise(mfcc.reshape(1,len(mfcc)))[0]

            # print prob_sil, prob_speech

            self.log_probs_speech.append(log(prob_speech))
            self.log_probs_sil.append(log(prob_sil))

            log_prob_speech_avg = 0.0
            for log_prob_speech, log_prob_sil in zip(self.log_probs_speech, self.log_probs_sil):
                log_prob_speech_avg += log_prob_speech - logsumexp([log_prob_speech, log_prob_sil])
            log_prob_speech_avg /= len(self.log_probs_speech)

            prob_speech_avg = np.exp(log_prob_speech_avg)

            # print 'prob_speech_avg: %5.3f' % prob_speech_avg

            self.last_decision = prob_speech_avg

        # returns a speech / non-speech decisions
        return self.last_decision