Example No. 1
    def prosody_dynamic(self, audio):
        """Extract the dynamic prosody features from an audio file

        :param audio: path to the .wav audio file.
        :returns: array (N, 13) with the prosody features extracted from the audio file, where N is the number of voiced segments.

        >>> prosody=Prosody()
        >>> file_audio="../audios/001_ddk1_PCGITA.wav"
        >>> features=prosody.prosody_dynamic(file_audio)

        """
        fs, data_audio = read(audio)
        data_audio = data_audio - np.mean(data_audio)  # remove DC offset
        data_audio = data_audio / float(np.max(np.abs(data_audio)))  # peak-normalize to [-1, 1]
        size_frameS = self.size_frame * float(fs)  # frame length in samples
        size_stepS = self.step * float(fs)  # hop (step) length in samples
        overlap = size_stepS / size_frameS  # hop-to-frame ratio passed to E_cont

        if self.pitch_method == 'praat':
            name_audio = audio.split('/')
            temp_uuid = 'prosody' + name_audio[-1][0:-4]
            if not os.path.exists(self.PATH + '/../tempfiles/'):
                os.makedirs(self.PATH + '/../tempfiles/')
            temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
            temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
            praat_functions.praat_vuv(audio,
                                      temp_filename_f0,
                                      temp_filename_vuv,
                                      time_stepF0=self.step,
                                      minf0=self.minf0,
                                      maxf0=self.maxf0)

            F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                             len(data_audio) / float(fs),
                                             self.step)
            os.remove(temp_filename_f0)
            os.remove(temp_filename_vuv)
        elif self.pitch_method == 'rapt':
            data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)  # rescale to int16 range, as RAPT expects
            F0 = pysptk.sptk.rapt(data_audiof,
                                  fs,
                                  int(size_stepS),
                                  min=self.minf0,
                                  max=self.maxf0,
                                  voice_bias=self.voice_bias,
                                  otype='f0')
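        # At this point F0 holds one pitch estimate per analysis step;
        # frames with F0 == 0 are unvoiced.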

        # Find the pitch contour of each voiced segment
        pitchON = np.where(F0 != 0)[0]  # frame indices with detected pitch (voiced frames)
        dchange = np.diff(pitchON)
        change = np.where(dchange > 1)[0]  # gaps between voiced indices mark segment boundaries
        iniV = pitchON[0]

        featvec = []
        iniVoiced = (pitchON[0] * size_stepS) + size_stepS  # start sample of the first voiced segment
        seg_voiced = []
        f0v = []
        Ev = []
        for indx in change:
            finV = pitchON[indx] + 1
            finVoiced = (pitchON[indx] * size_stepS) + size_stepS  # end sample, to compute energy
            VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]  # samples of the voiced segment
            temp = F0[iniV:finV]
            tempvec = []
            if len(VoicedSeg) > int(size_frameS):  # keep only segments longer than one frame
                seg_voiced.append(VoicedSeg)
                # Duration of the segment
                dur = len(VoicedSeg) / float(fs)
                # Pitch coefficients: order-P polynomial fitted to the F0 contour
                x = np.arange(0, len(temp))
                z = np.poly1d(np.polyfit(x, temp, self.P))
                f0v.append(temp)
                tempvec.extend(z.coeffs)
                # Energy coefficients: order-P polynomial fitted to the energy contour
                temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
                Ev.append(temp)
                x = np.arange(0, len(temp))
                z = np.poly1d(np.polyfit(x, temp, self.P))
                tempvec.extend(z.coeffs)
                tempvec.append(dur)
                featvec.append(tempvec)
            iniV = pitchON[indx + 1]
            iniVoiced = (pitchON[indx + 1] * size_stepS) + size_stepS  # start of the next voiced segment

        # Add the last voiced segment
        finV = pitchON[-1] + 1  # include the last voiced frame, as in the loop above
        finVoiced = (pitchON[-1] * size_stepS) + size_stepS  # end sample, to compute energy
        VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]
        temp = F0[iniV:finV]
        tempvec = []

        if len(VoicedSeg) > int(size_frameS):  # keep only segments longer than one frame
            # Duration of the segment
            dur = len(VoicedSeg) / float(fs)
            # Pitch coefficients
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, self.P))
            tempvec.extend(z.coeffs)
            # Energy coefficients
            temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, self.P))
            tempvec.extend(z.coeffs)
            # Duration goes last, matching the feature order used in the loop above
            tempvec.append(dur)
            featvec.append(tempvec)

        return np.asarray(featvec)
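
A minimal usage sketch for this method, following the doctest in the docstring above. The import path and audio file are illustrative; with the default P=5 each row holds 6 pitch-polynomial coefficients, 6 energy-polynomial coefficients, and the segment duration.

>>> from disvoice.prosody import Prosody  # import path is an assumption
>>> prosody = Prosody()
>>> features = prosody.prosody_dynamic("../audios/001_ddk1_PCGITA.wav")
>>> features.shape  # (N, 13), one row per voiced segment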
Example No. 2
def prosody_dynamic(audio,
                    size_frame=0.03,
                    size_step=0.01,
                    minf0=60,
                    maxf0=350,
                    voice_bias=-0.2,
                    energy_thr_percent=0.025,
                    P=5,
                    pitch_method='praat'):
    """
    Based on:
    Najim Dehak, "Modeling Prosodic Features With Joint Factor Analysis for Speaker Verification", 2007
    """
    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)  # remove DC offset
    data_audio = data_audio / float(np.max(np.abs(data_audio)))  # peak-normalize to [-1, 1]
    size_frameS = size_frame * float(fs)  # frame length in samples
    size_stepS = size_step * float(fs)  # hop (step) length in samples
    overlap = size_stepS / size_frameS  # hop-to-frame ratio passed to E_cont
    data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)  # int16-range copy for RAPT
    if pitch_method == 'praat':
        name_audio = audio.split('/')
        temp_uuid = 'pros' + name_audio[-1][0:-4]
        if not os.path.exists(path_app + '/../tempfiles/'):
            os.makedirs(path_app + '/../tempfiles/')  # make sure the temp folder exists, as in Example No. 1
        temp_filename_vuv = path_app + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = path_app + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio,
                                  temp_filename_f0,
                                  temp_filename_vuv,
                                  time_stepF0=size_step,
                                  minf0=minf0,
                                  maxf0=maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs),
                                         size_step)
        os.remove(temp_filename_vuv)
        os.remove(temp_filename_f0)
    elif pitch_method == 'rapt':
        F0 = pysptk.sptk.rapt(data_audiof,
                              fs,
                              int(size_stepS),
                              min=minf0,
                              max=maxf0,
                              voice_bias=voice_bias,
                              otype='f0')
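    # At this point F0 holds one pitch estimate per analysis step;
    # frames with F0 == 0 are unvoiced.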

    # Find the pitch contour of each voiced segment
    pitchON = np.where(F0 != 0)[0]  # frame indices with detected pitch (voiced frames)
    dchange = np.diff(pitchON)
    change = np.where(dchange > 1)[0]  # gaps between voiced indices mark segment boundaries
    iniV = pitchON[0]

    featvec = []
    iniVoiced = (pitchON[0] * size_stepS) + size_stepS  # start sample of the first voiced segment
    seg_voiced = []
    f0v = []
    Ev = []
    for indx in change:
        finV = pitchON[indx] + 1
        finVoiced = (pitchON[indx] * size_stepS) + size_stepS  # end sample, to compute energy
        VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]  # samples of the voiced segment
        temp = F0[iniV:finV]
        tempvec = []
        if len(VoicedSeg) > int(size_frameS):  # keep only segments longer than one frame
            seg_voiced.append(VoicedSeg)
            # Duration of the segment
            dur = len(VoicedSeg) / float(fs)
            tempvec.append(dur)
            # Pitch coefficients: order-P polynomial fitted to the F0 contour
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, P))
            f0v.append(temp)
            tempvec.extend(z.coeffs)
            # Energy coefficients: order-P polynomial fitted to the energy contour
            temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
            Ev.append(temp)
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, P))
            tempvec.extend(z.coeffs)
            featvec.append(tempvec)
        iniV = pitchON[indx + 1]
        iniVoiced = (pitchON[indx + 1] * size_stepS) + size_stepS  # start of the next voiced segment

    # Add the last voiced segment
    finV = pitchON[-1] + 1  # include the last voiced frame, as in the loop above
    finVoiced = (pitchON[-1] * size_stepS) + size_stepS  # end sample, to compute energy
    VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]
    temp = F0[iniV:finV]
    tempvec = []
    if len(VoicedSeg) > int(size_frameS):  # keep only segments longer than one frame
        # Duration of the segment
        dur = len(VoicedSeg) / float(fs)
        tempvec.append(dur)
        # Pitch coefficients
        x = np.arange(0, len(temp))
        z = np.poly1d(np.polyfit(x, temp, P))
        tempvec.extend(z.coeffs)
        # Energy coefficients
        temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
        x = np.arange(0, len(temp))
        z = np.poly1d(np.polyfit(x, temp, P))
        tempvec.extend(z.coeffs)
        featvec.append(tempvec)

    if flag_plots:
        plot_pros(data_audio, fs, F0, seg_voiced, Ev, featvec, f0v)

    return np.asarray(featvec)
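
A similar sketch for this standalone variant. It assumes the surrounding module defines path_app, flag_plots and the praat_functions/E_cont helpers; using pitch_method='rapt' avoids the Praat temporary files. The audio path is illustrative.

>>> features = prosody_dynamic("../audios/001_ddk1_PCGITA.wav", pitch_method='rapt')
>>> features.shape  # (N, 13): duration, then 6 pitch and 6 energy coefficients per row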