Example #1
0
    def prosody_static(self, audio, plots):
        """Extract the static prosody features from an audio file.

        :param audio: path to the .wav audio file.
        :param plots: if True, plot the waveform, F0 contour and the
            voiced/unvoiced segments (the original docstring wrongly
            described this as a "timeshift").
        :returns: array with the 103 prosody features
        :raises ValueError: if ``self.pitch_method`` is not ``'praat'``
            or ``'rapt'``.

        >>> prosody=Prosody()
        >>> file_audio="../audios/001_ddk1_PCGITA.wav"
        >>> features=prosody.prosody_static(file_audio, plots=True)

        """
        fs, data_audio = read(audio)
        # Remove DC offset and normalize the amplitude to [-1, 1].
        data_audio = data_audio - np.mean(data_audio)
        data_audio = data_audio / float(np.max(np.abs(data_audio)))
        size_frameS = self.size_frame * float(fs)   # frame length in samples
        size_stepS = self.step * float(fs)          # hop length in samples
        thr_len_pause = self.thr_len * float(fs)    # minimum pause length in samples

        if self.pitch_method == 'praat':
            # Praat works through temporary text files for the F0 and
            # voiced/unvoiced decisions; both are removed after decoding.
            name_audio = audio.split('/')
            temp_uuid = 'prosody' + name_audio[-1][0:-4]
            if not os.path.exists(self.PATH + '/../tempfiles/'):
                os.makedirs(self.PATH + '/../tempfiles/')
            temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
            temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
            praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv, time_stepF0=self.step, minf0=self.minf0, maxf0=self.maxf0)

            F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), self.step)
            os.remove(temp_filename_f0)
            os.remove(temp_filename_vuv)
        elif self.pitch_method == 'rapt':
            # RAPT expects samples scaled to the 16-bit integer range.
            F0 = pysptk.sptk.rapt(np.asarray(data_audio * (2**15), dtype=np.float32), fs, int(size_stepS), min=self.minf0, max=self.maxf0, voice_bias=self.voice_bias, otype='f0')
        else:
            # Fail fast: the original code fell through and later raised a
            # confusing NameError on F0 for any other pitch_method value.
            raise ValueError("pitch_method must be 'praat' or 'rapt', got " + repr(self.pitch_method))

        # Split the signal into voiced and unvoiced segments using the F0 contour.
        segmentsV = V_UV(F0, data_audio, fs, type_seg="Voiced", size_stepS=size_stepS)
        segmentsUP = V_UV(F0, data_audio, fs, type_seg="Unvoiced", size_stepS=size_stepS)

        # Unvoiced segments longer than the pause threshold are pauses (P);
        # the rest are unvoiced speech (U).  (The original also computed an
        # unused logEnergy per segment; removed.)
        segmentsP = []
        segmentsU = []
        for segment in segmentsUP:
            if len(segment) > thr_len_pause:
                segmentsP.append(segment)
            else:
                segmentsU.append(segment)

        F0_features = F0feat(F0)
        energy_featuresV = energy_feat(segmentsV, fs, size_frameS, size_stepS)
        energy_featuresU = energy_feat(segmentsU, fs, size_frameS, size_stepS)
        duration_features = duration_feat(segmentsV, segmentsU, segmentsP, data_audio, fs)

        if plots:
            self.plot_pros(data_audio, fs, F0, segmentsV, segmentsU, F0_features)

        return np.hstack((F0_features, energy_featuresV, energy_featuresU, duration_features))
Example #2
0
def prosody_static(audio, flag_plots):
    """Compute static prosody features (F0, energy, duration) from a wav file.

    :param audio: path to the .wav audio file.  NOTE(review): the code
        indexes ``data_audio[:-1, 0]``, so it assumes a multi-channel
        recording and keeps only channel 0 (dropping the last sample) --
        confirm against the expected input format.
    :param flag_plots: if True, plot the waveform, F0 contour and
        log-energy contour.
    :returns: tuple ``(F0, logE, meanF0, stdF0, maxF0, meanE, stdE, maxE,
        Vrate, avgdurv, stddurv, Silrate, avgdurs, stddurs, F0varsemi)``.
    """
    fs, data_audio = read(audio)

    # Keep channel 0 and drop the last sample, then remove DC offset and
    # normalize the amplitude to [-1, 1].
    data_audio = data_audio[:-1, 0]
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = 0.02 * float(fs)     # 20 ms analysis frames
    size_stepS = 0.01 * float(fs)      # 10 ms hop
    thr_len_pause = 0.14 * float(fs)   # minimum pause length: 140 ms
    thr_en_pause = 0.2                 # log-energy threshold for pauses
    overlap = size_stepS / size_frameS
    nF = int((len(data_audio) / size_frameS / overlap)) - 1

    # RAPT pitch tracking; it expects samples scaled to the 16-bit range.
    # (Leftover debug print statements from the original were removed.)
    data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
    F0 = pysptk.sptk.rapt(data_audiof,
                          fs,
                          int(size_stepS),
                          min=60,
                          max=350,
                          voice_bias=-0.2,
                          otype='f0')

    # Frame-wise log-energy contour.
    logE = []
    for i in range(nF):
        start = int(i * size_stepS)
        frame = data_audio[start:int(start + size_frameS)]
        logE.append(logEnergy(frame))
    logE = np.asarray(logE)

    segmentsV = V_UV(F0,
                     data_audio,
                     fs,
                     type_seg="Voiced",
                     size_stepS=size_stepS)
    segmentsU = V_UV(F0,
                     data_audio,
                     fs,
                     type_seg="Unvoiced",
                     size_stepS=size_stepS)

    Nvoiced = len(segmentsV)

    # Voiced segments per second of audio.
    Vrate = fs * float(Nvoiced) / len(data_audio)

    # Average and std of voiced-segment durations, in milliseconds.
    avgdurv = 1000 * np.mean([len(seg) for seg in segmentsV]) / float(fs)
    stddurv = 1000 * np.std([len(seg) for seg in segmentsV]) / float(fs)

    # An unvoiced segment counts as silence when it is quiet enough or
    # long enough.  (The original printed `eu` after this loop, which
    # raised NameError when there were no unvoiced segments; removed.)
    silence = []
    for seg in segmentsU:
        eu = logEnergy(seg)
        if eu < thr_en_pause or len(seg) > thr_len_pause:
            silence.append(seg)

    Silrate = fs * float(len(silence)) / len(data_audio)

    avgdurs = 1000 * np.mean([len(seg) for seg in silence]) / float(fs)
    stddurs = 1000 * np.std([len(seg) for seg in silence]) / float(fs)

    if flag_plots:
        plt.figure(1)
        plt.subplot(311)
        t = np.arange(0, float(len(data_audio)) / fs, 1.0 / fs)
        # np.arange can produce one sample too many/few; rebuild if so.
        if len(t) != len(data_audio):
            t = np.arange(1.0 / fs, float(len(data_audio)) / fs, 1.0 / fs)
        plt.plot(t, data_audio, 'k')
        plt.ylabel('Amplitude')
        plt.xlabel('Time (s)')
        plt.xlim([0, t[-1]])
        plt.grid(True)
        plt.subplot(312)
        fsp = len(F0) / t[-1]
        t2 = np.arange(0.0, t[-1], 1.0 / fsp)
        # Trim whichever of t2/F0 is longer so they can be plotted together.
        if len(t2) > len(F0):
            t2 = t2[:len(F0)]
        elif len(F0) > len(t2):
            F0 = F0[:len(t2)]
        plt.plot(t2, F0, color='k', linewidth=2.0)
        plt.xlabel('Time (s)')
        plt.ylabel('F0 (Hz)')
        plt.ylim([0, np.max(F0) + 10])
        plt.xlim([0, t[-1]])
        plt.grid(True)
        plt.subplot(313)
        fse = len(logE) / t[-1]
        t3 = np.arange(0.0, t[-1], 1.0 / fse)
        if len(t3) > len(logE):
            t3 = t3[:len(logE)]
        elif len(logE) > len(t3):
            logE = logE[:len(t3)]
        plt.plot(t3, logE, color='k', linewidth=2.0)
        plt.xlabel('Time (s)')
        plt.ylabel('Energy (dB)')
        plt.xlim([0, t[-1]])
        plt.grid(True)
        plt.show()

    # F0 statistics over voiced frames only (F0 == 0 means unvoiced).
    F0std = np.std(F0[F0 != 0])
    F0varsemi = Hz2semitones(F0std**2)

    return F0, logE, np.mean(F0[F0 != 0]), np.std(
        F0[F0 != 0]), np.max(F0), np.mean(logE), np.std(logE), np.max(
            logE
        ), Vrate, avgdurv, stddurv, Silrate, avgdurs, stddurs, F0varsemi
Example #3
0
def prosody_static(audio, flag_plots, pitch_method='praat'):
    """Compute the static prosody feature vector from an audio file.

    :param audio: path to the .wav audio file.
    :param flag_plots: if True, plot the waveform, F0 contour and the
        voiced/unvoiced segments.
    :param pitch_method: ``'praat'`` or ``'rapt'``.
    :returns: array stacking F0, voiced/unvoiced energy and duration features.
    :raises ValueError: if ``pitch_method`` is not ``'praat'`` or ``'rapt'``.
    """
    fs, data_audio = read(audio)
    # Remove DC offset and normalize the amplitude to [-1, 1].
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = 0.02 * float(fs)    # 20 ms analysis frames
    size_stepS = 0.01 * float(fs)     # 10 ms hop
    thr_len_pause = 0.14 * float(fs)  # minimum pause length: 140 ms

    if pitch_method == 'praat':
        # Both temp files go under path_app so the result does not depend on
        # the current working directory (the VUV path was previously relative),
        # and both are removed after decoding (the VUV file previously leaked).
        temp_uuid = audio.split('/')[-1][0:-4]
        if not os.path.exists(path_app + '/../tempfiles/'):
            os.makedirs(path_app + '/../tempfiles/')
        temp_filename_f0 = path_app + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        temp_filename_vuv = path_app + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio,
                                  temp_filename_f0,
                                  temp_filename_vuv,
                                  time_stepF0=0.01,
                                  minf0=60,
                                  maxf0=350)

        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs), 0.01)
        os.remove(temp_filename_f0)
        os.remove(temp_filename_vuv)

    elif pitch_method == 'rapt':
        # RAPT expects samples scaled to the 16-bit integer range.
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof,
                              fs,
                              int(size_stepS),
                              min=60,
                              max=350,
                              voice_bias=-0.2,
                              otype='f0')
    else:
        # Fail fast: the original fell through and later raised a confusing
        # NameError on F0 for any other pitch_method value.
        raise ValueError("pitch_method must be 'praat' or 'rapt', got " +
                         repr(pitch_method))

    # Split the signal into voiced and unvoiced segments using the F0 contour.
    segmentsV = V_UV(F0,
                     data_audio,
                     fs,
                     type_seg="Voiced",
                     size_stepS=size_stepS)
    segmentsUP = V_UV(F0,
                      data_audio,
                      fs,
                      type_seg="Unvoiced",
                      size_stepS=size_stepS)

    # Unvoiced segments longer than the pause threshold are pauses (P);
    # the rest are unvoiced speech (U).  (The original also computed an
    # unused logEnergy per segment; removed.)
    segmentsP = []
    segmentsU = []
    for segment in segmentsUP:
        if len(segment) > thr_len_pause:
            segmentsP.append(segment)
        else:
            segmentsU.append(segment)

    F0_features = F0feat(F0)
    energy_featuresV = energy_feat(segmentsV, fs, size_frameS, size_stepS)
    energy_featuresU = energy_feat(segmentsU, fs, size_frameS, size_stepS)

    duration_features = duration_feat(segmentsV, segmentsU, segmentsP,
                                      data_audio, fs)

    if flag_plots:
        plot_pros(data_audio, fs, F0, segmentsV, segmentsU)

    return np.hstack(
        (F0_features, energy_featuresV, energy_featuresU, duration_features))