Example #1
    def prosody_static(self, audio, plots):
        """Extract the static prosody features from an audio file

        :param audio: path to the .wav audio file
        :param plots: if True, plot the F0 contour and the voiced/unvoiced segments
        :returns: array with the 103 prosody features

        >>> prosody=Prosody()
        >>> file_audio="../audios/001_ddk1_PCGITA.wav"
        >>> features=prosody.prosody_static(file_audio, plots=True)

        """
        fs, data_audio=read(audio)
        data_audio=data_audio-np.mean(data_audio)
        data_audio=data_audio/float(np.max(np.abs(data_audio)))
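        # frame length, step, and pause-length threshold converted to samples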
        size_frameS=self.size_frame*float(fs)
        size_stepS=self.step*float(fs)
        thr_len_pause=self.thr_len*float(fs)
        overlap=size_stepS/size_frameS
        nF=int((len(data_audio)/size_frameS/overlap))-1

        if self.pitch_method == 'praat':
            name_audio=audio.split('/')
            temp_uuid='prosody'+name_audio[-1][0:-4]
            if not os.path.exists(self.PATH+'/../tempfiles/'):
                os.makedirs(self.PATH+'/../tempfiles/')
            temp_filename_f0=self.PATH+'/../tempfiles/tempF0'+temp_uuid+'.txt'
            temp_filename_vuv=self.PATH+'/../tempfiles/tempVUV'+temp_uuid+'.txt'
            praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv, time_stepF0=self.step, minf0=self.minf0, maxf0=self.maxf0)

            F0,_=praat_functions.decodeF0(temp_filename_f0,len(data_audio)/float(fs),self.step)
            os.remove(temp_filename_f0)
            os.remove(temp_filename_vuv)
        elif self.pitch_method == 'rapt':
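            # RAPT expects the waveform scaled to the 16-bit integer range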
            data_audiof=np.asarray(data_audio*(2**15), dtype=np.float32)
            F0=pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=self.minf0, max=self.maxf0, voice_bias=self.voice_bias, otype='f0')

        segmentsV=V_UV(F0, data_audio, fs, type_seg="Voiced", size_stepS=size_stepS)
        segmentsUP=V_UV(F0, data_audio, fs, type_seg="Unvoiced", size_stepS=size_stepS)

        segmentsP=[]
        segmentsU=[]
        # unvoiced segments longer than the pause-length threshold are treated as pauses
        for k in range(len(segmentsUP)):
            if (len(segmentsUP[k])>thr_len_pause):
                segmentsP.append(segmentsUP[k])
            else:
                segmentsU.append(segmentsUP[k])

        F0_features=F0feat(F0)
        energy_featuresV=energy_feat(segmentsV, fs, size_frameS, size_stepS)
        energy_featuresU=energy_feat(segmentsU, fs, size_frameS, size_stepS)
        duration_features=duration_feat(segmentsV, segmentsU, segmentsP, data_audio, fs)

        if plots:
            self.plot_pros(data_audio,fs,F0,segmentsV, segmentsU, F0_features)

        features=np.hstack((F0_features, energy_featuresV, energy_featuresU, duration_features))
        
        return features
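A minimal batch-processing sketch built on this method. It assumes the Prosody class above is importable from DisVoice's prosody module; the glob pattern below is hypothetical:

import glob
import numpy as np
from prosody import Prosody  # assumption: DisVoice's prosody module is on the path

prosody = Prosody()
# stack the 103-dimensional static feature vector of each recording
feature_matrix = np.vstack([prosody.prosody_static(wav, plots=False)
                            for wav in sorted(glob.glob("../audios/*.wav"))])
print(feature_matrix.shape)  # (number_of_files, 103)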
Example #2
def intonation_duration(audio,
                        size_step=0.01,
                        minf0=60,
                        maxf0=350,
                        stol=0.150,
                        flag_plots=False):
    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))

    temp_filename_f0 = path_app + '/../tempfiles/pitchtemp.txt'
    temp_filename_vuv = path_app + '/../tempfiles/voicetemp.txt'

    praat_functions.praat_vuv(audio,
                              temp_filename_f0,
                              temp_filename_vuv,
                              time_stepF0=size_step,
                              minf0=minf0,
                              maxf0=maxf0,
                              path_praat_script=path_app + "/../praat")
    pitch_z, ttotal = praat_functions.decodeF0(temp_filename_f0,
                                               len(data_audio) / fs, size_step)
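    # pitch_z: frame-wise F0 contour (Hz); ttotal: timestamps of the F0 frames (s)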

    #Slopes
    slopes = []
    #buffers for voiced and unvoiced segments
    vbuffer = []
    ubuffer = []
    #energy for total voiced and unvoiced segments
    venergy = []
    uenergy = []
    #arrays for time-storing
    voicedtimes = []
    unvoicedtimes = []
    silencetimes = []
    #flag for starting point voiced time and unvoiced time
    startvoicedflag = True
    startUNvoicedflag = True
    #flag to compare with last segment
    recordneighbor = True
    energydifflocalneighbors = []

    #frame-wise reconstruction of F0 from the per-segment linear fits (used for MSEF0)
    F0_rec = np.zeros(len(pitch_z))
    for i in range(0, len(pitch_z) - 1):
        #condition for voiced segment
        if pitch_z[i] >= minf0 and pitch_z[i] <= maxf0:
            vbuffer.append(pitch_z[i])
            #voiced segment starting time
            if (startvoicedflag):
                t_start_venergy = ttotal[i]
                startvoicedflag = False
                frameF0start = i

            if len(ubuffer) != 0:
                samples = len(ubuffer)

                #unvoiced duration from the number of F0 frames and the step size
                t = float(samples * size_step)
                #silence condition
                if t > stol:
                    silencetimes.append(t)
                else:
                    unvoicedtimes.append(t)

                #clear the mess
                ubuffer = []
                #final time for unvoiced
                t_end_uenergy = ttotal[i]
                startUNvoicedflag = True
                #calculate segments with obtained times
                n_start_unvoiced = fs * t_start_uenergy
                n_end_unvoiced = fs * t_end_uenergy
                #energy of real audio segment based on fs and timestamp from F0
                #store
                uenergy.append(
                    logEnergy(
                        data_audio[int(n_start_unvoiced):int(n_end_unvoiced)]))
        #start appending unvoiced segments
        else:
            if (len(vbuffer) != 0):
                #based on F0 Fs and in buffer length, actual time is calculated
                samples = len(vbuffer)
                t = float(samples * size_step)
                #pick up voiced times
                voicedtimes.append(t)
                #voiced segment slope process
                #temporal x axis vector for slope calculation
                xtemp_slope = []
                tempslope = np.array(vbuffer)
                for j in range(0, len(vbuffer)):
                    xtemp_slope.append(j)
                #get slopes of voiced segments

                if len(xtemp_slope) > 1:
                    pol = np.polyfit(xtemp_slope, tempslope, 1)
                    if not np.isnan(pol[0]):
                        slopes.append(pol[0])
                else:
                    pol = [np.nan, np.nan]
                    print("detected short voiced segment", len(xtemp_slope))
                    #print(xtemp_slope, tempslope)
                #slopes.append(np.average(np.diff(tempslope)) / np.average(np.diff(xtemp_slope)))

                #clear the mess

                vbuffer = []

                #final time of voiced segment
                t_end_venergy = ttotal[i]
                frameF0end = i
                if np.isnan(pol[0]):
                    F0_rec[int(frameF0start):int(frameF0end)] = tempslope
                else:
                    F0_rec[int(frameF0start):int(frameF0end)] = pol[
                        0] * np.asarray(xtemp_slope) + pol[1]

                tempslope = []
                xtemp_slope = []
                startvoicedflag = True
                #calculate how many segments are in voiced time on the original audio file, based on start-end time stamps
                n_start_voiced = fs * t_start_venergy
                n_end_voiced = fs * t_end_venergy
                #calculate energy and make venergy append the result
                envoiced = logEnergy(
                    data_audio[int(n_start_voiced):int(n_end_voiced)])
                venergy.append(envoiced)

                #store this segment's energy; on the next voiced segment compute the local energy and the difference
                if recordneighbor:
                    recordneighbor = False
                    neighbor = logEnergy(
                        data_audio[int(n_start_voiced):int(n_end_voiced)])
                else:
                    recordneighbor = True
                    local = logEnergy(
                        data_audio[int(n_start_voiced):int(n_end_voiced)])
                    local = np.array(local)
                    neighbor = np.array(neighbor)
                    #energy difference between the current and the previous voiced segment
                    energydifflocalneighbors.append(
                        abs(np.mean(local) - np.mean(neighbor)))

            else:
                ubuffer.append(pitch_z[i])
                #initial time of unvoiced segment
                if (startUNvoicedflag):
                    t_start_uenergy = ttotal[i]
                    startUNvoicedflag = False

    #if last segment was not computed with the next one then
    #compute it with the previous one
    start = True
    end = False
    #record last segment
    if recordneighbor == False:
        #iterate backwards over the F0 contour to find the last voiced segment
        for i in range(len(pitch_z) - 1, 0, -1):
            if pitch_z[i] >= minf0 and pitch_z[i] <= maxf0:
                if start == True:
                    startseg = i
                    start = False
            else:
                #only close the segment once its last voiced frame has been seen
                if start == False and end == False:
                    endseg = i
                    end = True
            if (end == True):
                #convert the F0 timestamps to sample indices; scanning backwards,
                #endseg precedes startseg in time
                n_start_seg = fs * ttotal[endseg]
                n_end_seg = fs * ttotal[startseg]
                #compute energy of the last voiced segment
                lastseg = logEnergy(data_audio[int(n_start_seg):int(n_end_seg)])
                #cast as array
                local = np.array(lastseg)
                neighbor = np.array(neighbor)
                #take mean difference between them
                energydifflocalneighbors.append(
                    abs(np.mean(local) - np.mean(neighbor)))
                break

    energydifflocalneighbors = np.array(energydifflocalneighbors)

    voicedtimes = np.array(voicedtimes)
    unvoicedtimes = np.array(unvoicedtimes)

    silencetimes = np.array(silencetimes)
    #print(unvoicedtimes, silencetimes)
    uenergy = np.array(uenergy)
    venergy = np.array(venergy)
    """Measures"""
    """Intonation"""
    avgF0slopes = np.average(slopes)  # 1. average F0 slope
    stdF0slopes = np.std(slopes)  # 2. std F0 slope
    """Duration"""
    if ((silencetimes.size > 0)):
        SVU = np.sum(silencetimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 3. S/(V+U)
    else:
        SVU = 0
    VU = np.sum(voicedtimes) / np.sum(unvoicedtimes)  # 4. V/U
    UVU = np.sum(unvoicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 5. U/(V+U)
    VVU = np.sum(voicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 6. V/(V+U)

    if ((silencetimes.size > 0)):
        VS = np.sum(voicedtimes) / np.sum(silencetimes)  # 7. V/S
        US = np.sum(unvoicedtimes) / np.sum(silencetimes)  # 8. U/S
    else:
        VS = 0
        US = 0

    URD = np.std(unvoicedtimes)  # 9. (std U)
    VRD = np.std(voicedtimes)  # 10. (std V)

    URE = np.std(uenergy)  # 11. (std Energy U)
    VRE = np.std(venergy)  # 12. (std Energy V)
    MSEF0 = np.mean((np.asarray(pitch_z) - np.asarray(F0_rec))**2)
    if ((silencetimes.size > 0)):  # 13. (std S)
        PR = np.std(silencetimes)
    else:
        PR = 0

    os.remove(temp_filename_f0)
    os.remove(temp_filename_vuv)

    #nextmeasures
    maxvoicedlen = np.max(voicedtimes)  #max voiced duration
    maxunvoicedlen = np.max(unvoicedtimes)  #max unvoiced duration
    minvoicedlen = np.min(voicedtimes)  #min voiced duration
    minunvoicedlen = np.min(unvoicedtimes)  #min unvoiced duration
    rvuv = len(voicedtimes) / len(unvoicedtimes)  #ratio of voiced to unvoiced segments
    #slope, regression coefficient, and mean squared error of the voiced energy contour
    energyslope, intercept, RegCoefenergy, p_value, std_err = st.linregress(
        np.arange(len(venergy)), venergy)
    t = np.arange(len(venergy))
    energyslope1 = np.polyval([energyslope, intercept], t)
    msqerrenergy = mean_squared_error(energyslope1, venergy)
    #slope and regression coefficient of the voiced F0 contour
    pitch_znz = pitch_z[pitch_z != minf0]
    F0slope, intercept, RegCoeff0, p_value, std_err = st.linregress(
        np.arange(len(pitch_znz)), pitch_znz)
    #neighbor segment measures
    meanNeighborenergydiff = np.mean(energydifflocalneighbors)
    stdNeighborenergydiff = np.std(energydifflocalneighbors)

    if flag_plots:
        plt.figure(1)
        plt.plot(ttotal, pitch_z, label="F0 (Hz)", linewidth=2.0)
        plt.plot(ttotal, F0_rec, label="Linear regression F0", linewidth=2.0)
        plt.text(min(ttotal),
                 max(pitch_z) - 5, "MSE=" + str(np.round(MSEF0, 3)))
        plt.text(min(ttotal),
                 max(pitch_z) - 10,
                 "Avg. tilt=" + str(np.round(avgF0slopes, 3)))
        plt.text(min(ttotal),
                 max(pitch_z) - 15,
                 "Std. tilt=" + str(np.round(stdF0slopes, 3)))
        plt.text(min(ttotal),
                 max(pitch_z) - 20, "R^2=" + str(np.round(RegCoeff0, 3)))

        plt.xlabel("Time (s)")
        plt.ylabel("Frequency (Hz)")
        plt.legend()

        plt.grid(True)
        plt.show()

    return (avgF0slopes, stdF0slopes, MSEF0, SVU, VU, UVU, VVU, VS, US, URD,
            VRD, URE, VRE, PR, maxvoicedlen, maxunvoicedlen, minvoicedlen,
            minunvoicedlen, rvuv, energyslope, RegCoefenergy, msqerrenergy,
            RegCoeff0, meanNeighborenergydiff, stdNeighborenergydiff, F0_rec,
            pitch_z, venergy, uenergy)
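A minimal call sketch: the function returns 29 values, so callers typically unpack them positionally. The audio path below is hypothetical, and the ../tempfiles directory referenced through path_app must exist:

results = intonation_duration("../audios/sample.wav")
avgF0slopes, stdF0slopes, MSEF0 = results[:3]  # intonation measures
F0_rec, pitch_z = results[25], results[26]  # reconstructed and raw F0 contours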
Example #3
def prosody_static(audio, flag_plots):

    fs, data_audio = read(audio)

    #keep the first channel of a stereo recording (dropping the final sample)
    data_audio = data_audio[:-1, 0]
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = 0.02 * float(fs)
    size_stepS = 0.01 * float(fs)
    thr_len_pause = 0.14 * float(fs)
    thr_en_pause = 0.2
    overlap = size_stepS / size_frameS
    nF = int((len(data_audio) / size_frameS / overlap)) - 1
    #scale to the 16-bit integer range expected by RAPT
    data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
    F0 = pysptk.sptk.rapt(data_audiof,
                          fs,
                          int(size_stepS),
                          min=60,
                          max=350,
                          voice_bias=-0.2,
                          otype='f0')

    logE = []
    for l in range(nF):
        data_frame = data_audio[int(l * size_stepS):int(l * size_stepS +
                                                        size_frameS)]
        logE.append(logEnergy(data_frame))
    logE = np.asarray(logE)
    segmentsV = V_UV(F0,
                     data_audio,
                     fs,
                     type_seg="Voiced",
                     size_stepS=size_stepS)
    segmentsU = V_UV(F0,
                     data_audio,
                     fs,
                     type_seg="Unvoiced",
                     size_stepS=size_stepS)

    Nvoiced = len(segmentsV)
    Nunvoiced = len(segmentsU)

    Vrate = fs * float(Nvoiced) / len(data_audio)

    avgdurv = 1000 * np.mean([len(segmentsV[k])
                              for k in range(Nvoiced)]) / float(fs)
    stddurv = 1000 * np.std([len(segmentsV[k])
                             for k in range(Nvoiced)]) / float(fs)

    silence = []
    for k in range(Nunvoiced):
        eu = logEnergy(segmentsU[k])
        if (eu < thr_en_pause or len(segmentsU[k]) > thr_len_pause):
            silence.append(segmentsU[k])
    Silrate = fs * float(len(silence)) / len(data_audio)

    avgdurs = 1000 * np.mean([len(silence[k])
                              for k in range(len(silence))]) / float(fs)
    stddurs = 1000 * np.std([len(silence[k])
                             for k in range(len(silence))]) / float(fs)

    if flag_plots:
        plt.figure(1)
        plt.subplot(311)
        t = np.arange(0, float(len(data_audio)) / fs, 1.0 / fs)
        if len(t) != len(data_audio):
            t = np.arange(1.0 / fs, float(len(data_audio)) / fs, 1.0 / fs)
        plt.plot(t, data_audio, 'k')
        plt.ylabel('Amplitude')
        plt.xlabel('Time (s)')
        plt.xlim([0, t[-1]])
        plt.grid(True)
        plt.subplot(312)
        fsp = len(F0) / t[-1]
        t2 = np.arange(0.0, t[-1], 1.0 / fsp)
        if len(t2) > len(F0):
            t2 = t2[:len(F0)]
        elif len(F0) > len(t2):
            F0 = F0[:len(t2)]
        plt.plot(t2, F0, color='k', linewidth=2.0)
        plt.xlabel('Time (s)')
        plt.ylabel('F0 (Hz)')
        plt.ylim([0, np.max(F0) + 10])
        plt.xlim([0, t[-1]])
        plt.grid(True)
        plt.subplot(313)
        fse = len(logE) / t[-1]
        t3 = np.arange(0.0, t[-1], 1.0 / fse)
        if len(t3) > len(logE):
            t3 = t3[:len(logE)]
        elif len(logE) > len(t3):
            logE = logE[:len(t3)]
        plt.plot(t3, logE, color='k', linewidth=2.0)
        plt.xlabel('Time (s)')
        plt.ylabel('Energy (dB)')
        #plt.ylim([0,np.max(logE)])
        plt.xlim([0, t[-1]])
        plt.grid(True)
        plt.show()

    F0std = np.std(F0[F0 != 0])
    F0varsemi = Hz2semitones(F0std**2)

    return (F0, logE, np.mean(F0[F0 != 0]), np.std(F0[F0 != 0]), np.max(F0),
            np.mean(logE), np.std(logE), np.max(logE), Vrate, avgdurv,
            stddurv, Silrate, avgdurs, stddurs, F0varsemi)
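A minimal call sketch for this variant. Because it indexes data_audio[:-1, 0], it expects a stereo (two-channel) .wav file; the path below is hypothetical:

# hypothetical stereo recording; the function keeps only channel 0
(F0, logE, F0avg, F0std, F0max, Eavg, Estd, Emax, Vrate, avgdurv,
 stddurv, Silrate, avgdurs, stddurs, F0varsemi) = prosody_static(
     "../audios/sample_stereo.wav", flag_plots=False)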
Example #4
File: prosody.py  Project: pj1527/DisVoice
def intonation_duration(audio,
                        size_step=0.01,
                        minf0=60,
                        maxf0=350,
                        stol=0.150,
                        flag_plots=False):
    fs, data_audio = read(audio)
    temp_filename_f0 = '../tempfiles/pitchtemp.txt'
    temp_filename_vuv = '../tempfiles/voicetemp.txt'

    praat_functions.praat_vuv(audio,
                              temp_filename_f0,
                              temp_filename_vuv,
                              time_stepF0=size_step,
                              minf0=minf0,
                              maxf0=maxf0)
    pitch_z, ttotal = praat_functions.decodeF0(temp_filename_f0,
                                               len(data_audio) / fs, size_step)

    #Slopes
    slopes = []
    #buffers for voiced and unvoiced segments
    vbuffer = []
    ubuffer = []
    #energy for total voiced and unvoiced segments
    venergy = []
    uenergy = []
    #arrays for time-storing
    voicedtimes = []
    unvoicedtimes = []
    silencetimes = []
    #flag for starting point voiced time and unvoiced time
    startvoicedflag = True
    startUNvoicedflag = True
    F0_rec = np.zeros(len(pitch_z))
    for i in range(0, len(pitch_z) - 1):
        #condition for voiced segment
        if pitch_z[i] >= minf0 and pitch_z[i] <= maxf0:
            vbuffer.append(pitch_z[i])
            #voiced segment starting time
            if (startvoicedflag):
                t_start_venergy = ttotal[i]
                startvoicedflag = False
                frameF0start = i

            if len(ubuffer) != 0:
                samples = len(ubuffer)

                #unvoiced duration from the number of F0 frames and the step size
                t = float(samples * size_step)
                #silence condition
                if t > stol:
                    silencetimes.append(t)
                else:
                    unvoicedtimes.append(t)

                #clear the mess
                ubuffer = []
                #final time for unvoiced
                t_end_uenergy = ttotal[i]
                startUNvoicedflag = True
                #calculate segments with obtained times
                n_start_unvoiced = fs * t_start_uenergy
                n_end_unvoiced = fs * t_end_uenergy
                #energy of real audio segment based on fs and timestamp from F0
                #store
                uenergy.append(
                    logEnergy(
                        data_audio[int(n_start_unvoiced):int(n_end_unvoiced)]))
        #start appending unvoiced segments
        else:
            if (len(vbuffer) != 0):
                #based on F0 Fs and in buffer length, actual time is calculated
                samples = len(vbuffer)
                t = float(samples * size_step)
                #pick up voiced times
                voicedtimes.append(t)
                #voiced segment slope process
                #temporal x axis vector for slope calculation
                xtemp_slope = []
                tempslope = np.array(vbuffer)
                for j in range(0, len(vbuffer)):
                    xtemp_slope.append(j)
                #get slopes of voiced segments

                pol = np.polyfit(xtemp_slope, tempslope, 1)
                if np.isnan(pol[0]):
                    print("detected short voiced segment")
                    #print(xtemp_slope, tempslope)
                else:
                    slopes.append(pol[0])
                #slopes.append(np.average(np.diff(tempslope)) / np.average(np.diff(xtemp_slope)))

                #clear the mess

                vbuffer = []

                #final time of voiced segment
                t_end_venergy = ttotal[i]
                frameF0end = i
                if np.isnan(pol[0]):
                    F0_rec[int(frameF0start):int(frameF0end)] = tempslope
                else:
                    F0_rec[int(frameF0start):int(frameF0end)] = pol[
                        0] * np.asarray(xtemp_slope) + pol[1]

                tempslope = []
                xtemp_slope = []
                startvoicedflag = True
                #calculate how many segments are in voiced time on the original audio file, based on start-end time stamps
                n_start_voiced = fs * t_start_venergy
                n_end_voiced = fs * t_end_venergy
                #calculate energy and make venergy append the result
                venergy.append(
                    logEnergy(
                        data_audio[int(n_start_voiced):int(n_end_voiced)]))
            else:
                ubuffer.append(pitch_z[i])
                #initial time of unvoiced segment
                if (startUNvoicedflag):
                    t_start_uenergy = ttotal[i]
                    startUNvoicedflag = False

    voicedtimes = np.array(voicedtimes)
    unvoicedtimes = np.array(unvoicedtimes)

    silencetimes = np.array(silencetimes)
    #print(unvoicedtimes, silencetimes)
    uenergy = np.array(uenergy)
    venergy = np.array(venergy)
    """Measures"""
    """Intonation"""
    avgF0slopes = np.average(slopes)  # 1. average F0 slope
    stdF0slopes = np.std(slopes)  # 2. std F0 slope
    """Duration"""
    if ((silencetimes.size > 0)):
        SVU = np.sum(silencetimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 3. S/(V+U)
    else:
        SVU = 0
    VU = np.sum(voicedtimes) / np.sum(unvoicedtimes)  # 4. V/U
    UVU = np.sum(unvoicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 5. U/(V+U)
    VVU = np.sum(voicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 6. V/(V+U)
    #if there are no silences, prevent division by zero
    if ((silencetimes.size > 0)):
        VS = np.sum(voicedtimes) / np.sum(silencetimes)  # 7. V/S
        US = np.sum(unvoicedtimes) / np.sum(silencetimes)  # 8. U/S
    else:
        VS = 0
        US = 0

    URD = np.std(unvoicedtimes)  # 9. (std U)
    VRD = np.std(voicedtimes)  # 10. (std V)

    URE = np.std(uenergy)  # 11. (std Energy U)
    VRE = np.std(venergy)  # 12. (std Energy V)
    MSEF0 = np.mean((np.asarray(pitch_z) - np.asarray(F0_rec))**2)
    if ((silencetimes.size > 0)):  # 13. (std S)
        PR = np.std(silencetimes)
    else:
        PR = 0

    os.remove(temp_filename_f0)
    os.remove(temp_filename_vuv)

    if flag_plots:
        plt.figure(1)
        plt.plot(ttotal, pitch_z, label="F0 (Hz)", linewidth=2.0)
        plt.plot(ttotal, F0_rec, label="Linear regression F0", linewidth=2.0)
        plt.text(min(ttotal),
                 max(pitch_z) - 5, "MSE=" + str(np.round(MSEF0, 3)))
        plt.text(min(ttotal),
                 max(pitch_z) - 10,
                 "Avg. tilt=" + str(np.round(avgF0slopes, 3)))
        plt.text(min(ttotal),
                 max(pitch_z) - 15,
                 "Std. tilt=" + str(np.round(stdF0slopes, 3)))
        plt.xlabel("Time (s)")
        plt.ylabel("Frequency (Hz)")
        plt.legend()

        plt.grid(True)
        plt.show()

    return avgF0slopes, stdF0slopes, MSEF0, SVU, VU, UVU, VVU, VS, US, URD, VRD, URE, VRE, PR
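A minimal call sketch, assuming a ../tempfiles/ directory exists relative to the working directory (this version writes the Praat temp files there); the audio path is hypothetical:

# hypothetical audio path; returns the 14 intonation and duration measures
measures = intonation_duration("../audios/sample.wav", flag_plots=False)
avgF0slopes, stdF0slopes, MSEF0 = measures[:3]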
Example #5
def prosody_static(audio, flag_plots, pitch_method='praat'):

    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = 0.02 * float(fs)
    size_stepS = 0.01 * float(fs)
    thr_len_pause = 0.14 * float(fs)
    thr_en_pause = 10 * np.log10(0.02)
    overlap = size_stepS / size_frameS
    nF = int((len(data_audio) / size_frameS / overlap)) - 1

    if pitch_method == 'praat':
        temp_uuid = audio.split('/')[-1][0:-4]
        temp_filename_f0 = path_app + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        temp_filename_vuv = path_app + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio,
                                  temp_filename_f0,
                                  temp_filename_vuv,
                                  time_stepF0=0.01,
                                  minf0=60,
                                  maxf0=350)

        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs), 0.01)
        os.remove(temp_filename_f0)
        os.remove(temp_filename_vuv)

    elif pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof,
                              fs,
                              int(size_stepS),
                              min=60,
                              max=350,
                              voice_bias=-0.2,
                              otype='f0')

    segmentsV = V_UV(F0,
                     data_audio,
                     fs,
                     type_seg="Voiced",
                     size_stepS=size_stepS)
    segmentsUP = V_UV(F0,
                      data_audio,
                      fs,
                      type_seg="Unvoiced",
                      size_stepS=size_stepS)

    segmentsP = []
    segmentsU = []
    # unvoiced segments longer than the pause-length threshold are treated as pauses
    for k in range(len(segmentsUP)):
        if (len(segmentsUP[k]) > thr_len_pause):
            segmentsP.append(segmentsUP[k])
        else:
            segmentsU.append(segmentsUP[k])

    F0_features = F0feat(F0)
    energy_featuresV = energy_feat(segmentsV, fs, size_frameS, size_stepS)
    energy_featuresU = energy_feat(segmentsU, fs, size_frameS, size_stepS)

    duration_features = duration_feat(segmentsV, segmentsU, segmentsP,
                                      data_audio, fs)

    if flag_plots:

        plot_pros(data_audio, fs, F0, segmentsV, segmentsU)

    features = np.hstack(
        (F0_features, energy_featuresV, energy_featuresU, duration_features))
    return features
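A minimal sketch contrasting the two pitch back-ends of this version; the audio path is hypothetical, and the 'praat' branch additionally requires path_app and the tempfiles directory to be set up:

# hypothetical audio path; both back-ends return the same stacked feature vector
feats_praat = prosody_static("../audios/sample.wav", flag_plots=False, pitch_method='praat')
feats_rapt = prosody_static("../audios/sample.wav", flag_plots=False, pitch_method='rapt')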