def prosody_dynamic(self, audio):
    """Extract the dynamic prosody features from an audio file

    :param audio: .wav audio file.
    :returns: array (N, 13) with the prosody features extracted from an audio file. N = number of voiced segments.

    >>> prosody=Prosody()
    >>> file_audio="../audios/001_ddk1_PCGITA.wav"
    >>> features=prosody.prosody_dynamic(file_audio)
    """
    fs, data_audio = read(audio)
    # Remove the DC offset and normalize the amplitude to [-1, 1]
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = self.size_frame * float(fs)
    size_stepS = self.step * float(fs)
    overlap = size_stepS / size_frameS

    if self.pitch_method == 'praat':
        name_audio = audio.split('/')
        temp_uuid = 'prosody' + name_audio[-1][0:-4]
        if not os.path.exists(self.PATH + '/../tempfiles/'):
            os.makedirs(self.PATH + '/../tempfiles/')
        temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=self.step, minf0=self.minf0, maxf0=self.maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), self.step)
        os.remove(temp_filename_f0)
        os.remove(temp_filename_vuv)
    elif self.pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=self.minf0,
                              max=self.maxf0, voice_bias=self.voice_bias, otype='f0')

    # Find the pitch contour of EACH voiced segment: voiced frames are the
    # nonzero F0 frames; a gap larger than one frame starts a new segment
    pitchON = np.where(F0 != 0)[0]
    dchange = np.diff(pitchON)
    change = np.where(dchange > 1)[0]
    iniV = pitchON[0]

    featvec = []
    iniVoiced = (pitchON[0] * size_stepS) + size_stepS  # To compute energy
    seg_voiced = []
    f0v = []
    Ev = []
    # Feature layout per voiced segment: [duration, P+1 pitch coefficients, P+1 energy coefficients]
    for indx in change:
        finV = pitchON[indx] + 1
        finVoiced = (pitchON[indx] * size_stepS) + size_stepS  # To compute energy
        VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]  # To compute energy
        temp = F0[iniV:finV]
        tempvec = []
        if len(VoicedSeg) > int(size_frameS):  # Take only segments greater than the frame size
            seg_voiced.append(VoicedSeg)
            # Duration of the voiced segment
            dur = len(VoicedSeg) / float(fs)
            tempvec.append(dur)
            # Pitch coefficients: degree-P polynomial fitted to the F0 contour
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, self.P))
            f0v.append(temp)
            tempvec.extend(z.coeffs)
            # Energy coefficients: degree-P polynomial fitted to the energy contour
            temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
            Ev.append(temp)
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, self.P))
            tempvec.extend(z.coeffs)
            featvec.append(tempvec)
        iniV = pitchON[indx + 1]
        iniVoiced = (pitchON[indx + 1] * size_stepS) + size_stepS  # To compute energy

    # Add the last voiced segment
    finV = pitchON[-1] + 1
    finVoiced = (pitchON[-1] * size_stepS) + size_stepS  # To compute energy
    VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]  # To compute energy
    temp = F0[iniV:finV]
    tempvec = []
    if len(VoicedSeg) > int(size_frameS):  # Take only segments greater than the frame size
        # Duration of the voiced segment
        dur = len(VoicedSeg) / float(fs)
        tempvec.append(dur)
        # Pitch coefficients
        x = np.arange(0, len(temp))
        z = np.poly1d(np.polyfit(x, temp, self.P))
        tempvec.extend(z.coeffs)
        # Energy coefficients
        temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
        x = np.arange(0, len(temp))
        z = np.poly1d(np.polyfit(x, temp, self.P))
        tempvec.extend(z.coeffs)
        featvec.append(tempvec)
    return np.asarray(featvec)
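# A minimal, self-contained sketch of the modeling step above: the F0 (and energy)
# contour of each voiced segment is approximated by a degree-P polynomial whose
# P+1 fitted coefficients become the features (with the default P=5 that gives
# 1 duration + 6 pitch + 6 energy = 13 features per segment). The helper name and
# the synthetic contour are illustrative only, not part of the library.
def _polyfit_contour_sketch():
    import numpy as np
    P = 5
    # Synthetic rising-falling F0 contour in Hz over 30 voiced frames
    f0_contour = 120 + 15 * np.sin(np.linspace(0, np.pi, 30))
    x = np.arange(len(f0_contour))
    coeffs = np.polyfit(x, f0_contour, P)  # P+1 coefficients, highest degree first
    approx = np.poly1d(coeffs)(x)          # contour reconstructed from the fit
    return coeffs, np.max(np.abs(approx - f0_contour))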
def prosody_dynamic(audio, size_frame=0.03, size_step=0.01, minf0=60, maxf0=350,
                    voice_bias=-0.2, energy_thr_percent=0.025, P=5,
                    pitch_method='praat'):
    """Extract the dynamic prosody features (duration, pitch coefficients, and
    energy coefficients) for each voiced segment of an audio file.

    Based on:
    Najim Dehak, "Modeling Prosodic Features With Joint Factor Analysis for
    Speaker Verification", 2007.
    """
    fs, data_audio = read(audio)
    # Remove the DC offset and normalize the amplitude to [-1, 1]
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = size_frame * float(fs)
    size_stepS = size_step * float(fs)
    overlap = size_stepS / size_frameS
    data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)

    if pitch_method == 'praat':
        # path_app is a module-level path to this file's directory
        name_audio = audio.split('/')
        temp_uuid = 'pros' + name_audio[-1][0:-4]
        temp_filename_vuv = path_app + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = path_app + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=size_step, minf0=minf0, maxf0=maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), size_step)
        os.remove(temp_filename_vuv)
        os.remove(temp_filename_f0)
    elif pitch_method == 'rapt':
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=minf0,
                              max=maxf0, voice_bias=voice_bias, otype='f0')

    # Find the pitch contour of EACH voiced segment
    pitchON = np.where(F0 != 0)[0]
    dchange = np.diff(pitchON)
    change = np.where(dchange > 1)[0]
    iniV = pitchON[0]

    featvec = []
    iniVoiced = (pitchON[0] * size_stepS) + size_stepS  # To compute energy
    seg_voiced = []
    f0v = []
    Ev = []
    for indx in change:
        finV = pitchON[indx] + 1
        finVoiced = (pitchON[indx] * size_stepS) + size_stepS  # To compute energy
        VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]  # To compute energy
        temp = F0[iniV:finV]
        tempvec = []
        if len(VoicedSeg) > int(size_frameS):  # Take only segments greater than the frame size
            seg_voiced.append(VoicedSeg)
            # Duration of the voiced segment
            dur = len(VoicedSeg) / float(fs)
            tempvec.append(dur)
            # Pitch coefficients: degree-P polynomial fitted to the F0 contour
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, P))
            f0v.append(temp)
            tempvec.extend(z.coeffs)
            # Energy coefficients: degree-P polynomial fitted to the energy contour
            temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
            Ev.append(temp)
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, P))
            tempvec.extend(z.coeffs)
            featvec.append(tempvec)
        iniV = pitchON[indx + 1]
        iniVoiced = (pitchON[indx + 1] * size_stepS) + size_stepS  # To compute energy

    # Add the last voiced segment
    finV = pitchON[-1] + 1
    finVoiced = (pitchON[-1] * size_stepS) + size_stepS  # To compute energy
    VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]  # To compute energy
    temp = F0[iniV:finV]
    tempvec = []
    if len(VoicedSeg) > int(size_frameS):  # Take only segments greater than the frame size
        # Duration of the voiced segment
        dur = len(VoicedSeg) / float(fs)
        tempvec.append(dur)
        # Pitch coefficients
        x = np.arange(0, len(temp))
        z = np.poly1d(np.polyfit(x, temp, P))
        tempvec.extend(z.coeffs)
        # Energy coefficients
        temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
        x = np.arange(0, len(temp))
        z = np.poly1d(np.polyfit(x, temp, P))
        tempvec.extend(z.coeffs)
        featvec.append(tempvec)

    if flag_plots:  # flag_plots is a module-level switch for the plotting helper
        plot_pros(data_audio, fs, F0, seg_voiced, Ev, featvec, f0v)
    return np.asarray(featvec)
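# A minimal usage sketch for the standalone function above. The audio path is the
# one used in the docstring example earlier in this file; it assumes pysptk is
# installed so the 'rapt' backend can run without Praat, and that the module-level
# flag_plots switch is defined.
if __name__ == "__main__":
    feats = prosody_dynamic("../audios/001_ddk1_PCGITA.wav", pitch_method='rapt')
    # One row per voiced segment: [duration, P+1 pitch coeffs, P+1 energy coeffs]
    print(feats.shape)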