def extractF0_wrapper(input_wav, min_f0=60, max_f0=600, frame_length=25,
                      frame_shift=5):
    """extractF0_wrapper(input_wav, min_f0=60, max_f0=600, frame_length=25,
    frame_shift=5)

    input
    -----
      input_wav: string, path to the input waveform
      min_f0: int, minimum F0 (default 60 Hz)
      max_f0: int, maximum F0 (default 600 Hz)
      frame_length: int, analysis frame length in ms (default 25 ms)
      frame_shift: int, analysis frame shift in ms (default 5 ms)

    output
    ------
      f0: np.array, f0 sequence in shape [number_of_frame]
    """
    if os.path.isfile(input_wav):
        signal = basic.SignalObj(input_wav)
        pitch = pYAAPT.yaapt(
            signal,
            **{'f0_min': min_f0,
               'f0_max': max_f0,
               'frame_length': frame_length,
               'frame_space': frame_shift})
        f0_value = pitch.samp_values
        f0_value = numpy.array(f0_value, dtype=numpy.float32)
        return f0_value
    else:
        print("Cannot find {:s}".format(input_wav))
        return None
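# A minimal usage sketch for the wrapper above. The path "speech.wav" is a
# placeholder; any mono WAV file should work, and the imports below are the
# same ones the surrounding snippets assume.
import os
import numpy
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pYAAPT

f0 = extractF0_wrapper("speech.wav", min_f0=60, max_f0=600,
                       frame_length=25, frame_shift=5)
if f0 is not None:
    # YAAPT reports unvoiced frames as 0 Hz in samp_values.
    voiced = f0[f0 > 0]
    print("frames: {}, voiced: {}".format(len(f0), len(voiced)))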
def extract_pitches():
    for movie in Movie.select().where(Movie.skip_line == True,
                                      Movie.skip_snippet == True,
                                      Movie.skip_pitch == False):
        print("Extracting pitches for '%s'" % movie.title)
        bar = progressbar.ProgressBar()
        for line in bar(movie.lines.where(Line.pitch == None)):
            with db.transaction():
                try:
                    signal = basic.SignalObj(_full_path(line.audio))
                    pitch = pYAAPT.yaapt(signal)
                    t = pitch.frames_pos / signal.fs
                    # Gaussian filter: smooth the interpolated track, then
                    # blank out frames YAAPT marked as unvoiced.
                    kern = sg.gaussian(20, 2)
                    lp = sg.filtfilt(kern, np.sum(kern), pitch.samp_interp)
                    lp[pitch.samp_values == 0] = np.nan
                    line.pitch = np.vstack((t, lp))
                except Exception as e:
                    print(e)
                    line.pitch = None
                line.save()
        movie.skip_pitch = True
        movie.save()
def extractF0(input_wav, output_f0, min_f0=60, max_f0=400, frame_length=35,
              frame_shift=10):
    if os.path.isfile(input_wav):
        signal = basic.SignalObj(input_wav)
        pitch = pYAAPT.yaapt(
            signal,
            **{'f0_min': min_f0,
               'f0_max': max_f0,
               'frame_length': frame_length,
               'frame_space': frame_shift})
        f0_value = pitch.samp_values
        # Write as headerless little-endian float32 raw binary.
        f0_value = numpy.asarray(f0_value, dtype='<f4')
        with open(output_f0, 'wb') as f:
            f0_value.tofile(f)
        print("F0 processed: %s" % (output_f0))
    else:
        print("Cannot find %s" % (input_wav))
    return
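# extractF0 dumps the track as headerless little-endian float32, so reading
# it back is a one-liner. A sketch with a hypothetical path "utt1.f0":
import numpy

f0 = numpy.fromfile("utt1.f0", dtype='<f4')  # shape: [number_of_frames]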
def get_pitch_decompy_values(wav, remove_silences=True, interpolate=True):
    signal = basic.SignalObj(wav)
    pitch = pYAAPT.yaapt(signal)
    ynew = pitch.samp_values
    # Take the pitch track, strip the leading and trailing silences, and
    # apply spline interpolation to fill in the intermediate silences.
    if remove_silences:
        ynew, _, _ = remove_silence(ynew)
    if interpolate:
        ynew = spline_interpolation(ynew)
    return ynew
def extract_and_analyze_pitches():
    for movie in Movie.select().where(Movie.skip_line == True,
                                      Movie.skip_snippet == True,
                                      Movie.skip_pitch == False):
        print("Extracting pitches for '%s'" % movie.title)
        bar = progressbar.ProgressBar()
        for line in bar(movie.lines.where(Line.pitch == None)):
            with db.transaction():
                try:
                    signal = basic.SignalObj(_full_path(line.audio))
                    pitch = pYAAPT.yaapt(signal)
                    # Gaussian filter: smooth the interpolated track and
                    # mask unvoiced frames.
                    kern = sg.gaussian(20, 2)
                    lp = sg.filtfilt(kern, np.sum(kern), pitch.samp_interp)
                    lp[pitch.samp_values == 0] = np.nan
                    # Kernel density estimate over the voiced pitch values;
                    # the mode is taken as the speaker's pitch estimate.
                    kde = st.gaussian_kde(lp[~np.isnan(lp)])
                    locs = np.linspace(50, 400, 100)
                    vals = kde.evaluate(locs)
                    peak = locs[np.argmax(vals)]
                    line.sextimate = peak
                except Exception as e:
                    print(e)
                    line.pitch = None
                line.save()
        movie.skip_pitch = True
        movie.save()
def audio_pitch(file_name, demo_path, media_root):
    # Load audio.
    source_path = '{media_root}/dtw/{file_name}.wav'
    source_path = source_path.format(media_root=media_root, file_name=file_name)
    signal_source = basic.SignalObj(source_path)
    signal_target = basic.SignalObj(demo_path)

    # YAAPT pitches.
    pitches_source = pYAAPT.yaapt(signal_source, frame_length=40,
                                  tda_frame_length=40, fft_length=2048,
                                  f0_min=75, f0_max=600)
    pitches_target = pYAAPT.yaapt(signal_target, frame_length=40,
                                  tda_frame_length=40, fft_length=2048,
                                  f0_min=75, f0_max=600)

    # Main loop.
    wav, fs = librosa.load(source_path, sr=None)
    output = numpy.full(shape=(len(wav)), fill_value=0, dtype='float32')
    length = 4096
    for i in range(0, len(wav), int(length / 6)):
        # Frame index on the 10 ms pitch grid.
        time = int(i / fs * 100)
        if (time < len(pitches_source.samp_values)
                and time < len(pitches_target.samp_values)):
            source_pitch = pitches_source.samp_values[time]
            target_pitch = pitches_target.samp_values[time]
            # The base constant is the frequency ratio between two adjacent
            # semitones. For example, A3 and A4 are 220.0 Hz and 440.0 Hz;
            # under twelve-tone equal temperament they are 12 semitones
            # apart, so with constant t: 440 = 220 * t ** 12, hence
            # t = pow(440 / 220, 1.0 / 12).
            n_steps = 0
            if (source_pitch != 0 and target_pitch != 0):
                n_steps = math.log(target_pitch / source_pitch,
                                   pow(2, 1.0 / 12))
            new_frame = librosa.effects.pitch_shift(
                y=numpy.hanning(len(wav[i:i + length])) * wav[i:i + length],
                sr=fs, n_steps=n_steps)
            output[i:i + length] += new_frame

    # Reverb effect (disabled: it did not sound very good).
    # fx = (
    #     AudioEffectsChain()
    #     .highshelf()
    #     .reverb()
    #     # .phaser()
    #     .lowshelf()
    # )
    # output = fx(output)

    # Write the result to file.
    output_path = '{media_root}/pitch/{file_name}.wav'
    output_path = output_path.format(media_root=media_root, file_name=file_name)
    librosa.output.write_wav(output_path, output, fs)
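# The n_steps computation above converts a frequency ratio into semitones
# under twelve-tone equal temperament. A quick standalone check (the helper
# name semitone_steps is illustrative, not part of the function above):
import math

def semitone_steps(f_source, f_target):
    # n = log_t(f_target / f_source) with t = 2 ** (1 / 12).
    return math.log(f_target / f_source, pow(2, 1.0 / 12))

print(semitone_steps(220.0, 440.0))  # -> 12.0, one octave up
print(semitone_steps(440.0, 220.0))  # -> -12.0, one octave down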
def preprocessing(ii):
    fname = nameNsize[ii][0]
    line = nameNsize[ii][1]
    phone = []
    for item in line.split(' '):
        temp = G2P(item)
        phone += temp.split()
        if item.find('!') != -1:
            phone += '!'
        elif item.find('?') != -1:
            phone += '?'
        elif item.find('.') != -1:
            phone += '.'
        elif item.find(',') != -1:
            phone += ','
        phone += ' '
    text = phone[:-1] + ['E']
    dic = [
        'P', '.', '!', '?', ',', 'k0', 'kk', 'nn', 't0', 'tt', 'rr', 'mm',
        'p0', 'pp', 's0', 'ss', 'oh', 'c0', 'cc', 'ch', 'kh', 'th', 'ph',
        'h0', 'aa', 'qq', 'ya', 'yq', 'vv', 'ee', 'yv', 'ye', 'oo', 'wa',
        'wq', 'wo', 'yo', 'uu', 'wv', 'we', 'wi', 'yu', 'xx', 'xi', 'ii',
        '', 'kf', 'ks', 'nf', 'nc', 'ng', 'nh', 'tf', 'll', 'lk', 'lm',
        'lb', 'ls', 'lt', 'lp', 'lh', 'mf', 'pf', 'ps', ' ', 'E'
    ]
    char2idx = {ch: idx for idx, ch in enumerate(dic)}
    emblen = len(char2idx)
    txt = np.asarray([char2idx[ch] for ch in text])
    audio, sr = librosa.load('../kss/{}'.format(fname), sr=22050)
    audio, index = librosa.effects.trim(audio, top_db=43, frame_length=256,
                                        hop_length=64)
    audioobj = basic.SignalObj(audio, sr)
    pitch = pYAAPT.yaapt(
        audioobj,
        **{'f0_min': 100.0,
           'frame_length': 1000 * 1024 // 22050,
           'frame_space': 1000 * 256 // 22050})
    pitch = pitch.samp_values
    stft = np.abs(librosa.stft(audio, n_fft=1024, hop_length=256,
                               win_length=1024))
    stft = np.power(stft / np.max(stft), 0.6)
    mel_filters = librosa.filters.mel(22050, 1024, 80)
    mel = np.dot(mel_filters, stft)
    mel = np.power(mel / np.max(mel), 0.6)
    # Downsample mel and pitch by 4 and trim everything to a common length.
    mel = mel[:, ::4]
    pitch = pitch[::4]
    length = np.min([np.shape(mel)[1], np.shape(stft)[1] // 4])
    mel = mel[:, :length]
    stft = stft[:, :length * 4]
    pitch = pitch[:length]
    np.save('data/sample_{}.npy'.format(str(ii).zfill(5)),
            (txt, len(txt), mel, stft, mel.shape[1], pitch))
    print('\r{} saved'.format(ii), end='')
def getf0_viaYAAPT(music_loc):
    signal = basic.SignalObj(music_loc)
    pitch = pYAAPT.yaapt(signal, **{'f0_max': 2600.0})
    pitch_interp = pitch.samp_interp
    # Smooth the interpolated track: low-pass filter, then median filter.
    pitch_interp = lpf(pitch_interp, 15, 100)
    pitch_interp = medfilt(pitch_interp, 45)
    return pitch_interp.tolist()
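# lpf is not defined in the snippet above. A plausible stand-in, assuming it
# is meant as a zero-phase Butterworth low-pass with the cutoff and sampling
# rate given in Hz (both assumptions about the missing helper):
from scipy.signal import butter, filtfilt

def lpf(x, cutoff_hz, fs, order=4):
    # Normalize the cutoff to the Nyquist frequency and filter forwards and
    # backwards so no phase shift is introduced into the pitch track.
    b, a = butter(order, cutoff_hz / (fs / 2.0), btype='low')
    return filtfilt(b, a, x)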
def extract_pitch(path):
    """
    Method to extract pitch values and energy.

    :param path: path to the audio file
    :return: pitch values and pitch energy, averaged over number of frames
    """
    signal = basic.SignalObj(path)
    pitch = pYAAPT.yaapt(signal)
    avg_pitch = sum(map(np.array, pitch.samp_values)) / pitch.nframes
    avg_pitch_energy = sum(map(np.array, pitch.energy)) / pitch.nframes
    return avg_pitch, avg_pitch_energy
def convert_wav_to_f0_ascii(fname, fs, directory=''):
    print('convert ', fname)

    # Create the signal object.
    signal = basic.SignalObj(fname)

    # Get time interval and num_samples.
    t_start = 0.0
    num_samples = signal.size
    t_end = num_samples / signal.fs
    t = np.linspace(t_start, t_end, num_samples)

    # Create the pitch object and calculate its attributes.
    pitch = pyaapt.yaapt(signal)

    # Create the .f0_ascii file and dump f0 to it (text mode, since the
    # frame lines are formatted as str).
    output_fname = (directory
                    + os.path.splitext(os.path.basename(fname))[0]
                    + '.f0_ascii')
    with open(output_fname, 'w') as f:
        for i in range(pitch.nframes):
            f0 = pitch.samp_values[i]
            vu = 1.0 if pitch.vuv[i] else 0.0
            fe = pitch.energy[i] * pitch.mean_energy
            line = '{} {} {} {}\n'.format(f0, vu, fe, vu)
            f.write(line)

    # Slicing needs an integer step; the original Python 2 code relied on
    # integer division here.
    step = int(signal.fs // fs)
    output_f0 = pitch.values_interp[0:signal.size:step]
    return (os.path.splitext(os.path.basename(fname))[0], output_f0)
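# For symmetry, a minimal reader for the four-column f0_ascii format written
# above (f0, voicing flag, frame energy, voicing flag repeated); the path
# "utt1.f0_ascii" is hypothetical:
import numpy as np

def read_f0_ascii(path):
    rows = np.loadtxt(path)  # one row per frame
    f0, vuv, energy = rows[:, 0], rows[:, 1], rows[:, 2]
    return f0, vuv, energy

f0, vuv, energy = read_f0_ascii("utt1.f0_ascii")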
def get_f0(audio, rate=16000):
    try:
        import amfm_decompy.basic_tools as basic
        import amfm_decompy.pYAAPT as pYAAPT
        from librosa.util import normalize
    except ImportError:
        raise ImportError(
            "Please install amfm_decompy (`pip install AMFM-decompy`) "
            "and librosa (`pip install librosa`).")
    assert audio.ndim == 1
    frame_length = 20.0  # ms
    to_pad = int(frame_length / 1000 * rate) // 2

    audio = normalize(audio) * 0.95
    audio = np.pad(audio, (to_pad, to_pad), "constant", constant_values=0)
    audio = basic.SignalObj(audio, rate)
    pitch = pYAAPT.yaapt(
        audio,
        frame_length=frame_length,
        frame_space=F0_FRAME_SPACE * 1000,
        nccf_thresh1=0.25,
        tda_frame_length=25.0,
    )
    f0 = pitch.samp_values
    return f0
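# A quick sanity check for get_f0 on a synthetic tone. F0_FRAME_SPACE is a
# module-level constant the function expects; the 0.005 s value below is an
# assumption made only for this sketch.
import numpy as np

F0_FRAME_SPACE = 0.005  # seconds (assumed)

rate = 16000
t = np.arange(rate) / rate              # one second of samples
tone = np.sin(2 * np.pi * 220.0 * t)    # 220 Hz test tone

f0 = get_f0(tone, rate=rate)
voiced = f0[f0 > 0]
print(np.median(voiced))  # should land close to 220 Hz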
def get_pitch_decompy_int(wav):
    signal = basic.SignalObj(wav)
    pitch = pYAAPT.yaapt(signal)
    return remove_silence(pitch.samp_interp)
speakers = ['awb', 'bdl', 'clb', 'jmk', 'ksp', 'rms', 'slt']
root = os.getcwd()
folderpath = os.path.join(root, 'datasets', speakers[0], 'wav')
files = sorted(os.listdir(folderpath))

# Read the first file.
for file in files:
    file = os.path.join(folderpath, file)
    fs, audio = wavread(file)
    break
# IPython.display.Audio(file)

# YAAPT pitches.
signal = basic.SignalObj(file)
pitchY = pYAAPT.yaapt(signal, frame_length=25, frame_space=5,
                      f0_min=40, f0_max=300)

plt.plot(pitchY.values_interp, label='YAAPT', color='blue')
plt.xlabel('samples')
plt.ylabel('pitch (Hz)')
def yaapt(signal, **kwargs):

    # Rename the YAAPT v4.0 parameter "frame_lengtht" to "tda_frame_length"
    # (if provided).
    if 'frame_lengtht' in kwargs:
        if 'tda_frame_length' in kwargs:
            warning_str = 'WARNING: Both "tda_frame_length" and "frame_lengtht" '
            warning_str += 'refer to the same parameter. Therefore, the value '
            warning_str += 'of "frame_lengtht" is going to be discarded.'
            print(warning_str)
        else:
            kwargs['tda_frame_length'] = kwargs.pop('frame_lengtht')

    #---------------------------------------------------------------
    # Set the default values for the parameters.
    #---------------------------------------------------------------
    parameters = {}
    parameters['frame_length'] = kwargs.get('frame_length', 35.0)   # Length of each analysis frame (ms)
    # WARNING: In the original MATLAB YAAPT 4.0 code the next parameter is
    # called "frame_lengtht", which is quite similar to the previous one,
    # "frame_length". Therefore, I've decided to rename it to
    # "tda_frame_length" in order to avoid confusion between them.
    # Nevertheless, both inputs ("frame_lengtht" and "tda_frame_length") are
    # accepted when the function is called.
    parameters['tda_frame_length'] = \
        kwargs.get('tda_frame_length', 35.0)                        # Frame length employed in the time domain analysis (ms)
    parameters['frame_space'] = kwargs.get('frame_space', 10.0)     # Spacing between analysis frames (ms)
    parameters['f0_min'] = kwargs.get('f0_min', 60.0)               # Minimum F0 searched (Hz)
    parameters['f0_max'] = kwargs.get('f0_max', 400.0)              # Maximum F0 searched (Hz)
    parameters['fft_length'] = kwargs.get('fft_length', 8192)       # FFT length
    parameters['bp_forder'] = kwargs.get('bp_forder', 150)          # Order of band-pass filter
    parameters['bp_low'] = kwargs.get('bp_low', 50.0)               # Low frequency of filter passband (Hz)
    parameters['bp_high'] = kwargs.get('bp_high', 1500.0)           # High frequency of filter passband (Hz)
    parameters['nlfer_thresh1'] = kwargs.get('nlfer_thresh1', 0.75) # NLFER boundary for voiced/unvoiced decisions
    parameters['nlfer_thresh2'] = kwargs.get('nlfer_thresh2', 0.1)  # Threshold for NLFER definitely unvoiced
    parameters['shc_numharms'] = kwargs.get('shc_numharms', 3)      # Number of harmonics in SHC calculation
    parameters['shc_window'] = kwargs.get('shc_window', 40.0)       # SHC window length (Hz)
    parameters['shc_maxpeaks'] = kwargs.get('shc_maxpeaks', 4)      # Maximum number of SHC peaks to be found
    parameters['shc_pwidth'] = kwargs.get('shc_pwidth', 50.0)       # Window width in SHC peak picking (Hz)
    parameters['shc_thresh1'] = kwargs.get('shc_thresh1', 5.0)      # Threshold 1 for SHC peak picking
    parameters['shc_thresh2'] = kwargs.get('shc_thresh2', 1.25)     # Threshold 2 for SHC peak picking
    parameters['f0_double'] = kwargs.get('f0_double', 150.0)        # F0 doubling decision threshold (Hz)
    parameters['f0_half'] = kwargs.get('f0_half', 150.0)            # F0 halving decision threshold (Hz)
    parameters['dp5_k1'] = kwargs.get('dp5_k1', 11.0)               # Weight used in dynamic program
    parameters['dec_factor'] = kwargs.get('dec_factor', 1)          # Factor for signal resampling
    parameters['nccf_thresh1'] = kwargs.get('nccf_thresh1', 0.3)    # Threshold for considering a peak in NCCF
    parameters['nccf_thresh2'] = kwargs.get('nccf_thresh2', 0.9)    # Threshold for terminating search in NCCF
    parameters['nccf_maxcands'] = kwargs.get('nccf_maxcands', 3)    # Maximum number of candidates found
    parameters['nccf_pwidth'] = kwargs.get('nccf_pwidth', 5)        # Window width in NCCF peak picking
    parameters['merit_boost'] = kwargs.get('merit_boost', 0.20)     # Boost merit
    parameters['merit_pivot'] = kwargs.get('merit_pivot', 0.99)     # Merit assigned to unvoiced candidates
                                                                    # in definitely unvoiced frames
    parameters['merit_extra'] = kwargs.get('merit_extra', 0.4)      # Merit assigned to extra candidates
                                                                    # in reducing F0 doubling/halving errors
    parameters['median_value'] = kwargs.get('median_value', 7)      # Order of median filter
    parameters['dp_w1'] = kwargs.get('dp_w1', 0.15)                 # DP weight factor for V-V transitions
    parameters['dp_w2'] = kwargs.get('dp_w2', 0.5)                  # DP weight factor for V-UV or UV-V transitions
    parameters['dp_w3'] = kwargs.get('dp_w3', 0.1)                  # DP weight factor for UV-UV transitions
    parameters['dp_w4'] = kwargs.get('dp_w4', 0.9)                  # Weight factor for local costs

    #---------------------------------------------------------------
    # Create the signal objects and filter them.
    #---------------------------------------------------------------
    fir_filter = BandpassFilter(signal.fs, parameters)
    nonlinear_sign = basic.SignalObj(signal.data**2, signal.fs)

    signal.filtered_version(fir_filter)
    nonlinear_sign.filtered_version(fir_filter)

    #---------------------------------------------------------------
    # Create the pitch object.
    #---------------------------------------------------------------
    nfft = parameters['fft_length']
    frame_size = int(np.fix(parameters['frame_length'] * signal.fs / 1000))
    frame_jump = int(np.fix(parameters['frame_space'] * signal.fs / 1000))
    pitch = PitchObj(frame_size, frame_jump, nfft)

    assert pitch.frame_size > 15, 'Frame length value {} is too short.'.format(
        pitch.frame_size)
    assert pitch.frame_size < 2048, 'Frame length value {} exceeds the limit.'.format(
        pitch.frame_size)

    #---------------------------------------------------------------
    # Calculate NLFER and determine voiced/unvoiced frames.
    #---------------------------------------------------------------
    nlfer(signal, pitch, parameters)

    #---------------------------------------------------------------
    # Calculate an approximate pitch track from the spectrum.
    #---------------------------------------------------------------
    spec_pitch, pitch_std = spec_track(nonlinear_sign, pitch, parameters)

    #---------------------------------------------------------------
    # Temporal pitch tracking based on NCCF.
    #---------------------------------------------------------------
    time_pitch1, time_merit1 = time_track(signal, spec_pitch, pitch_std,
                                          pitch, parameters)
    time_pitch2, time_merit2 = time_track(nonlinear_sign, spec_pitch,
                                          pitch_std, pitch, parameters)

    # Added in YAAPT 4.0: zero-pad the time-domain tracks if they came out
    # shorter than the spectral track.
    if time_pitch1.shape[1] < len(spec_pitch):
        len_time = time_pitch1.shape[1]
        len_spec = len(spec_pitch)
        time_pitch1 = np.concatenate(
            (time_pitch1,
             np.zeros((3, len_spec - len_time), dtype=time_pitch1.dtype)),
            axis=1)
        time_pitch2 = np.concatenate(
            (time_pitch2,
             np.zeros((3, len_spec - len_time), dtype=time_pitch2.dtype)),
            axis=1)
        time_merit1 = np.concatenate(
            (time_merit1,
             np.zeros((3, len_spec - len_time), dtype=time_merit1.dtype)),
            axis=1)
        time_merit2 = np.concatenate(
            (time_merit2,
             np.zeros((3, len_spec - len_time), dtype=time_merit2.dtype)),
            axis=1)

    #---------------------------------------------------------------
    # Refine pitch candidates.
    #---------------------------------------------------------------
    ref_pitch, ref_merit = refine(time_pitch1, time_merit1, time_pitch2,
                                  time_merit2, spec_pitch, pitch, parameters)

    #---------------------------------------------------------------
    # Use dynamic programming to determine the final pitch.
    #---------------------------------------------------------------
    final_pitch = dynamic(ref_pitch, ref_merit, pitch, parameters)

    pitch.set_values(final_pitch, signal.size)

    return pitch
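# The parameter plumbing above means any default can be overridden by
# keyword, and the legacy MATLAB name "frame_lengtht" is remapped to
# "tda_frame_length". A short illustration (the input file is hypothetical):
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pYAAPT

signal = basic.SignalObj("sample.wav")

# These two calls configure the same time-domain analysis window:
p1 = pYAAPT.yaapt(signal, tda_frame_length=25.0)
p2 = pYAAPT.yaapt(signal, frame_lengtht=25.0)  # legacy alias, remapped

# Passing both prints the warning and discards the legacy value:
p3 = pYAAPT.yaapt(signal, tda_frame_length=25.0, frame_lengtht=40.0)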
def voiceRecognition():

    def int_or_str(text):
        """Helper function for argument parsing."""
        try:
            return int(text)
        except ValueError:
            return text

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-l', '--list-devices', action='store_true',
                        help='show list of audio devices and exit')
    parser.add_argument('-d', '--device', type=int_or_str,
                        help='input device (numeric ID or substring)')
    parser.add_argument('-r', '--samplerate', type=int, help='sampling rate')
    parser.add_argument('-c', '--channels', type=int, default=1,
                        help='number of input channels')
    parser.add_argument('filename', nargs='?', metavar='FILENAME',
                        help='audio file to store recording to')
    parser.add_argument('-t', '--subtype', type=str,
                        help='sound file subtype (e.g. "PCM_24")')
    args = parser.parse_args()

    # AUDIO_CAPTURING
    import sounddevice as sd
    import soundfile as sf
    import numpy  # Make sure NumPy is loaded before it is used in the callback
    assert numpy  # avoid "imported but unused" message (W0611)

    if args.list_devices:
        print(sd.query_devices())
        parser.exit(0)
    if args.samplerate is None:
        device_info = sd.query_devices(args.device, 'input')
        # soundfile expects an int, sounddevice provides a float:
        args.samplerate = int(device_info['default_samplerate'])
    if args.filename is None:
        args.filename = tempfile.mktemp(prefix='candidate_recording',
                                        suffix='.wav', dir='')
    q = queue.Queue()

    def callback(indata, frames, time, status):
        """This is called (from a separate thread) for each audio block."""
        if status:
            print(status, file=sys.stderr)
        q.put(indata.copy())

    # Make sure the file is opened before recording anything:
    with sf.SoundFile(args.filename, mode='x', samplerate=args.samplerate,
                      channels=args.channels, subtype=args.subtype) as file:
        with sd.InputStream(samplerate=args.samplerate, device=args.device,
                            channels=args.channels, callback=callback):
            print('#' * 80)
            print('press q to stop the recording')
            print('#' * 80)
            while True:  # recording loop
                file.write(q.get())
                try:
                    # Wrapped in try so that pressing a key other than 'q'
                    # does not surface an error.
                    if keyboard.is_pressed('q'):
                        print('You Pressed A Key!')
                        break  # finish the loop
                except Exception:
                    break

    print('\nRecording finished: ' + repr(args.filename))

    # Transcribe the recorded audio file.
    AUDIO_FILE = (args.filename)

    # Use the audio file as the audio source.
    r = sr.Recognizer()
    with sr.AudioFile(AUDIO_FILE) as source:
        # Read the audio file; record is used here instead of listen.
        audio = r.record(source)

    try:
        text = r.recognize_google(audio)
        print("The audio file contains: " + text)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition "
              "service; {0}".format(e))

    # Write audio to a WAV file.
    with open("microphone-results-11223344.wav", "wb") as f:
        f.write(audio.get_wav_data())

    # Write the converted text to a TXT file.
    t = open('microphone-results-11223344.txt', 'a')
    t.write(text)
    t.close()

    # PITCH_TRACKING
    signal = basic.SignalObj('microphone-results-11223344.wav')
    pitch = pYAAPT.yaapt(signal)
    print(len(pitch.samp_values))
    non_zero_pitch = []
    for i in range(len(pitch.samp_values)):
        if pitch.samp_values[i] > 0:
            non_zero_pitch.append(pitch.samp_values[i])
    print("*****************************")
    print(len(non_zero_pitch))
    high = []
    low = []
    for i in range(len(non_zero_pitch)):
        if non_zero_pitch[i] > 255:
            high.append(non_zero_pitch[i])
        elif non_zero_pitch[i] < 85:
            low.append(non_zero_pitch[i])
    avg_pitch = np.mean(non_zero_pitch)
    print("The average pitch value is: ", avg_pitch)
    if 85 <= avg_pitch <= 255:
        print("Appropriate Pitch Maintained", len(high), len(low))

    # GAPS_IN_AUDIO
    AudioSegment.converter = r"C:\\ffmpeg\\bin\\ffmpeg.exe"
    myaudio = AudioSegment.from_wav("microphone-results-11223344.wav")
    silent = silence.detect_silence(myaudio, min_silence_len=100,
                                    silence_thresh=-40)
    # Convert to seconds.
    silent = [((start / 1000), (stop / 1000)) for start, stop in silent]
    print("************************")
    print(silent)
    silent = np.asarray(silent)
    print(silent)
    print(silent.shape)
    diff = []
    count = 0
    for i in range(len(silent)):
        sub = silent[i][1] - silent[i][0]
        diff.append(sub)
    for i in range(len(diff)):
        if diff[i] > 1.3:
            count += 1
    print("Gaps greater than 1.3 seconds: ", count, " times")

    # POLARITY_CALCULATION
    f = open("microphone-results-11223344.txt", "r")
    if f.mode == 'r':
        contents = f.read()
    blob = TextBlob(contents)
    print("The Polarity of the recorded transcript is: ")
    for sentence in blob.sentences:
        print(sentence.sentiment.polarity)

    # SPEECH_RATE
    num_words = 0
    with open("microphone-results-11223344.txt", 'r') as f:
        for line in f:
            words = line.split()
            num_words += len(words)
    print("Number of words:", num_words)
    data_, sampling_rate_ = librosa.load("microphone-results-11223344.wav",
                                         sr=44100)
    secs = np.size(data_) / sampling_rate_
    print("Audio Length: ", str(secs), " seconds")
    silent_zones = np.sum(diff)
    eff_diff = secs - silent_zones
    print("Effective non-silent time period is: ", eff_diff)
    speech_rate = math.ceil((num_words / eff_diff) * 60)
    print("Speech rate is {} words per minute".format(speech_rate))
    if speech_rate < 110:
        print("Not a good speech rate: ", speech_rate)
    elif speech_rate >= 110 and speech_rate <= 165:
        print("Perfect speech rate: ", speech_rate)
    else:
        print("Very fast, either nervous or too excited: ", speech_rate)
    parser.exit(0)
import numpy as np
import pandas as pd
import pitch
import tensorflow as tf
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pYAAPT


def get_tone_digit(tone_list: dict) -> int:
    for key, value in tone_list.items():
        if value == 1:
            return int(key[-1])


f0_df = pd.DataFrame()
for val in range(0, 3108):
    signal = basic.SignalObj(
        '/home/dattilo/Documents/Project/Data Sources/Audio2-0/Audio2-'
        + str(val + 1).zfill(2) + '.wav')
    # YAAPT pitches
    pitchY = pYAAPT.yaapt(signal, frame_length=40, tda_frame_length=40,
                          f0_min=75, f0_max=600)
    f0_df = f0_df.append(pd.DataFrame([[val] + pitchY.samp_values.tolist()]))
text_df = pd.read_csv(
    '/home/dattilo/Documents/Project/Data Sources/truyenkieuwordnumber.txt',
    sep=' ', names=['index', 'word'])
text_df['tone_2'] = (
    text_df['word'].str.contains('á|é|í|ó|ú|ý|ắ|ấ|ế|ố|ớ|ứ')).astype(int)
text_df['tone_3'] = (
    text_df['word'].str.contains('à|è|ì|ò|ù|ỳ|ằ|ầ|ề|ồ|ờ|ừ')).astype(int)
import sys

from schema import Line
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal as sg
from scipy import stats as st
plt.style.use('ggplot')

import amfm_decompy.pYAAPT as pYAAPT
import amfm_decompy.basic_tools as basic

lines = Line.select().where(Line.id << sys.argv[1:])
for line in lines:
    signal = basic.SignalObj(line.audio)
    pitch = pYAAPT.yaapt(signal)
    t = pitch.frames_pos / signal.fs
    p = pitch.samp_interp
    p[p == 0] = np.nan
    fig, (a, b) = plt.subplots(1, 2, sharey=True,
                               num="#%i: " % (line.id)
                               + line.text.replace('\n', ' '),
                               figsize=(10, 5))
    # a.plot(t, p, lw=1)
    a.set_ylim(50, 400)
    a.set_xlim(right=np.max(t))
import numpy as np
import librosa as lib
import amfm_decompy.basic_tools as basic

import preprocessing
import feature_extraction as fea
import scipy

path = '..\\..\\boy_and_girl\\class1\\arctic_a0001.wav'
# Read the file.
audio_data, sample_rate = lib.load(path, sr=None, mono=True,
                                   res_type='kaiser_best')
silence_remove = preprocessing.silence_remove(
    x=audio_data,
    limit=np.max(audio_data) / 20 * 2,
    fs=sample_rate,
    option='HF',
    # pic=savepic + '\\' + 'silence_remove_hilbert_' + str(j) + '_' + str(i))
    pic=None)
signal = basic.SignalObj(silence_remove, sample_rate)
frame_time = 30.0
frame_length = int(0.03 * sample_rate)
frame_overlap = frame_length // 2 + 1
params = {'frame_length': frame_time,
          'tda_frame_length': frame_time,
          'frame_space': frame_time / 2,
          'f0_min': 50.0,
          'f0_max': 1000.0,
          'fft_length': 8192,
          'bp_forder': 150,  # band-pass filter order
          'bp_low': 50.0,
          'bp_high': 1500.0,
          'nlfer_thresh1': 0.75,  # 0.75
          'nlfer_thresh2': 0.1,
print("f0_to_pac started") f0_fnames = utils.get_file_list(directory, '.f0_ascii') with open(directory + 'dump.txt', 'w') as dumpfile: for fname in f0_fnames: fuj_utils.convert_f0_ascii_to_pac(fname, autofuji_fname, directory) # thresh = 0.0001 # alpha = 2.0 # args ="{} 0 4 {} auto {}".format(directory+fname, thresh, alpha) # subprocess.call(autofuji_fname+" "+args) print("{} f0_to_pac completed".format(fname)) print("f0_to_pac completed") # Declare the variables. file_name = "Ses01F_script01_1_M035.wav" # Create the signal object. signal = basic.SignalObj(directory + file_name) # Get time interval and num_samples t_start = 0.0 num_samples = signal.size t_end = num_samples / signal.fs t = np.linspace(t_start, t_end, num_samples) # Create the pitch object and calculate its attributes. pitch = pyaapt.yaapt(signal) with open('Ses01F_script01_1_M035.f0_ascii', 'wb') as f: for i in range(pitch.nframes): f0 = pitch.samp_values[i] vu = 1.0 if pitch.vuv[i] else 0.0 fe = pitch.energy[i] * pitch.mean_energy line = '{} {} {} {}\n'.format(f0, vu, fe, vu) f.write(line)
# matplotlib.interactive(True)
# y, sr = librosa.load(librosa.util.example_audio_file(), duration=10)
from aubio import source, pitch  # needed below for the YIN tracker

y, sr = librosa.core.load(audio_filesd[1], sr=44100, duration=10)
print(audio_filesd[0])
D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
# plt.subplot(4, 2, 1)
librosa.display.specshow(D, y_axis='linear')
plt.colorbar(format='%+2.0f dB')
plt.title('Linear-frequency power spectrogram')

# signal = basic.SignalObj('../')
filename = '../wav_temp/1268690716832_48601.wav'
signal = basic.SignalObj(filename)

downsample = 1
samplerate = 0
win_s = 1764 // downsample  # fft size
hop_s = 441 // downsample   # hop size

s = source(filename, samplerate, hop_s)
samplerate = s.samplerate
tolerance = 0.8

pitch_o = pitch("yin", win_s, hop_s, samplerate)
pitch_o.set_unit("midi")
pitch_o.set_tolerance(tolerance)

pitchesYIN = []
confidences = []

total_frames = 0
while True:
def yaapt(signal, **kwargs):

    #---------------------------------------------------------------
    # Set the default values for the parameters.
    #---------------------------------------------------------------
    parameters = {}
    parameters['frame_length'] = kwargs.get('frame_length', 25.0)   # Length of each analysis frame (ms)
    parameters['frame_space'] = kwargs.get('frame_space', 10.0)     # Spacing between analysis frames (ms)
    parameters['f0_min'] = kwargs.get('f0_min', 60.0)               # Minimum F0 searched (Hz)
    parameters['f0_max'] = kwargs.get('f0_max', 400.0)              # Maximum F0 searched (Hz)
    parameters['fft_length'] = kwargs.get('fft_length', 8192)       # FFT length
    parameters['bp_forder'] = kwargs.get('bp_forder', 150)          # Order of band-pass filter
    parameters['bp_low'] = kwargs.get('bp_low', 50.0)               # Low frequency of filter passband (Hz)
    parameters['bp_high'] = kwargs.get('bp_high', 1500.0)           # High frequency of filter passband (Hz)
    parameters['nlfer_thresh1'] = kwargs.get('nlfer_thresh1', 0.75) # NLFER boundary for voiced/unvoiced decisions
    parameters['nlfer_thresh2'] = kwargs.get('nlfer_thresh2', 0.1)  # Threshold for NLFER definitely unvoiced
    parameters['shc_numharms'] = kwargs.get('shc_numharms', 3)      # Number of harmonics in SHC calculation
    parameters['shc_window'] = kwargs.get('shc_window', 40.0)       # SHC window length (Hz)
    parameters['shc_maxpeaks'] = kwargs.get('shc_maxpeaks', 4)      # Maximum number of SHC peaks to be found
    parameters['shc_pwidth'] = kwargs.get('shc_pwidth', 50.0)       # Window width in SHC peak picking (Hz)
    parameters['shc_thresh1'] = kwargs.get('shc_thresh1', 5.0)      # Threshold 1 for SHC peak picking
    parameters['shc_thresh2'] = kwargs.get('shc_thresh2', 1.25)     # Threshold 2 for SHC peak picking
    parameters['f0_double'] = kwargs.get('f0_double', 150.0)        # F0 doubling decision threshold (Hz)
    parameters['f0_half'] = kwargs.get('f0_half', 150.0)            # F0 halving decision threshold (Hz)
    parameters['dp5_k1'] = kwargs.get('dp5_k1', 11.0)               # Weight used in dynamic program
    parameters['dec_factor'] = kwargs.get('dec_factor', 1)          # Factor for signal resampling
    parameters['nccf_thresh1'] = kwargs.get('nccf_thresh1', 0.25)   # Threshold for considering a peak in NCCF
    parameters['nccf_thresh2'] = kwargs.get('nccf_thresh2', 0.9)    # Threshold for terminating search in NCCF
    parameters['nccf_maxcands'] = kwargs.get('nccf_maxcands', 3)    # Maximum number of candidates found
    parameters['nccf_pwidth'] = kwargs.get('nccf_pwidth', 5)        # Window width in NCCF peak picking
    parameters['merit_boost'] = kwargs.get('merit_boost', 0.20)     # Boost merit
    parameters['merit_pivot'] = kwargs.get('merit_pivot', 0.99)     # Merit assigned to unvoiced candidates
                                                                    # in definitely unvoiced frames
    parameters['merit_extra'] = kwargs.get('merit_extra', 0.4)      # Merit assigned to extra candidates
                                                                    # in reducing F0 doubling/halving errors
    parameters['median_value'] = kwargs.get('median_value', 7)      # Order of median filter
    parameters['dp_w1'] = kwargs.get('dp_w1', 0.15)                 # DP weight factor for V-V transitions
    parameters['dp_w2'] = kwargs.get('dp_w2', 0.5)                  # DP weight factor for V-UV or UV-V transitions
    parameters['dp_w3'] = kwargs.get('dp_w3', 0.1)                  # DP weight factor for UV-UV transitions
    parameters['dp_w4'] = kwargs.get('dp_w4', 0.9)                  # Weight factor for local costs

    #---------------------------------------------------------------
    # Create the signal objects and filter them.
    #---------------------------------------------------------------
    fir_filter = BandpassFilter(signal.fs, parameters)
    nonlinear_sign = basic.SignalObj(signal.data**2, signal.fs)

    signal.filtered_version(fir_filter)
    nonlinear_sign.filtered_version(fir_filter)

    #---------------------------------------------------------------
    # Create the pitch object.
    #---------------------------------------------------------------
    nfft = parameters['fft_length']
    frame_size = int(np.fix(parameters['frame_length'] * signal.fs / 1000))
    frame_jump = int(np.fix(parameters['frame_space'] * signal.fs / 1000))
    pitch = PitchObj(frame_size, frame_jump, nfft)

    if pitch.frame_size < 15:
        print('Frame length value {} is too short.'.format(pitch.frame_size))
        interrupt_main()
    elif pitch.frame_size > 2048:
        print('Frame length value {} exceeds the limit.'.format(
            pitch.frame_size))
        interrupt_main()

    #---------------------------------------------------------------
    # Calculate NLFER and determine voiced/unvoiced frames.
    #---------------------------------------------------------------
    nlfer(signal, pitch, parameters)

    #---------------------------------------------------------------
    # Calculate an approximate pitch track from the spectrum.
    #---------------------------------------------------------------
    spec_pitch, pitch_std = spec_track(nonlinear_sign, pitch, parameters)

    #---------------------------------------------------------------
    # Temporal pitch tracking based on NCCF.
    #---------------------------------------------------------------
    time_pitch1, time_merit1 = time_track(signal, spec_pitch, pitch_std,
                                          pitch, parameters)
    time_pitch2, time_merit2 = time_track(nonlinear_sign, spec_pitch,
                                          pitch_std, pitch, parameters)

    #---------------------------------------------------------------
    # Refine pitch candidates.
    #---------------------------------------------------------------
    ref_pitch, ref_merit = refine(time_pitch1, time_merit1, time_pitch2,
                                  time_merit2, spec_pitch, pitch, parameters)

    #---------------------------------------------------------------
    # Use dynamic programming to determine the final pitch.
    #---------------------------------------------------------------
    final_pitch = dynamic(ref_pitch, ref_merit, pitch, parameters)

    pitch.set_values(final_pitch, signal.size)

    return pitch
11/Mar/2020 Bernardo J.B. Schmitt - [email protected]
"""

import amfm_decompy
import amfm_decompy.pYAAPT as pyaapt
import amfm_decompy.pyQHM as pyqhm
import amfm_decompy.basic_tools as basic
import os.path

# Declare the variables.
file_name = os.path.dirname(amfm_decompy.__file__) + os.sep + "sample.wav"
window_duration = 0.015  # in seconds
nharm_max = 25
SNR = float('Inf')

# Create the signal object.
signal = basic.SignalObj(file_name)

# Create the window object.
window = pyqhm.SampleWindow(window_duration, signal.fs)

# Create the pitch object and calculate its attributes.
pitch = pyaapt.yaapt(signal)

# Set the number of modulated components.
signal.set_nharm(pitch.values, nharm_max)

# Check if gaussian noise has to be added.
if SNR != float('Inf'):
    signal.noiser(pitch.values, SNR)

# Perform the QHM extraction.
def preparePitchFeatureVector(filename):
    ## Find a better way to get pitches for our audio clip.
    ## Function prototype: pysptk.sptk.rapt(x, fs, hopsize, min=60, max=240,
    ##                                      voice_bias=0.0, otype='f0')
    ## The current approach removes zeroed (unvoiced) pitches.
    ## Still have to understand voiced and unvoiced data from audio.
    # pitches = pysptk.sptk.rapt(x=soundData, fs=sampleRate,
    #                            hopsize=totalFrames, min=60, max=250,
    #                            voice_bias=0.5, otype="f0")
    # pitches = pitches[np.nonzero(pitches)]

    # ---------------------------- PITCH FEATURES ----------------------------
    ## Example: pitch = pYAAPT.yaapt(signal, **{'f0_min': 150.0,
    ##                                          'frame_length': 15.0,
    ##                                          'frame_space': 5.0})
    signal = basic.SignalObj(filename)
    pitch = pYAAPT.yaapt(
        signal,
        **{'f0_min': 60.0,
           'f0_max': 360,
           'frame_length': 25.0,
           'frame_space': 10.0})
    pitches = pitch.samp_values
    testSampleRate, testSoundData = wav.read(filename)
    getPitchesForEachFrame(testSoundData, testSampleRate)

    ## Voiced/unvoiced flag per frame: 1 where YAAPT found a pitch, else 0.
    boolVoiced = np.array([])
    for i in range(len(pitches)):
        if (pitches[i] == 0):
            boolVoiced = np.append(boolVoiced, 0)
        else:
            boolVoiced = np.append(boolVoiced, 1)

    ## First-order difference of the pitch track.
    derivativeOfPitches = np.array([])
    counter = 0
    for i in pitches:
        if counter == 0:
            delta = i
            derivativeOfPitches = np.append(derivativeOfPitches, delta)
        else:
            delta = i - prev
            derivativeOfPitches = np.append(derivativeOfPitches, delta)
        prev = i
        counter += 1
    derivativeOfPitches = np.array(
        np.split(derivativeOfPitches, pitches.shape[0]))

    pitchFeatureVector = np.array([])
    ## Pitch statistics (features 0 to 4).
    pitchFeatureVector = np.append(pitchFeatureVector, np.mean(pitches))
    pitchFeatureVector = np.append(pitchFeatureVector, np.median(pitches))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.max(pitches[np.nonzero(pitches)]))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.min(pitches[np.nonzero(pitches)]))
    pitchFeatureVector = np.append(pitchFeatureVector, np.var(pitches))
    ## Pitch derivative statistics (features 5 to 9).
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.mean(derivativeOfPitches))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.median(derivativeOfPitches))
    pitchFeatureVector = np.append(
        pitchFeatureVector,
        np.max(derivativeOfPitches[np.nonzero(derivativeOfPitches)]))
    pitchFeatureVector = np.append(
        pitchFeatureVector,
        np.min(derivativeOfPitches[np.nonzero(derivativeOfPitches)]))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.var(derivativeOfPitches))

    # --------------------------- AVERAGE ENERGIES ---------------------------
    sr, sd = wav.read(filename)
    # boolVoiced = voicedData(sd, sr)
    frame = 50  # ms
    ## Frame length in samples corresponding to the 50 ms time frame.
    frame = int(sr * (frame / 1000))
    ## Total number of frames in the sound.
    totalFrames = sd.shape[0] / frame
    framesToCut = int(frame * floor(totalFrames))
    ## Pad the overflow so the signal splits evenly into frames.
    valuesToPad = sd.shape[0] - framesToCut
    soundData = np.pad(sd, (0, frame - valuesToPad), 'constant')
    totalFrames = soundData.shape[0] / frame
    soundData = np.array(np.split(soundData, totalFrames))
    # soundData = soundData[:-1]
    ## Average energies of soundData; should be ratioed according to voiced
    ## and unvoiced data.
    ## Function prototype: librosa.feature.rmse(y=None, S=None,
    ##     frame_length=2048, hop_length=512, center=True, pad_mode='reflect')
    # averageEnergies = np.mean(librosa.feature.rmse(
    #     y=soundData, hop_length=int(totalFrames), center=True,
    #     pad_mode='reflect').T, axis=0)[0]
    voicedEnergies = np.array([])
    unvoicedEnergies = np.array([])
    '''
    for i in range(boolVoiced.shape[0]):
        if (boolVoiced[i] == 0):
            unvoicedEnergies = np.append(unvoicedEnergies,
                                         AFE.stEnergy(soundData[i]))
        else:
            voicedEnergies = np.append(voicedEnergies,
                                       AFE.stEnergy(soundData[i]))

    ## Feature 10
    voicedEnergies = np.mean(voicedEnergies)
    pitchFeatureVector = np.append(pitchFeatureVector, voicedEnergies)
    ## Feature 11
    unvoicedEnergies = np.mean(unvoicedEnergies)
    pitchFeatureVector = np.append(pitchFeatureVector, unvoicedEnergies)
    ## Check for NaN values.
    if (np.isnan(pitchFeatureVector[11])):
        pitchFeatureVector[11] = 0
    # --------------------------- AVERAGE ENERGIES ---------------------------

    # ----------------------------- SPEAKING RATE ----------------------------
    ## Speaking rate of soundData (inverse of the average length of the
    ## voiced part of an utterance).
    voicedParts = np.array([])
    LENGTH = 0
    for i in range(boolVoiced.shape[0]):
        if (boolVoiced[i] == 1):
            LENGTH += 1
        elif (LENGTH > 0 and boolVoiced[i] == 0):
            ## 50 because that is the frame length we are going with.
            voicedParts = np.append(voicedParts, LENGTH * (50 / 1000))
            LENGTH = 0
    if (LENGTH != 0):
        voicedParts = np.append(voicedParts, LENGTH * (50 / 1000))
        LENGTH = 0
    ## Speaking rate expressed in words per second.
    speakingRate = 1 / (np.mean(voicedParts))
    ## Feature 12
    pitchFeatureVector = np.append(pitchFeatureVector, speakingRate)
    # ----------------------------- SPEAKING RATE ----------------------------
    '''
    return pitchFeatureVector