def extract_pitches():
    for movie in Movie.select().where(Movie.skip_line == True,
                                      Movie.skip_snippet == True,
                                      Movie.skip_pitch == False):
        print("Extracting pitches for '%s'" % movie.title)
        bar = progressbar.ProgressBar()
        for line in bar(movie.lines.where(Line.pitch == None)):
            with db.transaction():
                try:
                    signal = basic.SignalObj(_full_path(line.audio))
                    pitch = pYAAPT.yaapt(signal)
                    t = pitch.frames_pos / signal.fs
                    # Smooth the interpolated track with a Gaussian kernel.
                    kern = sg.gaussian(20, 2)
                    lp = sg.filtfilt(kern, np.sum(kern), pitch.samp_interp)
                    lp[pitch.samp_values == 0] = np.nan  # mask unvoiced frames
                    line.pitch = np.vstack((t, lp))
                except Exception as e:
                    print(e)
                    line.pitch = None
                line.save()
        movie.skip_pitch = True
        movie.save()
def YAAPT_fundamental_freq(signal, frame_length=40, tda_frame_length=40,
                           f0_min=70, f0_max=600):
    pitchY = pYAAPT.yaapt(signal, frame_length=frame_length,
                          tda_frame_length=tda_frame_length,
                          f0_min=f0_min, f0_max=f0_max)
    data_YAAPT = pitchY.samp_values
    x_total = 0
    total = 0
    # Average only voiced frames (YAAPT reports 0 Hz for unvoiced ones).
    for i in range(len(data_YAAPT)):
        if data_YAAPT[i] > 10:
            x_total += data_YAAPT[i]
            total += 1
    if total == 0:  # no voiced frames found; avoid division by zero
        return 0
    mean_freq = x_total / total
    # mean_freq = mean_freq.astype(int)
    # mean_freq = int(mean_freq)
    # print('type', type(mean_freq))
    # Return the mean frequency of the wave file.
    return mean_freq
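# Vectorized sketch of the voiced-frame mean above. It assumes samp_values
# is a NumPy array (pYAAPT returns one) and, like the loop, treats frames
# at or below 10 Hz as unvoiced. This helper is an illustration, not part
# of the original snippet.
def mean_voiced_f0(samp_values):
    voiced = samp_values[samp_values > 10]  # keep voiced frames only
    return float(voiced.mean()) if voiced.size else 0.0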
def get_pitch_decompy_values(wav, remove_silencess=True, interpolate=True):
    # print('get_pitch_decompy_values\npath = {}\n'.format(wav))
    signal = basic.SignalObj(wav)
    '''
    plt.title('signal')
    plt.plot(signal.data, color='m')
    '''
    pitch = pYAAPT.yaapt(signal)
    ynew = pitch.samp_values
    rv = len(pitch.samp_values)
    sv = 0
    iv = 0
    # Take the pitch track, strip the leading and trailing silences, and
    # apply spline interpolation to fill the remaining internal silences.
    if remove_silencess:
        ynew, _, _ = remove_silence(ynew)
        sv = len(ynew)
        # print('  y_before_remove_silence = {}'.format(len(pitch.samp_values)))
        # print('  y_to_spline_len = {}'.format(len(ynew)))
    if interpolate:
        # print('  interpolating')
        # _, _, ynew, _ = spline_interpolation(ynew)
        ynew = spline_interpolation(ynew)
        iv = len(ynew)
    return ynew
def extract_and_analyze_pitches():
    for movie in Movie.select().where(Movie.skip_line == True,
                                      Movie.skip_snippet == True,
                                      Movie.skip_pitch == False):
        print("Extracting pitches for '%s'" % movie.title)
        bar = progressbar.ProgressBar()
        for line in bar(movie.lines.where(Line.pitch == None)):
            with db.transaction():
                try:
                    signal = basic.SignalObj(_full_path(line.audio))
                    pitch = pYAAPT.yaapt(signal)
                    # Smooth the interpolated track with a Gaussian kernel.
                    kern = sg.gaussian(20, 2)
                    lp = sg.filtfilt(kern, np.sum(kern), pitch.samp_interp)
                    lp[pitch.samp_values == 0] = np.nan  # mask unvoiced frames
                    # Kernel density estimate over the voiced frames; the
                    # location of its peak is kept as the dominant pitch.
                    kde = st.gaussian_kde(lp[~np.isnan(lp)])
                    locs = np.linspace(50, 400, 100)
                    vals = kde.evaluate(locs)
                    peak = locs[np.argmax(vals)]
                    line.sextimate = peak
                except Exception as e:
                    print(e)
                    line.pitch = None
                line.save()
        movie.skip_pitch = True
        movie.save()
def extractF0_wrapper(input_wav, min_f0=60, max_f0=600, frame_length=25,
                      frame_shift=5):
    """extractF0_wrapper(input_wav, min_f0=60, max_f0=600,
                         frame_length=25, frame_shift=5)

    input
    -----
      input_wav: string, path to the input waveform
      min_f0: int, minimum F0 (default 60 Hz)
      max_f0: int, maximum F0 (default 600 Hz)
      frame_length: int, analysis frame length in ms (default 25 ms)
      frame_shift: int, analysis frame shift in ms (default 5 ms)

    output
    ------
      f0: np.array, f0 sequence in shape [number_of_frames]
    """
    if os.path.isfile(input_wav):
        signal = basic.SignalObj(input_wav)
        pitch = pYAAPT.yaapt(
            signal,
            **{'f0_min': min_f0,
               'f0_max': max_f0,
               'frame_length': frame_length,
               'frame_space': frame_shift})
        f0_value = pitch.samp_values
        f0_value = numpy.array(f0_value, dtype=numpy.float32)
        return f0_value
    else:
        print("Cannot find {:s}".format(input_wav))
        return None
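# Usage sketch for extractF0_wrapper; 'speech.wav' is a hypothetical path,
# not a file shipped with these snippets. If the file is missing, the
# function prints a message and returns None.
if __name__ == '__main__':
    f0 = extractF0_wrapper('speech.wav')  # hypothetical input file
    if f0 is not None:
        # With the default 5 ms shift there is one value per 5 ms of audio;
        # YAAPT reports unvoiced frames as 0 Hz.
        print('frames:', f0.shape[0], 'voiced:', int((f0 > 0).sum()))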
def extractF0(input_wav, output_f0, min_f0=60, max_f0=400, frame_length=35,
              frame_shift=10):
    if os.path.isfile(input_wav):
        signal = basic.SignalObj(input_wav)
        pitch = pYAAPT.yaapt(
            signal,
            **{'f0_min': min_f0,
               'f0_max': max_f0,
               'frame_length': frame_length,
               'frame_space': frame_shift})
        f0_value = pitch.samp_values
        # Store as little-endian float32 raw binary.
        f0_value = numpy.asarray(f0_value, dtype=numpy.dtype('<f4'))
        with open(output_f0, 'wb') as f:
            f0_value.tofile(f, '')
        print("F0 processed: %s" % output_f0)
    else:
        print("Cannot find %s" % input_wav)
    return
def audio_pitch(file_name, demo_path, media_root):
    # Load audio.
    source_path = '{media_root}/dtw/{file_name}.wav'
    source_path = source_path.format(media_root=media_root,
                                     file_name=file_name)
    signal_source = basic.SignalObj(source_path)
    signal_target = basic.SignalObj(demo_path)

    # YAAPT pitches.
    pitches_source = pYAAPT.yaapt(signal_source, frame_length=40,
                                  tda_frame_length=40, fft_length=2048,
                                  f0_min=75, f0_max=600)
    pitches_target = pYAAPT.yaapt(signal_target, frame_length=40,
                                  tda_frame_length=40, fft_length=2048,
                                  f0_min=75, f0_max=600)

    # Main loop.
    wav, fs = librosa.load(source_path, sr=None)
    output = numpy.full(shape=(len(wav)), fill_value=0, dtype='float32')
    length = 4096
    for i in range(0, len(wav), int(length / 6)):
        # Frame index in 10 ms steps.
        time = int(i / fs * 100)
        if (time < len(pitches_source.samp_values)
                and time < len(pitches_target.samp_values)):
            source_pitch = pitches_source.samp_values[time]
            target_pitch = pitches_target.samp_values[time]
            # The log base is the frequency ratio between two adjacent
            # semitones. For example, A3 and A4 are 220.0 Hz and 440.0 Hz;
            # under twelve-tone equal temperament they are 12 semitones
            # apart, so with constant t: 440 = 220 * t ** 12, hence
            # t = pow(440 / 220, 1.0 / 12).
            n_steps = 0
            if source_pitch != 0 and target_pitch != 0:
                n_steps = math.log(target_pitch / source_pitch,
                                   pow(2, 1.0 / 12))
            new_frame = librosa.effects.pitch_shift(
                y=numpy.hanning(len(wav[i:i + length])) * wav[i:i + length],
                sr=fs, n_steps=n_steps)
            output[i:i + length] += new_frame

    # Reverb effect; it did not sound very good, so it stays disabled.
    # fx = (
    #     AudioEffectsChain()
    #     .highshelf()
    #     .reverb()
    #     # .phaser()
    #     .lowshelf()
    # )
    # output = fx(output)

    # Write the output file.
    output_path = '{media_root}/pitch/{file_name}.wav'
    output_path = output_path.format(media_root=media_root,
                                     file_name=file_name)
    librosa.output.write_wav(output_path, output, fs)
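# Worked check of the semitone relation in the comment above: with
# t = 2 ** (1 / 12), target = source * t ** n_steps, so
# n_steps = log(target / source, t). This helper is an illustration, not
# part of the original snippet.
def semitone_steps(source_hz, target_hz):
    import math
    return math.log(target_hz / source_hz, 2 ** (1.0 / 12))

# One octave up, 220 Hz -> 440 Hz, is exactly 12 semitones.
assert abs(semitone_steps(220.0, 440.0) - 12.0) < 1e-9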
def preprocessing(ii):
    fname = nameNsize[ii][0]
    line = nameNsize[ii][1]

    # Grapheme-to-phoneme conversion, keeping punctuation as tokens.
    phone = []
    for item in line.split(' '):
        temp = G2P(item)
        phone += temp.split()
        if item.find('!') != -1:
            phone += '!'
        elif item.find('?') != -1:
            phone += '?'
        elif item.find('.') != -1:
            phone += '.'
        elif item.find(',') != -1:
            phone += ','
        phone += ' '
    text = phone[:-1] + ['E']

    dic = [
        'P', '.', '!', '?', ',', 'k0', 'kk', 'nn', 't0', 'tt', 'rr', 'mm',
        'p0', 'pp', 's0', 'ss', 'oh', 'c0', 'cc', 'ch', 'kh', 'th', 'ph',
        'h0', 'aa', 'qq', 'ya', 'yq', 'vv', 'ee', 'yv', 'ye', 'oo', 'wa',
        'wq', 'wo', 'yo', 'uu', 'wv', 'we', 'wi', 'yu', 'xx', 'xi', 'ii',
        '', 'kf', 'ks', 'nf', 'nc', 'ng', 'nh', 'tf', 'll', 'lk', 'lm',
        'lb', 'ls', 'lt', 'lp', 'lh', 'mf', 'pf', 'ps', ' ', 'E'
    ]
    char2idx = {ch: idx for idx, ch in enumerate(dic)}
    emblen = len(char2idx)
    txt = np.asarray([char2idx[ch] for ch in text])

    # Load and trim the audio.
    audio, sr = librosa.load('../kss/{}'.format(fname), sr=22050)
    audio, index = librosa.effects.trim(audio, top_db=43, frame_length=256,
                                        hop_length=64)

    # YAAPT pitch with frame length/shift matched to the STFT (1024/256).
    audioobj = basic.SignalObj(audio, sr)
    pitch = pYAAPT.yaapt(
        audioobj,
        **{'f0_min': 100.0,
           'frame_length': 1000 * 1024 // 22050,
           'frame_space': 1000 * 256 // 22050})
    pitch = pitch.samp_values

    # Compressed magnitude spectrogram and mel spectrogram.
    stft = np.abs(librosa.stft(audio, n_fft=1024, hop_length=256,
                               win_length=1024))
    stft = np.power(stft / np.max(stft), 0.6)
    mel_filters = librosa.filters.mel(22050, 1024, 80)
    mel = np.dot(mel_filters, stft)
    mel = np.power(mel / np.max(mel), 0.6)

    # Down-sample mel and pitch by 4, then align all lengths.
    mel = mel[:, ::4]
    pitch = pitch[::4]
    length = np.min([np.shape(mel)[1], np.shape(stft)[1] // 4])
    mel = mel[:, :length]
    stft = stft[:, :length * 4]
    pitch = pitch[:length]

    np.save('data/sample_{}.npy'.format(str(ii).zfill(5)),
            (txt, len(txt), mel, stft, mel.shape[1], pitch))
    print('\r{} saved'.format(ii), end='')
def getf0_viaYAAPT(music_loc):
    signal = basic.SignalObj(music_loc)
    pitch = pYAAPT.yaapt(signal, **{'f0_max': 2600.0})
    pitch_interp = pitch.samp_interp
    # Smooth the interpolated track: low-pass filter, then median filter.
    pitch_interp = lpf(pitch_interp, 15, 100)
    pitch_interp = medfilt(pitch_interp, 45)
    return pitch_interp.tolist()
def extract_pitch(path):
    """Extract pitch values and energy.

    :param path: path to the audio file
    :return: pitch values and pitch energy, averaged over the number of frames
    """
    signal = basic.SignalObj(path)
    pitch = pYAAPT.yaapt(signal)
    avg_pitch = sum(map(np.array, pitch.samp_values)) / pitch.nframes
    avg_pitch_energy = sum(map(np.array, pitch.energy)) / pitch.nframes
    return avg_pitch, avg_pitch_energy
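# Variant sketch: the averages above include YAAPT's 0 Hz unvoiced frames,
# which biases avg_pitch low. This illustrative helper (not part of the
# original snippet) averages voiced frames only.
def extract_voiced_pitch(path):
    signal = basic.SignalObj(path)
    pitch = pYAAPT.yaapt(signal)
    voiced = pitch.samp_values[pitch.samp_values > 0]
    return voiced.mean() if voiced.size else 0.0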
def __init__(self, inputSound, targetSound):
    self.inputSound = basic.SignalObj(inputSound)
    self.inputFs = self.inputSound.fs
    # NOTE: yaapt reads underscore keys ('f0_min'/'f0_max'); hyphenated
    # keys like 'f0-min' are silently ignored.
    # self.inputPitch = pyaapt.yaapt(self.inputSound,
    #     **{'f0_min': 75.0, 'f0_max': 500.0,
    #        'frame_length': 15.0, 'frame_space': 10.0})
    self.inputPitch = pyaapt.yaapt(self.inputSound)
    self.inputF0 = self.inputPitch.values
    self.targetF0 = self.avgTargetF0(targetSound)
    self.targetFs, self.targetSound = wavfile.read(targetSound)
    self.resampleInputF0()
    self.na = None
    self.relateF0()
    # Number of 0s in the na factor array that counts as a break between
    # words in the input sound.
    self.zeroThresh = 5
def convert_wav_to_f0_ascii(fname, fs, directory=''):
    print('convert ', fname)

    # Create the signal object.
    signal = basic.SignalObj(fname)

    # Get time interval and num_samples.
    t_start = 0.0
    num_samples = signal.size
    t_end = num_samples / signal.fs
    t = np.linspace(t_start, t_end, num_samples)

    # Create the pitch object and calculate its attributes.
    pitch = pyaapt.yaapt(signal)

    # Create the .f0_ascii file and dump f0 to it.
    output_fname = (directory
                    + os.path.splitext(os.path.basename(fname))[0]
                    + '.f0_ascii')
    with open(output_fname, 'w') as f:
        for i in range(pitch.nframes):
            f0 = pitch.samp_values[i]
            vu = 1.0 if pitch.vuv[i] else 0.0
            fe = pitch.energy[i] * pitch.mean_energy
            line = '{} {} {} {}\n'.format(f0, vu, fe, vu)
            f.write(line)

    # Downsample the interpolated track to the requested rate.
    step = int(signal.fs / fs)  # slice step must be an integer
    output_f0 = pitch.values_interp[0:signal.size:step]
    return (os.path.splitext(os.path.basename(fname))[0], output_f0)
def get_f0(audio, rate=16000):
    try:
        import amfm_decompy.basic_tools as basic
        import amfm_decompy.pYAAPT as pYAAPT
        from librosa.util import normalize
    except ImportError:
        raise ImportError(
            "Please install amfm_decompy (`pip install AMFM-decompy`) "
            "and librosa (`pip install librosa`).")

    assert audio.ndim == 1

    frame_length = 20.0  # ms
    to_pad = int(frame_length / 1000 * rate) // 2

    # Normalize and zero-pad half a frame on each side.
    audio = normalize(audio) * 0.95
    audio = np.pad(audio, (to_pad, to_pad), "constant", constant_values=0)
    audio = basic.SignalObj(audio, rate)
    pitch = pYAAPT.yaapt(
        audio,
        frame_length=frame_length,
        frame_space=F0_FRAME_SPACE * 1000,
        nccf_thresh1=0.25,
        tda_frame_length=25.0,
    )
    f0 = pitch.samp_values
    return f0
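# Note: get_f0 references F0_FRAME_SPACE, which this snippet never defines.
# It is in seconds (yaapt's frame_space takes ms, hence the * 1000). A
# usage sketch under the assumption of a 5 ms hop:
#
#   F0_FRAME_SPACE = 0.005                            # assumed value
#   f0 = get_f0(np.random.randn(16000), rate=16000)   # 1 s of noise
#   print(f0.shape)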
import os.path

import amfm_decompy
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pyaapt
import amfm_decompy.pyQHM as pyqhm

# Declare the variables.
file_name = os.path.dirname(amfm_decompy.__file__) + os.sep + "sample.wav"
window_duration = 0.015  # in seconds
nharm_max = 25
SNR = float('Inf')

# Create the signal object.
signal = basic.SignalObj(file_name)

# Create the window object.
window = pyqhm.SampleWindow(window_duration, signal.fs)

# Create the pitch object and calculate its attributes.
pitch = pyaapt.yaapt(signal)

# Set the number of modulated components.
signal.set_nharm(pitch.values, nharm_max)

# Check if gaussian noise has to be added.
if SNR != float('Inf'):
    signal.noiser(pitch.values, SNR)

# Perform the QHM extraction.
QHM = pyqhm.qhm(signal, pitch, window, 0.001, N_iter=3, phase_tech='phase')
print("QHM SRER: %s" % (QHM.SRER))

# Perform the aQHM extraction. (The snippet is truncated here; the final
# keyword argument is assumed to mirror the QHM call above.)
aQHM = pyqhm.aqhm(signal, QHM, pitch, window, 0.001, N_iter=3, N_runs=2,
                  phase_tech='phase')
def voiceRecognition():
    def int_or_str(text):
        """Helper function for argument parsing."""
        try:
            return int(text)
        except ValueError:
            return text

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-l', '--list-devices', action='store_true',
                        help='show list of audio devices and exit')
    parser.add_argument('-d', '--device', type=int_or_str,
                        help='input device (numeric ID or substring)')
    parser.add_argument('-r', '--samplerate', type=int, help='sampling rate')
    parser.add_argument('-c', '--channels', type=int, default=1,
                        help='number of input channels')
    parser.add_argument('filename', nargs='?', metavar='FILENAME',
                        help='audio file to store recording to')
    parser.add_argument('-t', '--subtype', type=str,
                        help='sound file subtype (e.g. "PCM_24")')
    args = parser.parse_args()

    # AUDIO_CAPTURING
    import sounddevice as sd
    import soundfile as sf
    import numpy  # Make sure NumPy is loaded before it is used in the callback
    assert numpy  # avoid "imported but unused" message (W0611)

    if args.list_devices:
        print(sd.query_devices())
        parser.exit(0)
    if args.samplerate is None:
        device_info = sd.query_devices(args.device, 'input')
        # soundfile expects an int, sounddevice provides a float:
        args.samplerate = int(device_info['default_samplerate'])
    if args.filename is None:
        args.filename = tempfile.mktemp(prefix='candidate_recording',
                                        suffix='.wav', dir='')

    q = queue.Queue()

    def callback(indata, frames, time, status):
        """This is called (from a separate thread) for each audio block."""
        if status:
            print(status, file=sys.stderr)
        q.put(indata.copy())

    # Make sure the file is opened before recording anything:
    with sf.SoundFile(args.filename, mode='x', samplerate=args.samplerate,
                      channels=args.channels, subtype=args.subtype) as file:
        with sd.InputStream(samplerate=args.samplerate, device=args.device,
                            channels=args.channels, callback=callback):
            print('#' * 80)
            print('press q to stop the recording')
            print('#' * 80)
            while True:
                file.write(q.get())
                # try/except so that keys other than 'q' do not raise an
                # error and interrupt the recording loop.
                try:
                    if keyboard.is_pressed('q'):
                        print('You Pressed A Key!')
                        break
                    else:
                        pass
                except Exception:
                    break

    print('\nRecording finished: ' + repr(args.filename))

    # Transcribe the recorded audio file.
    AUDIO_FILE = args.filename
    # Use the audio file as the audio source.
    r = sr.Recognizer()
    with sr.AudioFile(AUDIO_FILE) as source:
        # Read the whole audio file; record() is used here instead of listen().
        audio = r.record(source)

    try:
        text = r.recognize_google(audio)
        print("The audio file contains: " + text)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition "
              "service; {0}".format(e))

    # Write audio to a WAV file.
    with open("microphone-results-11223344.wav", "wb") as f:
        f.write(audio.get_wav_data())

    # Write the converted text to a TXT file.
    t = open('microphone-results-11223344.txt', 'a')
    t.write(text)
    t.close()

    # PITCH_TRACKING
    signal = basic.SignalObj('microphone-results-11223344.wav')
    pitch = pYAAPT.yaapt(signal)
    # print(pitch.samp_values)
    print(len(pitch.samp_values))

    non_zero_pitch = []
    for i in range(len(pitch.samp_values)):
        if pitch.samp_values[i] > 0:
            non_zero_pitch.append(pitch.samp_values[i])
    print("*****************************")
    # print(non_zero_pitch)
    print(len(non_zero_pitch))

    high = []
    low = []
    for i in range(len(non_zero_pitch)):
        if non_zero_pitch[i] > 255:
            high.append(non_zero_pitch[i])
        elif non_zero_pitch[i] < 85:
            low.append(non_zero_pitch[i])

    avg_pitch = np.mean(non_zero_pitch)
    print("The average pitch value is: ", avg_pitch)
    if 85 <= avg_pitch <= 255:
        print("Appropriate Pitch Maintained", len(high), len(low))

    # GAPS_IN_AUDIO
    AudioSegment.converter = r"C:\\ffmpeg\\bin\\ffmpeg.exe"
    myaudio = AudioSegment.from_wav("microphone-results-11223344.wav")
    silent = silence.detect_silence(myaudio, min_silence_len=100,
                                    silence_thresh=-40)
    # Convert to seconds.
    silent = [((start / 1000), (stop / 1000)) for start, stop in silent]
    print("************************")
    print(silent)
    silent = np.asarray(silent)
    print(silent)
    print(silent.shape)

    diff = []
    count = 0
    for i in range(len(silent)):
        sub = silent[i][1] - silent[i][0]
        diff.append(sub)
    for i in range(len(diff)):
        if diff[i] > 1.3:
            count += 1
    print("Gaps greater than 1.3 seconds: ", count, " times")

    # POLARITY_CALCULATION
    f = open("microphone-results-11223344.txt", "r")
    if f.mode == 'r':
        contents = f.read()
    blob = TextBlob(contents)
    print("The Polarity of the recorded transcript is: ")
    for sentence in blob.sentences:
        print(sentence.sentiment.polarity)

    # SPEECH_RATE
    num_words = 0
    with open("microphone-results-11223344.txt", 'r') as f:
        for line in f:
            words = line.split()
            num_words += len(words)
    print("Number of words:", num_words)

    data_, sampling_rate_ = librosa.load("microphone-results-11223344.wav",
                                         sr=44100)
    secs = np.size(data_) / sampling_rate_
    print("Audio Length: ", str(secs), " seconds")

    silent_zones = np.sum(diff)
    eff_diff = secs - silent_zones
    print("Effective non-silent time period is: ", eff_diff)

    speech_rate = math.ceil((num_words / eff_diff) * 60)
    print("Speech rate is {} words per minute".format(speech_rate))
    if speech_rate < 110:
        print("Not a good speech rate: ", speech_rate)
    elif speech_rate >= 110 and speech_rate <= 165:
        print("Perfect speech rate: ", speech_rate)
    else:
        print("Very fast, either nervous or too excited: ", speech_rate)

    parser.exit(0)
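# Worked example of the speech-rate formula above: 120 words over 50 s of
# non-silent audio gives ceil((120 / 50) * 60) = 144 words per minute,
# inside the 110-165 "perfect" band checked above.
import math
assert math.ceil((120 / 50) * 60) == 144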
import sys

from schema import Line
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal as sg
from scipy import stats as st

plt.style.use('ggplot')

import amfm_decompy.pYAAPT as pYAAPT
import amfm_decompy.basic_tools as basic

lines = Line.select().where(Line.id << sys.argv[1:])
for line in lines:
    signal = basic.SignalObj(line.audio)
    pitch = pYAAPT.yaapt(signal)
    t = pitch.frames_pos / signal.fs
    p = pitch.samp_interp
    p[p == 0] = np.nan  # mask unvoiced frames
    fig, (a, b) = plt.subplots(1, 2, sharey=True,
                               num="#%i: " % line.id
                                   + line.text.replace('\n', ' '),
                               figsize=(10, 5))
    # a.plot(t, p, lw=1)
    a.set_ylim(50, 400)
    a.set_xlim(right=np.max(t))
params = {  # (leading entries of the original dict are not shown)
    'dec_factor': 1,
    'nccf_thresh1': 0.3,
    'nccf_thresh2': 0.9,
    'nccf_maxcands': 3,
    'nccf_pwidth': 5,  # 5
    'merit_boost': 5,
    'merit_pivot': 0.20,
    'merit_extra': 0.4,
    'median_value': 7,
    'dp_w1': 0.15,
    'dp_w2': 0.5,
    'dp_w3': 100,
    'dp_w4': 0.9
}
pitch = pYAAPT.yaapt(signal, **params)

frames = preprocessing.frame(silence_remove, frame_length, frame_overlap)
f, t, stft = fea.stft(silence_remove, pic=None, fs=sample_rate,
                      nperseg=frame_length, noverlap=frame_overlap,
                      nfft=8192, padded=True, boundary=None)
# The scipy call below overwrites the fea.stft result above.
f, t, stft = scipy.signal.stft(x=silence_remove, fs=sample_rate,
                               window='hann', nperseg=frame_length,
                               noverlap=frame_overlap, nfft=8192,
                               detrend=False, return_onesided=True,
                               boundary='zeros', padded=True, axis=-1)
print(pitch.samp_values.shape[0], frames.shape[1])

for i in range(min(pitch.samp_values.shape[0], frames.shape[1])):
    plt.figure()
    plt.subplot(211)
    X, _ = np.abs(fea.fft_singleside(x=frames[:, i], fs=sample_rate,
                                     n=8192, pic=None))
    plt.plot(np.arange(0, 8192 / 2 + 1), np.abs(stft[:, i]), 'y')
    plt.axvline(pitch.samp_interp[i], c='b')
    plt.axvline(pitch.samp_values[i], c='g')
    plt.subplot(212)
    plt.plot(np.arange(0, 8192 / 2 + 1), X, 'r')
speakers = ['awb', 'bdl', 'clb', 'jmk', 'ksp', 'rms', 'slt']
root = os.getcwd()
folderpath = os.path.join(root, 'datasets', speakers[0], 'wav')
files = sorted(os.listdir(folderpath))

# Read the first file.
for file in files:
    file = os.path.join(folderpath, file)
    fs, audio = wavread(file)
    break
# IPython.display.Audio(file)

# YAAPT pitches
signal = basic.SignalObj(file)
pitchY = pYAAPT.yaapt(signal, frame_length=25, frame_space=5,
                      f0_min=40, f0_max=300)

plt.plot(pitchY.values_interp, label='YAAPT', color='blue')
plt.xlabel('samples')
plt.ylabel('pitch (Hz)')
def get_pitch_decompy_int(wav):
    signal = basic.SignalObj(wav)
    pitch = pYAAPT.yaapt(signal)
    return remove_silence(pitch.samp_interp)
def preparePitchFeatureVector(filename):
    ## Find a better way to get pitches for our audio clip.
    ## Function prototype:
    ##   pysptk.sptk.rapt(x, fs, hopsize, min=60, max=240,
    ##                    voice_bias=0.0, otype='f0')
    ## The current approach removes zeroed (unvoiced) pitches.
    ## Still have to understand voiced and unvoiced data from audio.
    # pitches = pysptk.sptk.rapt(x=soundData, fs=sampleRate,
    #                            hopsize=totalFrames, min=60, max=250,
    #                            voice_bias=0.5, otype="f0")
    # pitches = pitches[np.nonzero(pitches)]

    '''------------------------------ PITCH FEATURES ------------------------------'''
    ## Example:
    ##   pitch = pYAAPT.yaapt(signal, **{'f0_min': 150.0,
    ##                                   'frame_length': 15.0,
    ##                                   'frame_space': 5.0})
    signal = basic.SignalObj(filename)
    pitch = pYAAPT.yaapt(
        signal,
        **{'f0_min': 60.0,
           'f0_max': 360,
           'frame_length': 25.0,
           'frame_space': 10.0})
    pitches = pitch.samp_values

    testSampleRate, testSoundData = wav.read(filename)
    getPitchesForEachFrame(testSoundData, testSampleRate)

    ## Mark the voiced part of the sound clip (0 = unvoiced, 1 = voiced).
    boolVoiced = np.array([])
    for i in range(len(pitches)):
        if pitches[i] == 0:
            boolVoiced = np.append(boolVoiced, 0)
        else:
            boolVoiced = np.append(boolVoiced, 1)

    ## First-order differences of the pitch track.
    derivativeOfPitches = np.array([])
    counter = 0
    for i in pitches:
        if counter == 0:
            delta = i
        else:
            delta = i - prev
        derivativeOfPitches = np.append(derivativeOfPitches, delta)
        prev = i
        counter += 1
    derivativeOfPitches = np.array(
        np.split(derivativeOfPitches, pitches.shape[0]))

    # print(boolVoiced)
    # sampleRate, soundData = wav.read(filename)
    # plt.subplot(2, 2, 1)
    # plt.plot(soundData)
    # plt.subplot(2, 2, 3)
    # plt.plot(pitch.samp_values)
    # plt.subplot(2, 2, 4)
    # plt.plot(boolVoiced)
    # plt.subplot(2, 2, 2)
    # plt.plot(voicedData(filename))
    # plt.show()
    # print("Pitches frames", pitches.shape)

    pitchFeatureVector = np.array([])

    ## soundData statistics -- features 0 to 4
    pitchFeatureVector = np.append(pitchFeatureVector, np.mean(pitches))
    pitchFeatureVector = np.append(pitchFeatureVector, np.median(pitches))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.max(pitches[np.nonzero(pitches)]))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.min(pitches[np.nonzero(pitches)]))
    pitchFeatureVector = np.append(pitchFeatureVector, np.var(pitches))

    ## soundData derivative statistics -- features 5 to 9
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.mean(derivativeOfPitches))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.median(derivativeOfPitches))
    pitchFeatureVector = np.append(
        pitchFeatureVector,
        np.max(derivativeOfPitches[np.nonzero(derivativeOfPitches)]))
    pitchFeatureVector = np.append(
        pitchFeatureVector,
        np.min(derivativeOfPitches[np.nonzero(derivativeOfPitches)]))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.var(derivativeOfPitches))
    # print(pitchFeatureVector.shape)
    '''------------------------------ PITCH FEATURES ------------------------------'''

    '''----------------------------- AVERAGE ENERGIES -----------------------------'''
    sr, sd = wav.read(filename)
    # boolVoiced = voicedData(sd, sr)
    frame = 50  # ms
    ## Convert the frame duration to a length in samples.
    frame = int(sr * (frame / 1000))
    ## Total number of frames in the sound.
    totalFrames = sd.shape[0] / frame
    framesToCut = int(frame * floor(totalFrames))
    ## Pad the overflow so the signal splits evenly into frames.
    valuesToPad = sd.shape[0] - framesToCut
    soundData = np.pad(sd, (0, frame - valuesToPad), 'constant')
    totalFrames = soundData.shape[0] / frame
    soundData = np.array(np.split(soundData, totalFrames))
    # soundData = soundData[:-1]

    ## Average energies of soundData.
    ## Have to ratio it according to voiced and unvoiced data.
    ## Function prototype:
    ##   librosa.feature.rmse(y=None, S=None, frame_length=2048,
    ##                        hop_length=512, center=True, pad_mode='reflect')
    # averageEnergies = np.mean(librosa.feature.rmse(
    #     y=soundData, hop_length=int(totalFrames), center=True,
    #     pad_mode='reflect').T, axis=0)[0]
    voicedEnergies = np.array([])
    unvoicedEnergies = np.array([])
    '''
    for i in range(boolVoiced.shape[0]):
        if (boolVoiced[i] == 0):
            unvoicedEnergies = np.append(unvoicedEnergies,
                                         AFE.stEnergy(soundData[i]))
        else:
            voicedEnergies = np.append(voicedEnergies,
                                       AFE.stEnergy(soundData[i]))

    ## Feature 10
    # print("voicedEnergies", voicedEnergies)
    voicedEnergies = np.mean(voicedEnergies)
    pitchFeatureVector = np.append(pitchFeatureVector, voicedEnergies)

    ## Feature 11
    unvoicedEnergies = np.mean(unvoicedEnergies)
    # print("unvoicedEnergies", voicedEnergies)
    pitchFeatureVector = np.append(pitchFeatureVector, unvoicedEnergies)

    ## Checking for NaN values
    if (np.isnan(pitchFeatureVector[11])):
        pitchFeatureVector[11] = 0

    #----------------------------- AVERAGE ENERGIES -----------------------------
    #------------------------------- SPEAKING RATE ------------------------------
    ## Speaking rate of soundData (inverse of the average length of the
    ## voiced part of an utterance).
    voicedParts = np.array([])
    # print(boolVoiced)
    LENGTH = 0
    for i in range(boolVoiced.shape[0]):
        if (boolVoiced[i] == 1):
            LENGTH += 1
        elif (LENGTH > 0 and boolVoiced[i] == 0):
            ## 50 because that is the frame length we are going with.
            voicedParts = np.append(voicedParts, LENGTH * (50 / 1000))
            LENGTH = 0
    if (LENGTH != 0):
        voicedParts = np.append(voicedParts, LENGTH * (50 / 1000))
        LENGTH = 0
    # print(voicedParts)

    ## Speaking rate made out to be in words per second.
    speakingRate = 1 / (np.mean(voicedParts))
    # print("Speaking rate :", speakingRate)

    ## Feature 12
    pitchFeatureVector = np.append(pitchFeatureVector, speakingRate)
    #------------------------------- SPEAKING RATE ------------------------------
    '''
    return pitchFeatureVector
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pYAAPT
import matplotlib.pyplot as plt
import numpy as np
import sys

if __name__ == "__main__":
    # Load audio.
    print(sys.argv[1])
    filename = sys.argv[1]
    signal = basic.SignalObj(filename)

    # YAAPT pitch.
    pitchY = pYAAPT.yaapt(signal, frame_length=40, tda_frame_length=40,
                          f0_min=75, f0_max=600)

    # Predict UPDRS from pitch statistics (linear model).
    val = pitchY.values_interp
    pred_UPDRS = (15.82 - 0.376 * np.median(val) + 0.305 * val.mean()
                  - 0.024 * val.std() - 0.005 * val.max())

    plt.plot(val)
    plt.xlabel('Samples')
    plt.ylabel('Pitch (Hz)')
    plt.title('Pitch over time')
    plt.savefig('person_name.png')

    print('$')
    print(pred_UPDRS)
import os.path

import amfm_decompy
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pyaapt
import amfm_decompy.pyQHM as pyqhm

# Declare the variables.
file_name = os.path.dirname(amfm_decompy.__file__) + os.sep + "sample.wav"
window_duration = 0.015  # in seconds
nharm_max = 25
SNR = float('Inf')

# Create the signal object.
signal = basic.SignalObj(file_name)

# Create the window object.
window = pyqhm.SampleWindow(window_duration, signal.fs)

# Create the pitch object and calculate its attributes.
pitch = pyaapt.yaapt(signal)

# Set the number of modulated components.
signal.set_nharm(pitch.values, nharm_max)

# Check if gaussian noise has to be added.
if SNR != float('Inf'):
    signal.noiser(pitch.values, SNR)

# Perform the QHM extraction.
QHM = pyqhm.qhm(signal, pitch, window, 0.001, N_iter=3, phase_tech='phase')
print("QHM SRER: {}".format(QHM.SRER))

# Perform the aQHM extraction. (The snippet is truncated here; the
# remaining arguments are assumed to mirror the QHM call above.)
aQHM = pyqhm.aqhm(signal, QHM, pitch, window, 0.001, N_iter=3, N_runs=2,
                  phase_tech='phase')
def avgTargetF0(self, targetSound):
    ipt = basic.SignalObj(targetSound)
    # NOTE: yaapt reads underscore keys ('f0_min'/'f0_max'); hyphenated
    # keys like 'f0-min' are silently ignored.
    pch = pyaapt.yaapt(ipt, **{'frame_length': 30.0, 'f0_min': 10.0,
                               'f0_max': 300.0, 'frame_space': 20.0})
    # Average only the voiced (non-zero) frames; list() is needed under
    # Python 3, where filter() returns an iterator.
    nonZero = list(filter(lambda x: x > 0, pch.values))
    print(np.mean(nonZero))
    return np.mean(nonZero)