Example #1
def extract_pitches():
    for movie in Movie.select().where(Movie.skip_line == True,
                                      Movie.skip_snippet == True,
                                      Movie.skip_pitch == False):

        print "Extracting pitches for '%s'" % movie.title

        bar = progressbar.ProgressBar()
        for line in bar(movie.lines.where(Line.pitch == None)):
            with db.transaction():

                try:
                    signal = basic.SignalObj(_full_path(line.audio))
                    pitch = pYAAPT.yaapt(signal)

                    t = pitch.frames_pos / signal.fs

                    # Gaussian filter
                    kern = sg.gaussian(20, 2)
                    lp = sg.filtfilt(kern, np.sum(kern), pitch.samp_interp)
                    lp[pitch.samp_values == 0] = np.nan

                    line.pitch = np.vstack((t, lp))
                except Exception as e:
                    print(e)
                    line.pitch = None

                line.save()

        movie.skip_pitch = True
        movie.save()
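The smoothing idiom in this example (zero-phase filtering of `samp_interp` with a normalized Gaussian kernel, then masking unvoiced frames) is reusable on its own. A minimal standalone sketch, assuming SciPy >= 1.1 for `sg.windows.gaussian` and a placeholder WAV path:

import numpy as np
import scipy.signal as sg
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pYAAPT

def smoothed_pitch_track(wav_path):
    signal = basic.SignalObj(wav_path)           # load the audio file
    pitch = pYAAPT.yaapt(signal)                 # default YAAPT settings
    kern = sg.windows.gaussian(20, 2)            # 20-tap Gaussian kernel, sigma = 2
    smooth = sg.filtfilt(kern, np.sum(kern), pitch.samp_interp)
    smooth[pitch.samp_values == 0] = np.nan      # hide unvoiced frames
    t = pitch.frames_pos / signal.fs             # frame positions in seconds
    return t, smooth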
Example #2
def YAAPT_fundamental_freq(signal,
                           frame_length=40,
                           tda_frame_length=40,
                           f0_min=70,
                           f0_max=600):
    pitchY = pYAAPT.yaapt(signal,
                          frame_length=frame_length,
                          tda_frame_length=tda_frame_length,
                          f0_min=f0_min,
                          f0_max=f0_max)
    data_YAAPT = pitchY.samp_values
    x_total = 0
    total = 0
    # Average only the voiced frames: YAAPT reports 0 Hz for unvoiced ones, and
    # the 10 Hz floor also discards spurious near-zero estimates.
    for value in data_YAAPT:
        if value > 10:
            x_total += value
            total += 1

    # Note: this raises ZeroDivisionError if no frame is voiced.
    mean_freq = x_total / total

    # return mean frequency of wave file
    return mean_freq
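A possible way to call this helper, with a hypothetical WAV path (`basic.SignalObj` comes from `amfm_decompy.basic_tools`):

import amfm_decompy.basic_tools as basic

signal = basic.SignalObj("speech.wav")   # hypothetical input file
print(YAAPT_fundamental_freq(signal))    # mean F0 over frames above 10 Hz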
Example #3
def get_pitch_decompy_values(wav, remove_silences=True, interpolate=True):
    signal = basic.SignalObj(wav)
    pitch = pYAAPT.yaapt(signal)

    ynew = pitch.samp_values

    # Take the pitch track, strip the silences at the start and end, and use
    # spline interpolation to fill in the remaining internal silences.
    if remove_silences:
        ynew, _, _ = remove_silence(ynew)

    if interpolate:
        ynew = spline_interpolation(ynew)

    return ynew
Example #4
def extract_and_analyze_pitches():
    for movie in Movie.select().where(Movie.skip_line == True,
                                      Movie.skip_snippet == True,
                                      Movie.skip_pitch == False):

        print "Extracting pitches for '%s'" % movie.title

        bar = progressbar.ProgressBar()
        for line in bar(movie.lines.where(Line.pitch == None)):
            with db.transaction():

                try:
                    signal = basic.SignalObj(_full_path(line.audio))
                    pitch = pYAAPT.yaapt(signal)

                    # Gaussian filter
                    kern = sg.gaussian(20, 2)
                    lp = sg.filtfilt(kern, np.sum(kern), pitch.samp_interp)
                    lp[pitch.samp_values == 0] = np.nan

                    kde = st.gaussian_kde(lp[~np.isnan(lp)])
                    locs = np.linspace(50, 400, 100)
                    vals = kde.evaluate(locs)

                    peak = locs[np.argmax(vals)]
                    line.sextimate = peak

                except Exception as e:
                    print(e)
                    line.pitch = None

                line.save()

        movie.skip_pitch = True
        movie.save()
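The only difference from Example #1 is the kernel density estimate, which reduces the smoothed contour to its single most probable value. That reduction also works as a standalone helper; a sketch, assuming `track` is a 1-D array of pitch values with NaN in unvoiced frames:

import numpy as np
import scipy.stats as st

def modal_pitch(track, lo=50, hi=400, n=100):
    """Most probable pitch in Hz, via a Gaussian kernel density estimate."""
    kde = st.gaussian_kde(track[~np.isnan(track)])
    locs = np.linspace(lo, hi, n)            # candidate pitch values in Hz
    return locs[np.argmax(kde.evaluate(locs))]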
Example #5
def extractF0_wrapper(input_wav,
                      min_f0=60,
                      max_f0=600,
                      frame_length=25,
                      frame_shift=5):
    """extractF0_wrapper(input_wav, min_f0 = 60, max_f0 = 600, 
                      frame_length = 25, frame_shift = 5)
    
    input
    -----
      input_wav: string, path to the input waveform
      min_f0: int, minimum F0 (default 60 Hz)
      max_f0: int, maximum F0 (default 600 Hz)
      frame_length: int, analysis frame length in ms (default 25ms)
      frame_shift: int, analysis frame shift in ms (default 5ms)
    
    output
    ------
      f0: np.array, f0 sequence in shape [number_of_frame]
    """
    if os.path.isfile(input_wav):
        signal = basic.SignalObj(input_wav)
        pitch = pYAAPT.yaapt(
            signal, **{
                'f0_min': min_f0,
                'f0_max': max_f0,
                'frame_length': frame_length,
                'frame_space': frame_shift
            })
        f0_value = pitch.samp_values
        f0_value = numpy.array(f0_value, dtype=numpy.float32)
        return f0_value
    else:
        print("Cannot find {:s}".format(input_wav))
        return None
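Example usage (the path is a placeholder):

f0 = extractF0_wrapper("speech.wav", min_f0=60, max_f0=600)
if f0 is not None:
    print(f0.shape, f0.dtype)   # (number_of_frames,) float32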
Example #6
def extractF0(input_wav,
              output_f0,
              min_f0=60,
              max_f0=400,
              frame_length=35,
              frame_shift=10):
    if os.path.isfile(input_wav):
        signal = basic.SignalObj(input_wav)
        pitch = pYAAPT.yaapt(
            signal, **{
                'f0_min': min_f0,
                'f0_max': max_f0,
                'frame_length': frame_length,
                'frame_space': frame_shift
            })
        f0_value = pitch.samp_values
        f0_value = numpy.asarray(f0_value, dtype=numpy.dtype('<f4'))

        with open(output_f0, 'wb') as f:
            f0_value.tofile(f, '')
        print("F0 processed: %s" % (output_f0))
    else:
        print("Cannot find %s" % (input_wav))
    return
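Because the track is dumped as raw little-endian float32 samples, it can be read back with `numpy.fromfile`; a quick sketch with a hypothetical file name:

import numpy

f0 = numpy.fromfile("utt0001.f0", dtype="<f4")   # hypothetical file written by extractF0
print(f0.shape)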
Example #7
def audio_pitch(file_name, demo_path, media_root):
    # load audio
    source_path = '{media_root}/dtw/{file_name}.wav'
    source_path = source_path.format(media_root=media_root, file_name=file_name)
    signal_source = basic.SignalObj(source_path)
    signal_target = basic.SignalObj(demo_path)

    # YAAPT pitches
    pitches_source = pYAAPT.yaapt(signal_source, frame_length=40, tda_frame_length=40, fft_length=2048, f0_min=75,
                                  f0_max=600)
    pitches_target = pYAAPT.yaapt(signal_target, frame_length=40, tda_frame_length=40, fft_length=2048, f0_min=75,
                                  f0_max=600)
    # Main
    wav, fs = librosa.load(source_path, sr=None)
    output = numpy.full(shape=(len(wav)), fill_value=0, dtype='float32')
    length = 4096
    for i in range(0, len(wav), int(length / 6)):
        # frame index in units of 10 ms
        time = int(i / fs * 100)
        if (time < len(pitches_source.samp_values) and time < len(pitches_target.samp_values)):
            source_pitch = pitches_source.samp_values[time]
            target_pitch = pitches_target.samp_values[time]
            # The logarithm base is the frequency ratio between two adjacent semitones.
            # For example, notes A3 and A4 have frequencies 220.0 Hz and 440.0 Hz; under
            # twelve-tone equal temperament they are 12 semitones apart, so with a constant
            # t the relation is 440 = 220 * t ** 12, i.e. t = pow(440 / 220, 1.0 / 12).
            n_steps = 0
            if (source_pitch != 0 and target_pitch != 0):
                n_steps = math.log(target_pitch / source_pitch, pow(2, 1.0 / 12))
            new_frame = librosa.effects.pitch_shift(y=numpy.hanning(len(wav[i:i + length])) * wav[i:i + length], sr=fs,
                                                    n_steps=n_steps)
            output[i:i + length] += new_frame

    # Reverb effect; it does not sound very good.
    # fx = (
    #     AudioEffectsChain()
    #     .highshelf()
    #     .reverb()
    #     # .phaser()
    #     .lowshelf()
    # )
    # output = fx(output)

    # Write the result to a file.
    output_path = '{media_root}/pitch/{file_name}.wav'
    output_path = output_path.format(media_root=media_root, file_name=file_name)
    # Note: librosa.output.write_wav was removed in librosa 0.8; on newer versions,
    # soundfile.write(output_path, output, fs) is the equivalent.
    librosa.output.write_wav(output_path, output, fs)
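The `n_steps` computation above is just the interval between the two F0 values measured in equal-tempered semitones, i.e. n_steps = 12 * log2(target / source). A quick numeric check of the identity from the comment:

import math

source_pitch, target_pitch = 220.0, 440.0           # A3 -> A4, one octave apart
t = pow(2, 1.0 / 12)                                # frequency ratio of one semitone
n_steps = math.log(target_pitch / source_pitch, t)  # same formula as in audio_pitch
print(round(n_steps))                               # -> 12 semitones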
Example #8
def preprocessing(ii):
    fname = nameNsize[ii][0]
    line = nameNsize[ii][1]
    phone = []
    for item in line.split(' '):
        temp = G2P(item)
        phone += temp.split()
        if item.find('!') != -1:
            phone += '!'
        elif item.find('?') != -1:
            phone += '?'
        elif item.find('.') != -1:
            phone += '.'
        elif item.find(',') != -1:
            phone += ','
        phone += ' '
    text = phone[:-1] + ['E']
    dic = [
        'P', '.', '!', '?', ',', 'k0', 'kk', 'nn', 't0', 'tt', 'rr', 'mm',
        'p0', 'pp', 's0', 'ss', 'oh', 'c0', 'cc', 'ch', 'kh', 'th', 'ph', 'h0',
        'aa', 'qq', 'ya', 'yq', 'vv', 'ee', 'yv', 'ye', 'oo', 'wa', 'wq', 'wo',
        'yo', 'uu', 'wv', 'we', 'wi', 'yu', 'xx', 'xi', 'ii', '', 'kf', 'ks',
        'nf', 'nc', 'ng', 'nh', 'tf', 'll', 'lk', 'lm', 'lb', 'ls', 'lt', 'lp',
        'lh', 'mf', 'pf', 'ps', ' ', 'E'
    ]
    char2idx = {ch: idx for idx, ch in enumerate(dic)}
    emblen = len(char2idx)
    txt = np.asarray([char2idx[ch] for ch in text])
    audio, sr = librosa.load('../kss/{}'.format(fname), sr=22050)
    audio, index = librosa.effects.trim(audio,
                                        top_db=43,
                                        frame_length=256,
                                        hop_length=64)
    audioobj = basic.SignalObj(audio, sr)
    pitch = pYAAPT.yaapt(
        audioobj, **{
            'f0_min': 100.0,
            'frame_length': 1000 * 1024 // 22050,
            'frame_space': 1000 * 256 // 22050
        })
    pitch = pitch.samp_values
    stft = np.abs(
        librosa.stft(audio, n_fft=1024, hop_length=256, win_length=1024))
    stft = np.power(stft / np.max(stft), 0.6)
    # Note: librosa >= 0.10 requires keyword arguments here,
    # i.e. librosa.filters.mel(sr=22050, n_fft=1024, n_mels=80).
    mel_filters = librosa.filters.mel(22050, 1024, 80)
    mel = np.dot(mel_filters, stft)
    mel = np.power(mel / np.max(mel), 0.6)
    mel = mel[:, ::4]
    pitch = pitch[::4]
    length = np.min([np.shape(mel)[1], np.shape(stft)[1] // 4])
    mel = mel[:, :length]
    stft = stft[:, :length * 4]
    pitch = pitch[:length]
    np.save('data/sample_{}.npy'.format(str(ii).zfill(5)),
            (txt, len(txt), mel, stft, mel.shape[1], pitch))
    print('\r{} saved'.format(ii), end='')
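The `frame_length`/`frame_space` arguments above are chosen so that YAAPT produces one pitch value per STFT column: YAAPT takes both in milliseconds, so the conversion from the 1024/256 STFT at 22050 Hz is ms = 1000 * samples // sr. The arithmetic:

sr, n_fft, hop = 22050, 1024, 256
frame_length_ms = 1000 * n_fft // sr   # 46 ms analysis window, matches win_length=1024
frame_space_ms = 1000 * hop // sr      # 11 ms hop, matches hop_length=256
print(frame_length_ms, frame_space_ms)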
Example #9
def getf0_viaYAAPT(music_loc):
    signal = basic.SignalObj(music_loc)
    pitch = pYAAPT.yaapt(signal, **{'f0_max': 2600.0})

    pitch_interp = pitch.samp_interp
    pitch_interp = lpf(pitch_interp, 15, 100)
    pitch_interp = medfilt(pitch_interp, 45)

    return pitch_interp.tolist()
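`lpf` is defined elsewhere in that project (`medfilt` is presumably `scipy.signal.medfilt`). One plausible stand-in for `lpf`, assuming it is a zero-phase Butterworth low-pass where the second and third arguments are the cutoff and the track's sample rate in Hz:

import scipy.signal

def lpf(x, cutoff_hz, rate_hz, order=4):
    """Hypothetical low-pass filter matching the lpf(...) call above."""
    b, a = scipy.signal.butter(order, cutoff_hz / (rate_hz / 2))  # normalized cutoff
    return scipy.signal.filtfilt(b, a, x)                         # zero-phase filtering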
Example #10
def extract_pitch(path):
    """
    Method to extract pitch values and energy
    :param path: path to the audio file
    :return: pitch values and pitch energy, averaged over number of frames
    """
    signal = basic.SignalObj(path)
    pitch = pYAAPT.yaapt(signal)
    avg_pitch = sum(map(np.array, pitch.samp_values)) / pitch.nframes
    avg_pitch_energy = sum(map(np.array, pitch.energy)) / pitch.nframes
    return avg_pitch, avg_pitch_energy
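Example usage (path hypothetical); both return values are scalars, since `samp_values` and `energy` hold one number per frame:

avg_pitch, avg_pitch_energy = extract_pitch("speech.wav")
print(avg_pitch, avg_pitch_energy)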
Example #11
	def __init__(self, inputSound, targetSound):
		self.inputSound = basic.SignalObj(inputSound)
		self.inputFs = self.inputSound.fs
		#self.inputPitch = pyaapt.yaapt(self.inputSound, **{'f0_min': 75.0, 'f0_max': 500.0, 'frame_length': 15.0, 'frame_space': 10.0})
		self.inputPitch = pyaapt.yaapt(self.inputSound)
		self.inputF0 = self.inputPitch.values
		self.targetF0 = self.avgTargetF0(targetSound)
		self.targetFs, self.targetSound = wavfile.read(targetSound)
		self.resampleInputF0()
		self.na = None
		self.relateF0()
		self.zeroThresh = 5 #number of 0s in na factor array to count as a break in between input sound words
Example #12
def convert_wav_to_f0_ascii(fname, fs, directory=''):
    print('convert', fname)
    # Create the signal object.
    signal = basic.SignalObj(fname)
    # Get time interval and num_samples
    t_start = 0.0
    num_samples = signal.size
    t_end = num_samples / signal.fs
    t = np.linspace(t_start, t_end, num_samples)
    # Create the pitch object and calculate its attributes.
    pitch = pyaapt.yaapt(signal)
    # Create .f0_ascii file and dump f0 to it
    output_fname = directory+os.path.splitext(os.path.basename(fname))[0]+'.f0_ascii'
    with open(output_fname, 'w') as f:
        for i in range(pitch.nframes):
            f0 = pitch.samp_values[i]
            vu = 1.0 if pitch.vuv[i] else 0.0
            fe = pitch.energy[i] * pitch.mean_energy
            line = '{} {} {} {}\n'.format(f0, vu, fe, vu)
            f.write(line)
    step = int(signal.fs // fs)
    output_f0 = pitch.values_interp[0:signal.size:step]
    return (os.path.splitext(os.path.basename(fname))[0], output_f0)
Example #13
def get_f0(audio, rate=16000):
    try:
        import amfm_decompy.basic_tools as basic
        import amfm_decompy.pYAAPT as pYAAPT
        from librosa.util import normalize
    except ImportError:
        raise ImportError(
            "Please install amfm_decompy (`pip install AMFM-decompy`) "
            "and librosa (`pip install librosa`).")

    assert audio.ndim == 1
    frame_length = 20.0  # ms
    to_pad = int(frame_length / 1000 * rate) // 2

    audio = normalize(audio) * 0.95
    audio = np.pad(audio, (to_pad, to_pad), "constant", constant_values=0)
    audio = basic.SignalObj(audio, rate)
    pitch = pYAAPT.yaapt(
        audio,
        frame_length=frame_length,
        frame_space=F0_FRAME_SPACE * 1000,
        nccf_thresh1=0.25,
        tda_frame_length=25.0,
    )
    f0 = pitch.samp_values
    return f0
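A quick self-contained check with a synthetic tone. `F0_FRAME_SPACE` is a module-level constant in the original code; 0.005 s (a 5 ms hop) is an assumed value here:

import numpy as np

F0_FRAME_SPACE = 0.005   # assumed hop in seconds; defined elsewhere in the original module

rate = 16000
t = np.arange(rate) / rate                    # one second of audio
tone = np.sin(2 * np.pi * 120.0 * t)          # 120 Hz test tone
f0 = get_f0(tone, rate=rate)
print(f0.shape, float(np.mean(f0[f0 > 0])))   # voiced frames should sit near 120 Hz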
Example #14
import os.path

# Imports assumed by this snippet (not shown in the original):
import amfm_decompy
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pyaapt
import amfm_decompy.pyQHM as pyqhm

# Declare the variables.
file_name = os.path.dirname(amfm_decompy.__file__)+os.sep+"sample.wav"
window_duration = 0.015   # in seconds
nharm_max = 25
SNR = float('Inf')

# Create the signal object.
signal = basic.SignalObj(file_name)

# Create the window object.
window = pyqhm.SampleWindow(window_duration, signal.fs)

# Create the pitch object and calculate its attributes.
pitch = pyaapt.yaapt(signal)

# Set the number of modulated components.
signal.set_nharm(pitch.values, nharm_max)

# Check if gaussian noise has to be added.
if SNR != float('Inf'):
    signal.noiser(pitch.values, SNR)

# Perform the QHM extraction.
QHM = pyqhm.qhm(signal, pitch, window, 0.001, N_iter = 3, phase_tech = 'phase')

print ("QHM SRER: %s" % (QHM.SRER))

# Perform the aQHM extraction.
aQHM = pyqhm.aqhm(signal, QHM, pitch, window, 0.001, N_iter = 3, N_runs = 2,
                  phase_tech = 'phase')  # trailing argument assumed, mirroring the QHM call above
Example #15
def voiceRecognition():
    def int_or_str(text):
        """Helper function for argument parsing."""
        try:
            return int(text)
        except ValueError:
            return text

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-l',
                        '--list-devices',
                        action='store_true',
                        help='show list of audio devices and exit')
    parser.add_argument('-d',
                        '--device',
                        type=int_or_str,
                        help='input device (numeric ID or substring)')
    parser.add_argument('-r', '--samplerate', type=int, help='sampling rate')
    parser.add_argument('-c',
                        '--channels',
                        type=int,
                        default=1,
                        help='number of input channels')
    parser.add_argument('filename',
                        nargs='?',
                        metavar='FILENAME',
                        help='audio file to store recording to')
    parser.add_argument('-t',
                        '--subtype',
                        type=str,
                        help='sound file subtype (e.g. "PCM_24")')
    args = parser.parse_args()

    # AUDIO_CAPTURING
    import sounddevice as sd
    import soundfile as sf
    import numpy  # Make sure NumPy is loaded before it is used in the callback
    assert numpy  # avoid "imported but unused" message (W0611)

    if args.list_devices:
        print(sd.query_devices())
        parser.exit(0)
    if args.samplerate is None:
        device_info = sd.query_devices(args.device, 'input')
        # soundfile expects an int, sounddevice provides a float:
        args.samplerate = int(device_info['default_samplerate'])
    if args.filename is None:
        args.filename = tempfile.mktemp(prefix='candidate_recording',
                                        suffix='.wav',
                                        dir='')
    q = queue.Queue()

    def callback(indata, frames, time, status):
        """This is called (from a separate thread) for each audio block."""
        if status:
            print(status, file=sys.stderr)
        q.put(indata.copy())

    # Make sure the file is opened before recording anything:
    with sf.SoundFile(args.filename,
                      mode='x',
                      samplerate=args.samplerate,
                      channels=args.channels,
                      subtype=args.subtype) as file:
        with sd.InputStream(samplerate=args.samplerate,
                            device=args.device,
                            channels=args.channels,
                            callback=callback):
            print('#' * 80)
            print('press q to stop the recording')
            print('#' * 80)
            # while True:
            #     file.write(q.get())
            i = 0
            while True:  # making a loop
                #print(i)
                i = i + 1
                file.write(q.get())
                try:  # ignore the error if a key other than 'q' is pressed
                    if keyboard.is_pressed('q'):  # if key 'q' is pressed
                        print('You Pressed A Key!')
                        break  # finishing the loop
                    else:
                        pass
                except:  # keyboard.is_pressed may fail (e.g. without privileges); stop recording
                    break

    print('\nRecording finished: ' + repr(args.filename))

    # Transcribe the recorded audio file.

    AUDIO_FILE = args.filename

    # use the audio file as the audio source

    r = sr.Recognizer()

    with sr.AudioFile(AUDIO_FILE) as source:
        # Read the entire audio file; record() is used instead of listen().
        audio = r.record(source)

    text = ""  # fallback so the transcript file can still be written if recognition fails
    try:
        text = r.recognize_google(audio)
        print("The audio file contains: " + text)

    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")

    except sr.RequestError as e:
        print(
            "Could not request results from Google Speech Recognition service; {0}"
            .format(e))

    # write audio to a WAV file
    with open("microphone-results-11223344.wav", "wb") as f:
        f.write(audio.get_wav_data())

    # write the converted text to a TXT file
    with open('microphone-results-11223344.txt', 'a') as t:
        t.write(text)

    # PITCH_TRACKING
    signal = basic.SignalObj('microphone-results-11223344.wav')
    pitch = pYAAPT.yaapt(signal)
    #print(pitch.samp_values)
    print(len(pitch.samp_values))

    non_zero_pitch = []
    for i in range(len(pitch.samp_values)):
        if pitch.samp_values[i] > 0:
            non_zero_pitch.append(pitch.samp_values[i])
    print("*****************************")
    #print(non_zero_pitch)
    print(len(non_zero_pitch))

    high = []
    low = []
    for i in range(len(non_zero_pitch)):
        if non_zero_pitch[i] > 255:
            high.append(non_zero_pitch[i])
        elif non_zero_pitch[i] < 85:
            low.append(non_zero_pitch[i])

    avg_pitch = np.mean(non_zero_pitch)
    print("The average pitch value is: ", avg_pitch)

    if 85 <= avg_pitch <= 255:
        print("Appropriate Pitch Maintained", len(high), len(low))

    # GAPS_IN_AUDIO
    AudioSegment.converter = r"C:\\ffmpeg\\bin\\ffmpeg.exe"
    myaudio = AudioSegment.from_wav("microphone-results-11223344.wav")
    silent = silence.detect_silence(myaudio,
                                    min_silence_len=100,
                                    silence_thresh=-40)
    silent = [((start / 1000), (stop / 1000))
              for start, stop in silent]  #convert to sec
    print("************************")
    print(silent)
    silent = np.asarray(silent)
    print(silent)
    print(silent.shape)

    diff = []
    count = 0
    for i in range(len(silent)):
        sub = silent[i][1] - silent[i][0]
        diff.append(sub)

    for i in range(len(diff)):
        if diff[i] > 1.3:
            count += 1

    print("Gaps greater than 1.3 seconds: ", count, " times")

    # POLARITY_CALCULATION
    with open("microphone-results-11223344.txt", "r") as f:
        contents = f.read()

    blob = TextBlob(contents)
    print("The Polarity of the recorded transcript is: ")
    for sentence in blob.sentences:
        print(sentence.sentiment.polarity)

    # SPEECH_RATE
    num_words = 0
    with open("microphone-results-11223344.txt", 'r') as f:
        for line in f:
            words = line.split()
            num_words += len(words)

    print("Number of words:", num_words)
    data_, sampling_rate_ = librosa.load("microphone-results-11223344.wav",
                                         sr=44100)
    secs = np.size(data_) / sampling_rate_
    print("Audio Length: ", str(secs), " seconds")

    silent_zones = np.sum(diff)
    eff_diff = secs - silent_zones
    print("Effective non-silent time period is: ", eff_diff)

    speech_rate = math.ceil((num_words / eff_diff) * 60)
    print("Speech rate is {} words per minute".format(speech_rate))

    if speech_rate < 110:
        print("Not a good speech rate: ", speech_rate)
    elif speech_rate >= 110 and speech_rate <= 165:
        print("Perfect speech rate: ", speech_rate)
    else:
        print("Very fast, either nervous or too excited: ", speech_rate)

    parser.exit(0)
Example #16
import sys

from schema import Line
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal as sg
from scipy import stats as st

plt.style.use('ggplot')

import amfm_decompy.pYAAPT as pYAAPT
import amfm_decompy.basic_tools as basic

lines = Line.select().where(Line.id << sys.argv[1:])

for line in lines:
    signal = basic.SignalObj(line.audio)
    pitch = pYAAPT.yaapt(signal)

    t = pitch.frames_pos / signal.fs
    p = pitch.samp_interp
    p[p == 0] = np.nan

    fig, (a,b) = plt.subplots(1, 2, sharey=True,
                              num="#%i: " % (line.id) \
                                 + line.text.replace('\n', ' '),
                              figsize=(10,5),
                              )

    #a.plot(t, p, lw=1)
    a.set_ylim(50, 400)
    a.set_xlim(right=np.max(t))
Example #17
params = {  # dict opening inferred from the **params call below; earlier entries not shown
          'dec_factor': 1,
          'nccf_thresh1': 0.3,
          'nccf_thresh2': 0.9,
          'nccf_maaxcands': 3,
          'nccf_pwidth': 5,       # 5
          'merit_boost': 5,
          'merit_pivot': 0.20,
          'merit_extra': 0.4,
          'median_value': 7,
          'dp_w1': 0.15,
          'dp_w2': 0.5,
          'dp_w3': 100,
          'dp_w4': 0.9
          }

pitch = pYAAPT.yaapt(signal, **params)
frames = preprocessing.frame(silence_remove, frame_length, frame_overlap)
f, t, stft = fea.stft(silence_remove, pic=None, fs=sample_rate, nperseg=frame_length,
         noverlap=frame_overlap, nfft=8192, padded=True, boundary=None)
f,t,stft = scipy.signal.stft(x=silence_remove, fs=sample_rate, window='hann', nperseg=frame_length, noverlap=frame_overlap,
                  nfft=8192, detrend=False, return_onesided=True, boundary='zeros', padded=True, axis=-1)
print(pitch.samp_values.shape[0], frames.shape[1])
for i in range(min(pitch.samp_values.shape[0], frames.shape[1])):
    plt.figure()
    plt.subplot(211)
    X, _ = np.abs(fea.fft_singleside(x=frames[:,i], fs=sample_rate, n=8192, pic=None))
    plt.plot(np.arange(0, 8192/2+1), np.abs(stft[:,i]), 'y')
    plt.axvline(pitch.samp_interp[i], c='b')
    plt.axvline(pitch.samp_values[i], c='g')
    plt.subplot(212)
    plt.plot(np.arange(0, 8192 / 2 + 1), X, 'r')
Example #18
# Imports assumed by this snippet (not shown in the original):
import os
import matplotlib.pyplot as plt
from scipy.io.wavfile import read as wavread
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pYAAPT

speakers = ['awb','bdl','clb','jmk','ksp','rms','slt']
root = os.getcwd()
folderpath = os.path.join(root,'datasets',speakers[0],'wav')
files = sorted(os.listdir(folderpath))


# Read the files
for file in files:
    file = os.path.join(folderpath,file)
    fs,audio = wavread(file)
    break
# IPython.display.Audio(file)

# YAAPT pitches
signal = basic.SignalObj(file)
pitchY = pYAAPT.yaapt(signal, frame_length=25, frame_space=5, f0_min=40, f0_max=300)

plt.plot(pitchY.values_interp, label='YAAPT', color='blue')
plt.xlabel('samples')
plt.ylabel('pitch (Hz)')

Example #19
def get_pitch_decompy_int(wav):
    signal = basic.SignalObj(wav)
    pitch = pYAAPT.yaapt(signal)
    return remove_silence(pitch.samp_interp)
Example #20
def preparePitchFeatureVector(filename):
    ## Find a better way to get pitches for our audio clip
    ## Function prototype: pysptk.sptk.rapt(x, fs, hopsize, min=60, max=240, voice_bias=0.0, otype='f0')
    ## The current approach removes zeroed (unvoiced) pitches
    ## Still have to understand voiced and unvoiced data from audio
    # pitches = pysptk.sptk.rapt(x=soundData, fs=sampleRate, hopsize=totalFrames, min=60, max=250, voice_bias=0.5, otype="f0")
    # pitches = pitches[np.nonzero(pitches)]
    '''-----------------------------------------------------PITCH FEATURES-----------------------------------------------------'''
    ## Example : pitch = pYAAPT.yaapt(signal, **{'f0_min' : 150.0, 'frame_length' : 15.0, 'frame_space' : 5.0})
    signal = basic.SignalObj(filename)
    pitch = pYAAPT.yaapt(
        signal, **{
            'f0_min': 60.0,
            'f0_max': 360,
            'frame_length': 25.0,
            'frame_space': 10.0
        })
    pitches = pitch.samp_values

    testSampleRate, testSoundData = wav.read(filename)

    getPitchesForEachFrame(testSoundData, testSampleRate)

    ## Getting the voiced part of the sound clip
    boolVoiced = np.array([])

    for i in range(len(pitches)):
        if (pitches[i] == 0):
            boolVoiced = np.append(boolVoiced, 0)
        else:
            boolVoiced = np.append(boolVoiced, 1)

    derivativeOfPitches = np.array([])

    counter = 0
    for i in pitches:
        if counter == 0:
            delta = i
            derivativeOfPitches = np.append(derivativeOfPitches, delta)
        else:
            delta = i - prev
            derivativeOfPitches = np.append(derivativeOfPitches, delta)
        prev = i
        counter += 1

    derivativeOfPitches = np.array(
        np.split(derivativeOfPitches, pitches.shape[0]))

    # print(boolVoiced)

    # sampleRate, soundData = wav.read(filename)
    # plt.subplot(2, 2, 1)
    # plt.plot(soundData)
    # plt.subplot(2, 2, 3)
    # plt.plot(pitch.samp_values)
    # plt.subplot(2, 2, 4)
    # plt.plot(boolVoiced)
    # plt.subplot(2, 2, 2)
    # plt.plot(voicedData(filename))
    # plt.show()
    # print("Pitches frames", pitches.shape)

    pitchFeatureVector = np.array([])

    ## soundData statistics
    ## Features 0 to 4
    pitchFeatureVector = np.append(pitchFeatureVector, np.mean(pitches))
    pitchFeatureVector = np.append(pitchFeatureVector, np.median(pitches))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.max(pitches[np.nonzero(pitches)]))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.min(pitches[np.nonzero(pitches)]))
    pitchFeatureVector = np.append(pitchFeatureVector, np.var(pitches))

    ## soundData Derivative statistics
    ## Features 5 to 9
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.mean(derivativeOfPitches))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.median(derivativeOfPitches))
    pitchFeatureVector = np.append(
        pitchFeatureVector,
        np.max(derivativeOfPitches[np.nonzero(derivativeOfPitches)]))
    pitchFeatureVector = np.append(
        pitchFeatureVector,
        np.min(derivativeOfPitches[np.nonzero(derivativeOfPitches)]))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.var(derivativeOfPitches))

    # print(pitchFeatureVector.shape)
    '''-----------------------------------------------------PITCH FEATURES-----------------------------------------------------'''
    '''-----------------------------------------------------AVERAGE ENERGIES-----------------------------------------------------'''
    sr, sd = wav.read(filename)
    # boolVoiced = voicedData(sd, sr)

    frame = 50  #ms

    ## Finding the frame length for sound corresponding to the time frames
    frame = int(sr * (frame / 1000))

    ## Finding the total number of frames of our sound
    totalFrames = sd.shape[0] / frame
    framesToCut = int(frame * floor(totalFrames))

    ## Padding the overflowing
    valuesToPad = sd.shape[0] - framesToCut
    soundData = np.pad(sd, (0, frame - valuesToPad), 'constant')
    totalFrames = soundData.shape[0] / frame

    soundData = np.array(np.split(soundData, totalFrames))
    # soundData = soundData[:-1]

    ## Average energies of soundData
    ## Have to ratio it according to voiced and unvoiced data
    ## Function prototype :  librosa.feature.rmse(y=None, S=None, frame_length=2048, hop_length=512, center=True, pad_mode='reflect)
    # averageEnergies = np.mean(librosa.feature.rmse(y=soundData, hop_length=int(totalFrames), center=True, pad_mode='reflect').T, axis=0)[0]
    voicedEnergies = np.array([])
    unvoicedEnergies = np.array([])
    '''
    for i in range(boolVoiced.shape[0]):
        if (boolVoiced[i] == 0):
            unvoicedEnergies = np.append(unvoicedEnergies, AFE.stEnergy(soundData[i]))
        else:
            voicedEnergies = np.append(voicedEnergies, AFE.stEnergy(soundData[i]))
 
    ## Feature 10
    # print("voicedEnergies", voicedEnergies)
    voicedEnergies = np.mean(voicedEnergies)
    pitchFeatureVector = np.append(pitchFeatureVector, voicedEnergies)
    ## Feature 11
    unvoicedEnergies = np.mean(unvoicedEnergies)
    # print("unvoicedEnergies", voicedEnergies)
    pitchFeatureVector = np.append(pitchFeatureVector, unvoicedEnergies)
    ## Checking for NaN values
    if (np.isnan(pitchFeatureVector[11])):
        pitchFeatureVector[11] = 0
 
    #-----------------------------------------------------AVERAGE ENERGIES-----------------------------------------------------
 
    #------------------------------------------------------SPEAKING RATE------------------------------------------------------
    ## Speaking rate of soundData (inverse of the average length of the voiced part of an utterance)
    voicedParts = np.array([])
    # print(boolVoiced)
 
    LENGTH = 0
    for i in range(boolVoiced.shape[0]):
        if (boolVoiced[i] == 1):
            LENGTH += 1
        elif (LENGTH > 0 and boolVoiced[i] == 0):
            ## 50 because thats the frame length we are going with.
            voicedParts = np.append(voicedParts, LENGTH*(50/1000))
            LENGTH = 0
     
    if (LENGTH != 0):
        voicedParts = np.append(voicedParts, LENGTH*(50/1000))
        LENGTH = 0
 
 
    # print(voicedParts)
    ## Speaking rate made out to be in words per second
    # print(boolVoiced)     
    speakingRate = 1 / (np.mean(voicedParts))
    # print("Speaking rate :", speakingRate)
    ## Feature 12
    pitchFeatureVector = np.append(pitchFeatureVector, speakingRate)
    #------------------------------------------------------SPEAKING RATE------------------------------------------------------
    '''

    return pitchFeatureVector
Example #21
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pYAAPT
import matplotlib.pyplot as plt
import numpy as np
import sys

if __name__ == "__main__":
    # load audio
    print(sys.argv[1])
    filename = sys.argv[1]
    signal = basic.SignalObj(filename)
    # YAAPT pitches
    pitchY = pYAAPT.yaapt(signal,
                          frame_length=40,
                          tda_frame_length=40,
                          f0_min=75,
                          f0_max=600)
    #get values
    val = pitchY.values_interp
    pred_UPDRS = 15.82 - 0.376 * np.median(
        val) + 0.305 * val.mean() - .024 * val.std() - .005 * val.max()
    plt.plot(val)
    plt.xlabel('Frames')
    plt.ylabel('Pitch (hz)')
    plt.title('Pitch over time')
    plt.savefig('person_name.png')
    print('$')
    print(pred_UPDRS)
Example #22
import os.path

# Imports assumed by this snippet (not shown in the original):
import amfm_decompy
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pyaapt
import amfm_decompy.pyQHM as pyqhm

# Declare the variables.
file_name = os.path.dirname(amfm_decompy.__file__) + os.sep + "sample.wav"
window_duration = 0.015  # in seconds
nharm_max = 25
SNR = float('Inf')

# Create the signal object.
signal = basic.SignalObj(file_name)

# Create the window object.
window = pyqhm.SampleWindow(window_duration, signal.fs)

# Create the pitch object and calculate its attributes.
pitch = pyaapt.yaapt(signal)

# Set the number of modulated components.
signal.set_nharm(pitch.values, nharm_max)

# Check if gaussian noise has to be added.
if SNR != float('Inf'):
    signal.noiser(pitch.values, SNR)

# Perform the QHM extraction.
QHM = pyqhm.qhm(signal, pitch, window, 0.001, N_iter=3, phase_tech='phase')

print("QHM SRER: {}".format(QHM.SRER))

# Perform the aQHM extraction.
aQHM = pyqhm.aqhm(signal, QHM, pitch, window, 0.001, N_iter=3, N_runs=2,
                  phase_tech='phase')  # trailing arguments assumed, mirroring the QHM call above
Example #23
	def avgTargetF0(self, targetSound):
		ipt = basic.SignalObj(targetSound)
		# Note: the original passed 'f0-min'/'f0-max', which pYAAPT silently ignores;
		# the recognized keyword names are 'f0_min' and 'f0_max'.
		pch = pyaapt.yaapt(ipt, **{'frame_length': 30.0, 'f0_min': 10.0, 'f0_max': 300.0, 'frame_space': 20.0})
		nonZero = [x for x in pch.values if x > 0]  # a list, so np.mean works on Python 3
		print(np.mean(nonZero))
		return np.mean(nonZero)