Example #1
def extractF0_wrapper(input_wav,
                      min_f0=60,
                      max_f0=600,
                      frame_length=25,
                      frame_shift=5):
    """extractF0_wrapper(input_wav, min_f0 = 60, max_f0 = 600, 
                      frame_length = 25, frame_shift = 5)
    
    input
    -----
      input_wav: string, path to the input waveform
      min_f0: int, minimum F0 (default 60 Hz)
      max_f0: int, maximum F0 (default 600 Hz)
      frame_length: int, analysis frame length in ms (default 25 ms)
      frame_shift: int, analysis frame shift in ms (default 5 ms)
    
    output
    ------
      f0: np.array, f0 sequence in shape [number_of_frames]
    """
    if os.path.isfile(input_wav):
        signal = basic.SignalObj(input_wav)
        pitch = pYAAPT.yaapt(
            signal, **{
                'f0_min': min_f0,
                'f0_max': max_f0,
                'frame_length': frame_length,
                'frame_space': frame_shift
            })
        f0_value = pitch.samp_values
        f0_value = numpy.array(f0_value, dtype=numpy.float32)
        return f0_value
    else:
        print("Cannot find {:s}".format(input_wav))
        return None
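# Illustrative usage of extractF0_wrapper above (not part of the original example; the WAV
# path is a placeholder assumption). With the default frame_shift, one F0 value is returned
# per 5 ms frame, and unvoiced frames are reported as 0.
f0 = extractF0_wrapper('example.wav')
if f0 is not None:
    print('frames: {:d}, voiced mean F0: {:.1f} Hz'.format(f0.shape[0], float(f0[f0 > 0].mean())))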
Example #2
def extract_pitches():
    for movie in Movie.select().where(Movie.skip_line == True,
                                      Movie.skip_snippet == True,
                                      Movie.skip_pitch == False):

        print "Extracting pitches for '%s'" % movie.title

        bar = progressbar.ProgressBar()
        for line in bar(movie.lines.where(Line.pitch == None)):
            with db.transaction():

                try:
                    signal = basic.SignalObj(_full_path(line.audio))
                    pitch = pYAAPT.yaapt(signal)

                    t = pitch.frames_pos / signal.fs

                    # Gaussian filter
                    kern = sg.gaussian(20, 2)
                    lp = sg.filtfilt(kern, np.sum(kern), pitch.samp_interp)
                    lp[pitch.samp_values == 0] = np.nan

                    line.pitch = np.vstack((t, lp))
                except Exception as e:
                    print(e)
                    line.pitch = None

                line.save()

        movie.skip_pitch = True
        movie.save()
Example #3
def extractF0(input_wav,
              output_f0,
              min_f0=60,
              max_f0=400,
              frame_length=35,
              frame_shift=10):
    if os.path.isfile(input_wav):
        signal = basic.SignalObj(input_wav)
        pitch = pYAAPT.yaapt(
            signal, **{
                'f0_min': min_f0,
                'f0_max': max_f0,
                'frame_length': frame_length,
                'frame_space': frame_shift
            })
        f0_value = pitch.samp_values
        datatype = numpy.dtype('<f4')
        f0_value = numpy.asarray(f0_value, dtype=datatype)

        f = open(output_f0, 'wb')
        f0_value.tofile(f, '')
        f.close()
        print("F0 processed: %s" % (output_f0))
    else:
        print("Cannot find %s" % (input_wav))
    return
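# Hedged sketch of reading back the binary file written by extractF0 above; the path is a
# placeholder assumption. dtype '<f4' (little-endian float32) matches the dtype used with tofile().
f0_read = numpy.fromfile('output.f0', dtype='<f4')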
Example #4
def get_pitch_decompy_values(wav, remove_silencess=True, interpolate=True):
    #print('get_pitch_decompy_values\npath = {}\n'.format(wav))
    signal = basic.SignalObj(wav)
    '''
    plt.title('signal')
    plt.plot(signal.data, color = 'm')
    '''
    pitch = pYAAPT.yaapt(signal)
    
    ynew = pitch.samp_values

    rv = len(pitch.samp_values)
    sv = 0
    iv = 0
    
    # Take the pitch, strip the silences at the beginning and end, and apply
    # spline_interpolation to fill the intermediate silences by interpolation.
    if remove_silencess:
        ynew, _, _ =  remove_silence(ynew)
        sv = len(ynew)

    #print('  y_before_remove_silence = {}'.format(len(pitch.samp_values)))
    #print('  y_to_spline_len = {}'.format(len(ynew)))
    if interpolate:
        #print('     interpolating')
        #_, _, ynew, _ = spline_interpolation(ynew)
        ynew = spline_interpolation(ynew)
        iv = len(ynew)
    
    return ynew
Example #5
def extract_and_analyze_pitches():
    for movie in Movie.select().where(Movie.skip_line == True,
                                      Movie.skip_snippet == True,
                                      Movie.skip_pitch == False):

        print "Extracting pitches for '%s'" % movie.title

        bar = progressbar.ProgressBar()
        for line in bar(movie.lines.where(Line.pitch == None)):
            with db.transaction():

                try:
                    signal = basic.SignalObj(_full_path(line.audio))
                    pitch = pYAAPT.yaapt(signal)

                    # Gaussian filter
                    kern = sg.gaussian(20, 2)
                    lp = sg.filtfilt(kern, np.sum(kern), pitch.samp_interp)
                    lp[pitch.samp_values == 0] = np.nan

                    kde = st.gaussian_kde(lp[~np.isnan(lp)])
                    locs = np.linspace(50, 400, 100)
                    vals = kde.evaluate(locs)

                    peak = locs[np.argmax(vals)]
                    line.sextimate = peak

                except Exception as e:
                    print(e)
                    line.pitch = None

                line.save()

        movie.skip_pitch = True
        movie.save()
Example #6
def audio_pitch(file_name, demo_path, media_root):
    # load audio
    source_path = '{media_root}/dtw/{file_name}.wav'
    source_path = source_path.format(media_root=media_root, file_name=file_name)
    signal_source = basic.SignalObj(source_path)
    signal_target = basic.SignalObj(demo_path)

    # YAAPT pitches
    pitches_source = pYAAPT.yaapt(signal_source, frame_length=40, tda_frame_length=40, fft_length=2048, f0_min=75,
                                  f0_max=600)
    pitches_target = pYAAPT.yaapt(signal_target, frame_length=40, tda_frame_length=40, fft_length=2048, f0_min=75,
                                  f0_max=600)
    # Main
    wav, fs = librosa.load(source_path, sr=None)
    output = numpy.full(shape=(len(wav)), fill_value=0, dtype='float32')
    length = 4096
    for i in range(0, len(wav), int(length / 6)):
        # time: /10ms
        time = int(i / fs * 100)
        if (time < len(pitches_source.samp_values) and time < len(pitches_target.samp_values)):
            source_pitch = pitches_source.samp_values[time]
            target_pitch = pitches_target.samp_values[time]
            # The base constant is the frequency ratio between two adjacent semitones.
            # For example, the notes A3 and A4 have frequencies 220.0 Hz and 440.0 Hz; by twelve-tone
            # equal temperament they are 12 semitones apart, so with constant t: 440 = 220 * t ** 12, i.e. t = pow(440 / 220, 1.0/12)
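            # Worked check (illustrative note, not part of the original example): for source 220 Hz
            # and target 440 Hz, log(440 / 220, 2 ** (1 / 12)) = 12, i.e. a shift of one octave up.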
            n_steps = 0
            if (source_pitch != 0 and target_pitch != 0):
                n_steps = math.log(target_pitch / source_pitch, pow(2, 1.0 / 12))
            new_frame = librosa.effects.pitch_shift(y=numpy.hanning(len(wav[i:i + length])) * wav[i:i + length], sr=fs,
                                                    n_steps=n_steps)
            output[i:i + length] += new_frame

    # Reverb effect; it does not sound very good.
    # fx = (
    #     AudioEffectsChain()
    #     .highshelf()
    #     .reverb()
    #     # .phaser()
    #     .lowshelf()
    # )
    # output = fx(output)

    # Write the output to a file
    output_path = '{media_root}/pitch/{file_name}.wav'
    output_path = output_path.format(media_root=media_root, file_name=file_name)
    librosa.output.write_wav(output_path, output, fs)
Example #7
def preprocessing(ii):
    fname = nameNsize[ii][0]
    line = nameNsize[ii][1]
    phone = []
    for item in line.split(' '):
        temp = G2P(item)
        phone += temp.split()
        if item.find('!') != -1:
            phone += '!'
        elif item.find('?') != -1:
            phone += '?'
        elif item.find('.') != -1:
            phone += '.'
        elif item.find(',') != -1:
            phone += ','
        phone += ' '
    text = phone[:-1] + ['E']
    dic = [
        'P', '.', '!', '?', ',', 'k0', 'kk', 'nn', 't0', 'tt', 'rr', 'mm',
        'p0', 'pp', 's0', 'ss', 'oh', 'c0', 'cc', 'ch', 'kh', 'th', 'ph', 'h0',
        'aa', 'qq', 'ya', 'yq', 'vv', 'ee', 'yv', 'ye', 'oo', 'wa', 'wq', 'wo',
        'yo', 'uu', 'wv', 'we', 'wi', 'yu', 'xx', 'xi', 'ii', '', 'kf', 'ks',
        'nf', 'nc', 'ng', 'nh', 'tf', 'll', 'lk', 'lm', 'lb', 'ls', 'lt', 'lp',
        'lh', 'mf', 'pf', 'ps', ' ', 'E'
    ]
    char2idx = {ch: idx for idx, ch in enumerate(dic)}
    emblen = len(char2idx)
    txt = np.asarray([char2idx[ch] for ch in text])
    audio, sr = librosa.load('../kss/{}'.format(fname), sr=22050)
    audio, index = librosa.effects.trim(audio,
                                        top_db=43,
                                        frame_length=256,
                                        hop_length=64)
    audioobj = basic.SignalObj(audio, sr)
    pitch = pYAAPT.yaapt(
        audioobj, **{
            'f0_min': 100.0,
            'frame_length': 1000 * 1024 // 22050,
            'frame_space': 1000 * 256 // 22050
        })
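    # Illustrative note (not part of the original example): frame_length 1000 * 1024 // 22050 = 46 ms
    # and frame_space 1000 * 256 // 22050 = 11 ms mirror the 1024-sample window and 256-sample hop
    # used for the STFT below at sr=22050, so the pitch track and spectrogram frames stay aligned.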
    pitch = pitch.samp_values
    stft = np.abs(
        librosa.stft(audio, n_fft=1024, hop_length=256, win_length=1024))
    stft = np.power(stft / np.max(stft), 0.6)
    mel_filters = librosa.filters.mel(22050, 1024, 80)
    mel = np.dot(mel_filters, stft)
    mel = np.power(mel / np.max(mel), 0.6)
    mel = mel[:, ::4]
    pitch = pitch[::4]
    length = np.min([np.shape(mel)[1], np.shape(stft)[1] // 4])
    mel = mel[:, :length]
    stft = stft[:, :length * 4]
    pitch = pitch[:length]
    np.save('data/sample_{}.npy'.format(str(ii).zfill(5)),
            (txt, len(txt), mel, stft, mel.shape[1], pitch))
    print('\r{}saved'.format(ii), end='')
Example #8
def getf0_viaYAAPT(music_loc):

    signal = basic.SignalObj(music_loc)
    pitch = pYAAPT.yaapt(signal, **{'f0_max': 2600.0})

    pitch_interp = pitch.samp_interp
    pitch_interp = lpf(pitch_interp, 15, 100)
    pitch_interp = medfilt(pitch_interp, 45)

    return pitch_interp.tolist()
Example #9
def extract_pitch(path):
    """
    Method to extract pitch values and energy
    :param path: path to the audio file
    :return: pitch values and pitch energy, averaged over number of frames
    """
    signal = basic.SignalObj(path)
    pitch = pYAAPT.yaapt(signal)
    avg_pitch = sum(map(np.array, pitch.samp_values)) / pitch.nframes
    avg_pitch_energy = sum(map(np.array, pitch.energy)) / pitch.nframes
    return avg_pitch, avg_pitch_energy
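# Illustrative call of extract_pitch above; the file name is a placeholder assumption.
mean_f0, mean_energy = extract_pitch('speech_sample.wav')
print('mean F0: {:.1f} Hz'.format(float(mean_f0)))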
Example #10
def convert_wav_to_f0_ascii(fname, fs, directory=''):
    print('convert', fname)
    # Create the signal object.
    signal = basic.SignalObj(fname)
    # Get time interval and num_samples
    t_start = 0.0
    num_samples = signal.size
    t_end = num_samples / signal.fs
    t = np.linspace(t_start, t_end, num_samples)
    # Create the pitch object and calculate its attributes.
    pitch = pyaapt.yaapt(signal)
    # Create .f0_ascii file and dump f0 to it
    output_fname = directory+os.path.splitext(os.path.basename(fname))[0]+'.f0_ascii'
    with open(output_fname, 'w') as f:
        for i in range(pitch.nframes):
            f0 = pitch.samp_values[i]
            vu = 1.0 if pitch.vuv[i] else 0.0
            fe = pitch.energy[i] * pitch.mean_energy
            line = '{} {} {} {}\n'.format(f0, vu, fe, vu)
            f.write(line)
    step = int(signal.fs / fs)
    output_f0 = pitch.values_interp[0:signal.size:step]
    return (os.path.splitext(os.path.basename(fname))[0], output_f0)
Example #11
def get_f0(audio, rate=16000):
    try:
        import amfm_decompy.basic_tools as basic
        import amfm_decompy.pYAAPT as pYAAPT
        from librosa.util import normalize
    except ImportError:
        raise "Please install amfm_decompy (`pip install AMFM-decompy`) and librosa (`pip install librosa`)."

    assert audio.ndim == 1
    frame_length = 20.0  # ms
    to_pad = int(frame_length / 1000 * rate) // 2

    audio = normalize(audio) * 0.95
    audio = np.pad(audio, (to_pad, to_pad), "constant", constant_values=0)
    audio = basic.SignalObj(audio, rate)
    pitch = pYAAPT.yaapt(
        audio,
        frame_length=frame_length,
        frame_space=F0_FRAME_SPACE * 1000,
        nccf_thresh1=0.25,
        tda_frame_length=25.0,
    )
    f0 = pitch.samp_values
    return f0
Example #12
def get_pitch_decompy_int(wav):
    signal = basic.SignalObj(wav)
    pitch = pYAAPT.yaapt(signal)
    return remove_silence(pitch.samp_interp)
Example #13
speakers = ['awb','bdl','clb','jmk','ksp','rms','slt']
root = os.getcwd()
folderpath = os.path.join(root,'datasets',speakers[0],'wav')
files = sorted(os.listdir(folderpath))


# Read the files
for file in files:
    file = os.path.join(folderpath,file)
    fs,audio = wavread(file)
    break
# IPython.display.Audio(file)


# YAAPT pitches
signal = basic.SignalObj(file)
pitchY = pYAAPT.yaapt(signal, frame_length=25, frame_space=5, f0_min=40, f0_max=300)

plt.plot(pitchY.values_interp, label='YAAPT', color='blue')
plt.xlabel('samples')
plt.ylabel('pitch (Hz)')

#
Example #14
def yaapt(signal, **kwargs):

    # Rename the YAAPT v4.0 parameter "frame_lengtht" to "tda_frame_length"
    # (if provided).
    if 'frame_lengtht' in kwargs:
        if 'tda_frame_length' in kwargs:
            warning_str = 'WARNING: Both "tda_frame_length" and "frame_lengtht" '
            warning_str += 'refer to the same parameter. Therefore, the value '
            warning_str += 'of "frame_lengtht" is going to be discarded.'
            print(warning_str)
        else:
            kwargs['tda_frame_length'] = kwargs.pop('frame_lengtht')

    #---------------------------------------------------------------
    # Set the default values for the parameters.
    #---------------------------------------------------------------
    parameters = {}
    parameters['frame_length'] = kwargs.get(
        'frame_length', 35.0)  #Length of each analysis frame (ms)
    # WARNING: In the original MATLAB YAAPT 4.0 code the next parameter is called
    # "frame_lengtht" which is quite similar to the previous one "frame_length".
    # Therefore, I've decided to rename it to "tda_frame_length" in order to
    # avoid confusion between them. Nevertheless, both inputs ("frame_lengtht"
    # and "tda_frame_length") are accepted when the function is called.
    parameters['tda_frame_length'] = \
                              kwargs.get('tda_frame_length', 35.0)  #Frame length employed in the time domain analysis (ms)
    parameters['frame_space'] = kwargs.get(
        'frame_space', 10.0)  #Spacing between analysis frames (ms)
    parameters['f0_min'] = kwargs.get('f0_min',
                                      60.0)  #Minimum F0 searched (Hz)
    parameters['f0_max'] = kwargs.get('f0_max',
                                      400.0)  #Maximum F0 searched (Hz)
    parameters['fft_length'] = kwargs.get('fft_length', 8192)  #FFT length
    parameters['bp_forder'] = kwargs.get('bp_forder',
                                         150)  #Order of band-pass filter
    parameters['bp_low'] = kwargs.get(
        'bp_low', 50.0)  #Low frequency of filter passband (Hz)
    parameters['bp_high'] = kwargs.get(
        'bp_high', 1500.0)  #High frequency of filter passband (Hz)
    parameters['nlfer_thresh1'] = kwargs.get(
        'nlfer_thresh1', 0.75)  #NLFER boundary for voiced/unvoiced decisions
    parameters['nlfer_thresh2'] = kwargs.get(
        'nlfer_thresh2', 0.1)  #Threshold for NLFER definitely unvoiced
    parameters['shc_numharms'] = kwargs.get(
        'shc_numharms', 3)  #Number of harmonics in SHC calculation
    parameters['shc_window'] = kwargs.get('shc_window',
                                          40.0)  #SHC window length (Hz)
    parameters['shc_maxpeaks'] = kwargs.get(
        'shc_maxpeaks', 4)  #Maximum number of SHC peaks to be found
    parameters['shc_pwidth'] = kwargs.get(
        'shc_pwidth', 50.0)  #Window width in SHC peak picking (Hz)
    parameters['shc_thresh1'] = kwargs.get(
        'shc_thresh1', 5.0)  #Threshold 1 for SHC peak picking
    parameters['shc_thresh2'] = kwargs.get(
        'shc_thresh2', 1.25)  #Threshold 2 for SHC peak picking
    parameters['f0_double'] = kwargs.get(
        'f0_double', 150.0)  #F0 doubling decision threshold (Hz)
    parameters['f0_half'] = kwargs.get(
        'f0_half', 150.0)  #F0 halving decision threshold (Hz)
    parameters['dp5_k1'] = kwargs.get('dp5_k1',
                                      11.0)  #Weight used in dynamic program
    parameters['dec_factor'] = kwargs.get('dec_factor',
                                          1)  #Factor for signal resampling
    parameters['nccf_thresh1'] = kwargs.get(
        'nccf_thresh1', 0.3)  #Threshold for considering a peak in NCCF
    parameters['nccf_thresh2'] = kwargs.get(
        'nccf_thresh2', 0.9)  #Threshold for terminating search in NCCF
    parameters['nccf_maxcands'] = kwargs.get(
        'nccf_maxcands', 3)  #Maximum number of candidates found
    parameters['nccf_pwidth'] = kwargs.get(
        'nccf_pwidth', 5)  #Window width in NCCF peak picking
    parameters['merit_boost'] = kwargs.get('merit_boost', 0.20)  #Boost merit
    parameters['merit_pivot'] = kwargs.get(
        'merit_pivot', 0.99)  #Merit assigned to unvoiced candidates in
    #definitely unvoiced frames
    parameters['merit_extra'] = kwargs.get(
        'merit_extra', 0.4)  #Merit assigned to extra candidates
    #in reducing F0 doubling/halving errors
    parameters['median_value'] = kwargs.get('median_value',
                                            7)  #Order of median filter
    parameters['dp_w1'] = kwargs.get(
        'dp_w1', 0.15)  #DP weight factor for V-V transitions
    parameters['dp_w2'] = kwargs.get(
        'dp_w2', 0.5)  #DP weight factor for V-UV or UV-V transitions
    parameters['dp_w3'] = kwargs.get(
        'dp_w3', 0.1)  #DP weight factor of UV-UV transitions
    parameters['dp_w4'] = kwargs.get('dp_w4',
                                     0.9)  #Weight factor for local costs

    #---------------------------------------------------------------
    # Create the signal objects and filter them.
    #---------------------------------------------------------------
    fir_filter = BandpassFilter(signal.fs, parameters)
    nonlinear_sign = basic.SignalObj(signal.data**2, signal.fs)

    signal.filtered_version(fir_filter)
    nonlinear_sign.filtered_version(fir_filter)

    #---------------------------------------------------------------
    # Create the pitch object.
    #---------------------------------------------------------------
    nfft = parameters['fft_length']
    frame_size = int(np.fix(parameters['frame_length'] * signal.fs / 1000))
    frame_jump = int(np.fix(parameters['frame_space'] * signal.fs / 1000))
    pitch = PitchObj(frame_size, frame_jump, nfft)

    assert pitch.frame_size > 15, 'Frame length value {} is too short.'.format(
        pitch.frame_size)
    assert pitch.frame_size < 2048, 'Frame length value {} exceeds the limit.'.format(
        pitch.frame_size)

    #---------------------------------------------------------------
    # Calculate NLFER and determine voiced/unvoiced frames.
    #---------------------------------------------------------------
    nlfer(signal, pitch, parameters)

    #---------------------------------------------------------------
    # Calculate an approximate pitch track from the spectrum.
    #---------------------------------------------------------------
    spec_pitch, pitch_std = spec_track(nonlinear_sign, pitch, parameters)

    #---------------------------------------------------------------
    # Temporal pitch tracking based on NCCF.
    #---------------------------------------------------------------
    time_pitch1, time_merit1 = time_track(signal, spec_pitch, pitch_std, pitch,
                                          parameters)

    time_pitch2, time_merit2 = time_track(nonlinear_sign, spec_pitch,
                                          pitch_std, pitch, parameters)

    # Added in YAAPT 4.0
    if time_pitch1.shape[1] < len(spec_pitch):
        len_time = time_pitch1.shape[1]
        len_spec = len(spec_pitch)
        time_pitch1 = np.concatenate(
            (time_pitch1,
             np.zeros((3, len_spec - len_time), dtype=time_pitch1.dtype)),
            axis=1)
        time_pitch2 = np.concatenate(
            (time_pitch2,
             np.zeros((3, len_spec - len_time), dtype=time_pitch2.dtype)),
            axis=1)
        time_merit1 = np.concatenate(
            (time_merit1,
             np.zeros((3, len_spec - len_time), dtype=time_merit1.dtype)),
            axis=1)
        time_merit2 = np.concatenate(
            (time_merit2,
             np.zeros((3, len_spec - len_time), dtype=time_merit2.dtype)),
            axis=1)

    #---------------------------------------------------------------
    # Refine pitch candidates.
    #---------------------------------------------------------------
    ref_pitch, ref_merit = refine(time_pitch1, time_merit1, time_pitch2,
                                  time_merit2, spec_pitch, pitch, parameters)

    #---------------------------------------------------------------
    # Use dynamic programming to determine the final pitch.
    #---------------------------------------------------------------
    final_pitch = dynamic(ref_pitch, ref_merit, pitch, parameters)

    pitch.set_values(final_pitch, signal.size)

    return pitch
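# Illustrative call of the yaapt() implementation above (the WAV path is a placeholder
# assumption). Any defaulted parameter can be overridden by keyword, and the legacy MATLAB
# name "frame_lengtht" is accepted as an alias for "tda_frame_length".
sig = basic.SignalObj('speech.wav')
trk = yaapt(sig, frame_length=25.0, frame_space=5.0, f0_min=60.0, f0_max=400.0)
trk_alias = yaapt(sig, frame_lengtht=40.0)  # same effect as tda_frame_length=40.0
print(trk.nframes, trk.samp_values.mean())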
Example #15
def voiceRecognition():
    def int_or_str(text):
        """Helper function for argument parsing."""
        try:
            return int(text)
        except ValueError:
            return text

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-l',
                        '--list-devices',
                        action='store_true',
                        help='show list of audio devices and exit')
    parser.add_argument('-d',
                        '--device',
                        type=int_or_str,
                        help='input device (numeric ID or substring)')
    parser.add_argument('-r', '--samplerate', type=int, help='sampling rate')
    parser.add_argument('-c',
                        '--channels',
                        type=int,
                        default=1,
                        help='number of input channels')
    parser.add_argument('filename',
                        nargs='?',
                        metavar='FILENAME',
                        help='audio file to store recording to')
    parser.add_argument('-t',
                        '--subtype',
                        type=str,
                        help='sound file subtype (e.g. "PCM_24")')
    args = parser.parse_args()

    # AUDIO_CAPTURING
    import sounddevice as sd
    import soundfile as sf
    import numpy  # Make sure NumPy is loaded before it is used in the callback
    assert numpy  # avoid "imported but unused" message (W0611)

    if args.list_devices:
        print(sd.query_devices())
        parser.exit(0)
    if args.samplerate is None:
        device_info = sd.query_devices(args.device, 'input')
        # soundfile expects an int, sounddevice provides a float:
        args.samplerate = int(device_info['default_samplerate'])
    if args.filename is None:
        args.filename = tempfile.mktemp(prefix='candidate_recording',
                                        suffix='.wav',
                                        dir='')
    q = queue.Queue()

    def callback(indata, frames, time, status):
        """This is called (from a separate thread) for each audio block."""
        if status:
            print(status, file=sys.stderr)
        q.put(indata.copy())

    # Make sure the file is opened before recording anything:
    with sf.SoundFile(args.filename,
                      mode='x',
                      samplerate=args.samplerate,
                      channels=args.channels,
                      subtype=args.subtype) as file:
        with sd.InputStream(samplerate=args.samplerate,
                            device=args.device,
                            channels=args.channels,
                            callback=callback):
            print('#' * 80)
            print('press q to stop the recording')
            print('#' * 80)
            # while True:
            #     file.write(q.get())
            i = 0
            while True:  # making a loop
                #print(i)
                i = i + 1
                file.write(q.get())
                try:  # used try so that if user pressed other than the given key error will not be shown
                    if keyboard.is_pressed('q'):  # if key 'q' is pressed
                        print('You Pressed A Key!')
                        break  # finishing the loop
                    else:
                        pass
                except:
                    break


# except KeyboardInterrupt:
    print('\nRecording finished: ' + repr(args.filename))
    #Python program to transcribe an Audio file

    AUDIO_FILE = (args.filename)

    # use the audio file as the audio source

    r = sr.Recognizer()

    with sr.AudioFile(AUDIO_FILE) as source:
        #reads the audio file. Here we use record instead of
        #listen
        audio = r.record(source)

    try:
        text = r.recognize_google(audio)
        print("The audio file contains: " + text)

    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")

    except sr.RequestError as e:
        print(
            "Could not request results from Google Speech Recognition service; {0}"
            .format(e))

    # write audio to a WAV file
    with open("microphone-results-11223344.wav", "wb") as f:
        f.write(audio.get_wav_data())

    # write the converted text to a TXT file
    t = open('microphone-results-11223344.txt', 'a')
    t.write(text)
    t.close()

    # PITCH_TRACKING
    signal = basic.SignalObj('microphone-results-11223344.wav')
    pitch = pYAAPT.yaapt(signal)
    #print(pitch.samp_values)
    print(len(pitch.samp_values))

    non_zero_pitch = []
    for i in range(len(pitch.samp_values)):
        if pitch.samp_values[i] > 0:
            non_zero_pitch.append(pitch.samp_values[i])
    print("*****************************")
    #print(non_zero_pitch)
    print(len(non_zero_pitch))

    high = []
    low = []
    for i in range(len(non_zero_pitch)):
        if non_zero_pitch[i] > 255:
            high.append(non_zero_pitch[i])
        elif non_zero_pitch[i] < 85:
            low.append(non_zero_pitch[i])

    avg_pitch = np.mean(non_zero_pitch)
    print("The average pitch value is: ", avg_pitch)

    if 85 <= avg_pitch <= 255:
        print("Appropriate Pitch Maintained", len(high), len(low))

    # GAPS_IN_AUDIO
    AudioSegment.converter = r"C:\\ffmpeg\\bin\\ffmpeg.exe"
    myaudio = AudioSegment.from_wav("microphone-results-11223344.wav")
    silent = silence.detect_silence(myaudio,
                                    min_silence_len=100,
                                    silence_thresh=-40)
    silent = [((start / 1000), (stop / 1000))
              for start, stop in silent]  #convert to sec
    print("************************")
    print(silent)
    silent = np.asarray(silent)
    print(silent)
    print(silent.shape)

    diff = []
    count = 0
    for i in range(len(silent)):
        sub = silent[i][1] - silent[i][0]
        diff.append(sub)

    for i in range(len(diff)):
        if diff[i] > 1.3:
            count += 1

    print("Gaps greater than 1.3 seconds: ", count, " times")

    # POLARITY_CALCULATION
    f = open("microphone-results-11223344.txt", "r")
    if f.mode == 'r':
        contents = f.read()

    blob = TextBlob(contents)
    print("The Polarity of the recorded transcript is: ")
    for sentence in blob.sentences:
        print(sentence.sentiment.polarity)

    # SPEECH_RATE
    num_words = 0
    with open("microphone-results-11223344.txt", 'r') as f:
        for line in f:
            words = line.split()
            num_words += len(words)

    print("Number of words:", num_words)
    data_, sampling_rate_ = librosa.load("microphone-results-11223344.wav",
                                         sr=44100)
    secs = np.size(data_) / sampling_rate_
    print("Audio Length: ", str(secs), " seconds")

    silent_zones = np.sum(diff)
    eff_diff = secs - silent_zones
    print("Effective non-silent time period is: ", eff_diff)

    speech_rate = math.ceil((num_words / eff_diff) * 60)
    print("Speech rate is {} words per minute".format(speech_rate))

    if speech_rate < 110:
        print("Not a good speech rate: ", speech_rate)
    elif speech_rate >= 110 and speech_rate <= 165:
        print("Perfect speech rate: ", speech_rate)
    else:
        print("Very fast, either nervous or too excited: ", speech_rate)

    parser.exit(0)
Example #16
import numpy as np
import pandas as pd
import pitch
import tensorflow as tf
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pYAAPT


def get_tone_digit(tone_list: dict) -> int:
    for key, value in tone_list.items():
        if value == 1:
            return int(key[-1])


f0_df = pd.DataFrame()
for val in range(0, 3108):
    signal = basic.SignalObj(
        '/home/dattilo/Documents/Project/Data Sources/Audio2-0/Audio2-' +
        str(val + 1).zfill(2) + '.wav')
    pitchY = pYAAPT.yaapt(signal,
                          frame_length=40,
                          tda_frame_length=40,
                          f0_min=75,
                          f0_max=600)  # YAAPT pitches
    f0_df = f0_df.append(pd.DataFrame([[val] + pitchY.samp_values.tolist()]))
text_df = pd.read_csv(
    '/home/dattilo/Documents/Project/Data Sources/truyenkieuwordnumber.txt',
    sep=' ',
    names=['index', 'word'])
text_df['tone_2'] = (
    text_df['word'].str.contains('á|é|í|ó|ú|ý|ắ|ấ|ế|ố|ớ|ứ')).astype(int)
text_df['tone_3'] = (
    text_df['word'].str.contains('à|è|ì|ò|ù|ỳ|ằ|ầ|ề|ồ|ờ|ừ')).astype(int)
Example #17
import sys

from schema import Line
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal as sg
from scipy import stats as st

plt.style.use('ggplot')

import amfm_decompy.pYAAPT as pYAAPT
import amfm_decompy.basic_tools as basic

lines = Line.select().where(Line.id << sys.argv[1:])

for line in lines:
    signal = basic.SignalObj(line.audio)
    pitch = pYAAPT.yaapt(signal)

    t = pitch.frames_pos / signal.fs
    p = pitch.samp_interp
    p[p == 0] = np.nan

    fig, (a,b) = plt.subplots(1, 2, sharey=True,
                              num="#%i: " % (line.id) \
                                 + line.text.replace('\n', ' '),
                              figsize=(10,5),
                              )

    #a.plot(t, p, lw=1)
    a.set_ylim(50, 400)
    a.set_xlim(right=np.max(t))
Example #18
import numpy as np
import librosa as lib  # "lib" alias inferred from the lib.load(...) call below
import amfm_decompy.basic_tools as basic
import preprocessing
import feature_extraction as fea
import scipy

path = '..\\..\\boy_and_girl\\class1\\arctic_a0001.wav'
audio_data, sample_rate = lib.load(
    path, sr=None, mono=True, res_type='kaiser_best')  # read the audio file
silence_remove = preprocessing.silence_remove(
    x=audio_data,
    limit=np.max(audio_data) / 20 * 2,
    fs=sample_rate,
    option='HF',
    # pic=savepic + '\\' + 'silence_remove_hilbert_' + str(j)+'_'+str(i))
    pic=None)

signal = basic.SignalObj(silence_remove, sample_rate)

frame_time = 30.0
frame_length = int(0.03 * sample_rate)
frame_overlap = frame_length // 2 + 1
params = {'frame_length': frame_time,
          'tda_frame_length': frame_time,
          'frame_space': frame_time/2,
          'f0_min': 50.0,
          'f0_max': 1000.0,
          'fft_length': 8192,
          'bp_forder': 150,            # band-pass filter order
          'bp_low': 50.0,
          'bp_high': 1500.0,
          'nlfer_thresh1': 0.75,    # 0.75
          'nlfer_thresh2': 0.1,
Example #19
print("f0_to_pac started")
f0_fnames = utils.get_file_list(directory, '.f0_ascii')
with open(directory + 'dump.txt', 'w') as dumpfile:
    for fname in f0_fnames:
        fuj_utils.convert_f0_ascii_to_pac(fname, autofuji_fname, directory)
        # thresh = 0.0001
        # alpha = 2.0
        # args ="{} 0 4 {} auto {}".format(directory+fname, thresh, alpha)
        # subprocess.call(autofuji_fname+" "+args)
        print("{} f0_to_pac completed".format(fname))
print("f0_to_pac completed")

# Declare the variables.
file_name = "Ses01F_script01_1_M035.wav"
# Create the signal object.
signal = basic.SignalObj(directory + file_name)
# Get time interval and num_samples
t_start = 0.0
num_samples = signal.size
t_end = num_samples / signal.fs
t = np.linspace(t_start, t_end, num_samples)
# Create the pitch object and calculate its attributes.
pitch = pyaapt.yaapt(signal)

with open('Ses01F_script01_1_M035.f0_ascii', 'w') as f:
    for i in range(pitch.nframes):
        f0 = pitch.samp_values[i]
        vu = 1.0 if pitch.vuv[i] else 0.0
        fe = pitch.energy[i] * pitch.mean_energy
        line = '{} {} {} {}\n'.format(f0, vu, fe, vu)
        f.write(line)
Example #20
# matplotlib.interactive(True)

# y, sr = librosa.load(librosa.util.example_audio_file(), duration=10)
y, sr = librosa.core.load(audio_filesd[1], sr=44100, duration=10)
print(audio_filesd[0])

D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
# plt.subplot(4, 2, 1)
librosa.display.specshow(D, y_axis='linear')
plt.colorbar(format='%+2.0f dB')
plt.title('Linear-frequency power spectrogram')

# signal = basic.SignalObj('../')

filename = '../wav_temp/1268690716832_48601.wav'
signal = basic.SignalObj('../wav_temp/1268690716832_48601.wav')

downsample = 1
samplerate = 0
win_s = 1764 // downsample  # fft size
hop_s = 441 // downsample  # hop size
s = source(filename, samplerate, hop_s)
samplerate = s.samplerate
tolerance = 0.8
pitch_o = pitch("yin", win_s, hop_s, samplerate)
pitch_o.set_unit("midi")
pitch_o.set_tolerance(tolerance)
pitchesYIN = []
confidences = []
total_frames = 0
while True:
Example #21
def yaapt(signal, **kwargs):

    #---------------------------------------------------------------
    # Set the default values for the parameters.
    #---------------------------------------------------------------
    parameters = {}
    parameters['frame_length'] = kwargs.get('frame_length', 25.0)   #Length of each analysis frame (ms)
    parameters['frame_space'] = kwargs.get('frame_space', 10.0)     #Spacing between analysis frames (ms)
    parameters['f0_min'] = kwargs.get('f0_min', 60.0)               #Minimum F0 searched (Hz)
    parameters['f0_max'] = kwargs.get('f0_max', 400.0)              #Maximum F0 searched (Hz)
    parameters['fft_length'] = kwargs.get('fft_length', 8192)       #FFT length
    parameters['bp_forder'] = kwargs.get('bp_forder', 150)          #Order of band-pass filter
    parameters['bp_low'] = kwargs.get('bp_low', 50.0)               #Low frequency of filter passband (Hz)
    parameters['bp_high'] = kwargs.get('bp_high', 1500.0)           #High frequency of filter passband (Hz)
    parameters['nlfer_thresh1'] = kwargs.get('nlfer_thresh1', 0.75) #NLFER boundary for voiced/unvoiced decisions
    parameters['nlfer_thresh2'] = kwargs.get('nlfer_thresh2', 0.1)  #Threshold for NLFER definitely unvoiced
    parameters['shc_numharms'] = kwargs.get('shc_numharms', 3)      #Number of harmonics in SHC calculation
    parameters['shc_window'] = kwargs.get('shc_window', 40.0)       #SHC window length (Hz)
    parameters['shc_maxpeaks'] = kwargs.get('shc_maxpeaks', 4)      #Maximum number of SHC peaks to be found
    parameters['shc_pwidth'] = kwargs.get('shc_pwidth', 50.0)       #Window width in SHC peak picking (Hz)
    parameters['shc_thresh1'] = kwargs.get('shc_thresh1', 5.0)      #Threshold 1 for SHC peak picking
    parameters['shc_thresh2'] = kwargs.get('shc_thresh2', 1.25)     #Threshold 2 for SHC peak picking
    parameters['f0_double'] = kwargs.get('f0_double', 150.0)        #F0 doubling decision threshold (Hz)
    parameters['f0_half'] = kwargs.get('f0_half', 150.0)            #F0 halving decision threshold (Hz)
    parameters['dp5_k1'] = kwargs.get('dp5_k1', 11.0)               #Weight used in dynamic program
    parameters['dec_factor'] = kwargs.get('dec_factor', 1)          #Factor for signal resampling
    parameters['nccf_thresh1'] = kwargs.get('nccf_thresh1', 0.25)   #Threshold for considering a peak in NCCF
    parameters['nccf_thresh2'] = kwargs.get('nccf_thresh2', 0.9)    #Threshold for terminating search in NCCF
    parameters['nccf_maxcands'] = kwargs.get('nccf_maxcands', 3)    #Maximum number of candidates found
    parameters['nccf_pwidth'] = kwargs.get('nccf_pwidth', 5)        #Window width in NCCF peak picking
    parameters['merit_boost'] = kwargs.get('merit_boost', 0.20)     #Boost merit
    parameters['merit_pivot'] = kwargs.get('merit_pivot', 0.99)     #Merit assigned to unvoiced candidates in
                                                                    #definitely unvoiced frames
    parameters['merit_extra'] = kwargs.get('merit_extra', 0.4)      #Merit assigned to extra candidates
                                                                    #in reducing F0 doubling/halving errors
    parameters['median_value'] = kwargs.get('median_value', 7)      #Order of median filter
    parameters['dp_w1'] = kwargs.get('dp_w1', 0.15)                 #DP weight factor for V-V transitions
    parameters['dp_w2'] = kwargs.get('dp_w2', 0.5)                  #DP weight factor for V-UV or UV-V transitions
    parameters['dp_w3'] = kwargs.get('dp_w3', 0.1)                  #DP weight factor of UV-UV transitions
    parameters['dp_w4'] = kwargs.get('dp_w4', 0.9)                  #Weight factor for local costs

    #---------------------------------------------------------------
    # Create the signal objects and filter them.
    #---------------------------------------------------------------
    fir_filter = BandpassFilter(signal.fs, parameters)
    nonlinear_sign = basic.SignalObj(signal.data**2, signal.fs)

    signal.filtered_version(fir_filter)
    nonlinear_sign.filtered_version(fir_filter)

    #---------------------------------------------------------------
    # Create the pitch object.
    #---------------------------------------------------------------
    nfft = parameters['fft_length']
    frame_size = int(np.fix(parameters['frame_length']*signal.fs/1000))
    frame_jump = int(np.fix(parameters['frame_space']*signal.fs/1000))
    pitch = PitchObj(frame_size, frame_jump, nfft)

    if pitch.frame_size < 15:
        print('Frame length value {} is too short.'.format(pitch.frame_size))
        interrupt_main()
    elif pitch.frame_size > 2048:
        print('Frame length value {} exceeds the limit.'.format(pitch.frame_size))
        interrupt_main()

    #---------------------------------------------------------------
    # Calculate NLFER and determine voiced/unvoiced frames.
    #---------------------------------------------------------------
    nlfer(signal, pitch, parameters)

    #---------------------------------------------------------------
    # Calculate an approximate pitch track from the spectrum.
    #---------------------------------------------------------------
    spec_pitch, pitch_std = spec_track(nonlinear_sign, pitch, parameters)

    #---------------------------------------------------------------
    # Temporal pitch tracking based on NCCF.
    #---------------------------------------------------------------
    time_pitch1, time_merit1 = time_track(signal, spec_pitch, pitch_std, pitch,
                                          parameters)

    time_pitch2, time_merit2 = time_track(nonlinear_sign, spec_pitch, pitch_std,
                                          pitch, parameters)

    #---------------------------------------------------------------
    # Refine pitch candidates.
    #---------------------------------------------------------------
    ref_pitch, ref_merit = refine(time_pitch1, time_merit1, time_pitch2,
                                  time_merit2, spec_pitch, pitch, parameters)

    #---------------------------------------------------------------
    # Use dynamic programming to determine the final pitch.
    #---------------------------------------------------------------
    final_pitch = dynamic(ref_pitch, ref_merit, pitch, parameters)

    pitch.set_values(final_pitch, signal.size)

    return pitch
Example #22
# 11/Mar/2020 Bernardo J.B. Schmitt - [email protected]
import amfm_decompy
import amfm_decompy.pYAAPT as pyaapt
import amfm_decompy.pyQHM as pyqhm
import amfm_decompy.basic_tools as basic
import os.path

# Declare the variables.
file_name = os.path.dirname(amfm_decompy.__file__) + os.sep + "sample.wav"
window_duration = 0.015  # in seconds
nharm_max = 25
SNR = float('Inf')

# Create the signal object.
signal = basic.SignalObj(file_name)

# Create the window object.
window = pyqhm.SampleWindow(window_duration, signal.fs)

# Create the pitch object and calculate its attributes.
pitch = pyaapt.yaapt(signal)

# Set the number of modulated components.
signal.set_nharm(pitch.values, nharm_max)

# Check if gaussian noise has to be added.
if SNR != float('Inf'):
    signal.noiser(pitch.values, SNR)

# Perform the QHM extraction.
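# Hedged completion of the truncated step above, following the pattern of the library's
# sample script; the sample jump (0.001 s), iteration count and phase option are assumptions.
QHM = pyqhm.qhm(signal, pitch, window, 0.001, N_iter=3, phase_tech='phase')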
Example #23
def preparePitchFeatureVector(filename):
    ## Find a better way to get pitches for our audio clip
    ## Function Prototype : pysptk.sptk.rapt(x, fs, hopsize, min=60, max=240, voice_bias=0.0, otype='f0')
    ## The current is removing zeroed (unvoiced) pitches
    ## Still have to understand voiced and unvoiced data from audio
    # pitches = pysptk.sptk.rapt(x=soundData, fs=sampleRate, hopsize=totalFrames, min=60, max=250, voice_bias=0.5, otype="f0")
    # pitches = pitches[np.nonzero(pitches)]
    '''-----------------------------------------------------PITCH FEATURES-----------------------------------------------------'''
    ## Example : pitch = pYAAPT.yaapt(signal, **{'f0_min' : 150.0, 'frame_length' : 15.0, 'frame_space' : 5.0})
    signal = basic.SignalObj(filename)
    pitch = pYAAPT.yaapt(
        signal, **{
            'f0_min': 60.0,
            'f0_max': 360,
            'frame_length': 25.0,
            'frame_space': 10.0
        })
    pitches = pitch.samp_values

    testSampleRate, testSoundData = wav.read(filename)

    getPitchesForEachFrame(testSoundData, testSampleRate)

    ## Getting the voiced part of the sound clip
    boolVoiced = np.array([])

    for i in range(len(pitches)):
        if (pitches[i] == 0):
            boolVoiced = np.append(boolVoiced, 0)
        else:
            boolVoiced = np.append(boolVoiced, 1)

    derivativeOfPitches = np.array([])

    counter = 0
    for i in pitches:
        if counter == 0:
            delta = i
            derivativeOfPitches = np.append(derivativeOfPitches, delta)
        else:
            delta = i - prev
            derivativeOfPitches = np.append(derivativeOfPitches, delta)
        prev = i
        counter += 1

    derivativeOfPitches = np.array(
        np.split(derivativeOfPitches, pitches.shape[0]))

    # print(boolVoiced)

    # sampleRate, soundData = wav.read(filename)
    # plt.subplot(2, 2, 1)
    # plt.plot(soundData)
    # plt.subplot(2, 2, 3)
    # plt.plot(pitch.samp_values)
    # plt.subplot(2, 2, 4)
    # plt.plot(boolVoiced)
    # plt.subplot(2, 2, 2)
    # plt.plot(voicedData(filename))
    # plt.show()
    # print("Pitches frames", pitches.shape)

    pitchFeatureVector = np.array([])

    ## soundData statistics
    ## Features 0 to 4
    pitchFeatureVector = np.append(pitchFeatureVector, np.mean(pitches))
    pitchFeatureVector = np.append(pitchFeatureVector, np.median(pitches))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.max(pitches[np.nonzero(pitches)]))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.min(pitches[np.nonzero(pitches)]))
    pitchFeatureVector = np.append(pitchFeatureVector, np.var(pitches))

    ## soundData Derivative statistics
    ## Features 5 to 9
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.mean(derivativeOfPitches))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.median(derivativeOfPitches))
    pitchFeatureVector = np.append(
        pitchFeatureVector,
        np.max(derivativeOfPitches[np.nonzero(derivativeOfPitches)]))
    pitchFeatureVector = np.append(
        pitchFeatureVector,
        np.min(derivativeOfPitches[np.nonzero(derivativeOfPitches)]))
    pitchFeatureVector = np.append(pitchFeatureVector,
                                   np.var(derivativeOfPitches))

    # print(pitchFeatureVector.shape)
    '''-----------------------------------------------------PITCH FEATURES-----------------------------------------------------'''
    '''-----------------------------------------------------AVERAGE ENERGIES-----------------------------------------------------'''
    sr, sd = wav.read(filename)
    # boolVoiced = voicedData(sd, sr)

    frame = 50  #ms

    ## Finding the frame length for sound corresponding to the time frames
    frame = int(sr * (frame / 1000))

    ## Finding the total number of frames of our sound
    totalFrames = sd.shape[0] / frame
    framesToCut = int(frame * floor(totalFrames))

    ## Padding the overflowing
    valuesToPad = sd.shape[0] - framesToCut
    soundData = np.pad(sd, (0, frame - valuesToPad), 'constant')
    totalFrames = soundData.shape[0] / frame

    soundData = np.array(np.split(soundData, totalFrames))
    # soundData = soundData[:-1]

    ## Average energies of soundData
    ## Have to ratio it according to voiced and unvoiced data
    ## Function prototype :  librosa.feature.rmse(y=None, S=None, frame_length=2048, hop_length=512, center=True, pad_mode='reflect)
    # averageEnergies = np.mean(librosa.feature.rmse(y=soundData, hop_length=int(totalFrames), center=True, pad_mode='reflect').T, axis=0)[0]
    voicedEnergies = np.array([])
    unvoicedEnergies = np.array([])
    '''
    for i in range(boolVoiced.shape[0]):
        if (boolVoiced[i] == 0):
            unvoicedEnergies = np.append(unvoicedEnergies, AFE.stEnergy(soundData[i]))
        else:
            voicedEnergies = np.append(voicedEnergies, AFE.stEnergy(soundData[i]))
 
    ## Feature 10
    # print("voicedEnergies", voicedEnergies)
    voicedEnergies = np.mean(voicedEnergies)
    pitchFeatureVector = np.append(pitchFeatureVector, voicedEnergies)
    ## Feature 11
    unvoicedEnergies = np.mean(unvoicedEnergies)
    # print("unvoicedEnergies", voicedEnergies)
    pitchFeatureVector = np.append(pitchFeatureVector, unvoicedEnergies)
    ## Checking for NaN values
    if (np.isnan(pitchFeatureVector[11])):
        pitchFeatureVector[11] = 0
 
    #-----------------------------------------------------AVERAGE ENERGIES-----------------------------------------------------
 
    #------------------------------------------------------SPEAKING RATE------------------------------------------------------
    ## Speaking rate of soundData (inverse of the average length of the voiced part of an utterance)
    voicedParts = np.array([])
    # print(boolVoiced)
 
    LENGTH = 0
    for i in range(boolVoiced.shape[0]):
        if (boolVoiced[i] == 1):
            LENGTH += 1
        elif (LENGTH > 0 and boolVoiced[i] == 0):
            ## 50 because thats the frame length we are going with.
            voicedParts = np.append(voicedParts, LENGTH*(50/1000))
            LENGTH = 0
     
    if (LENGTH != 0):
        voicedParts = np.append(voicedParts, LENGTH*(50/1000))
        LENGTH = 0
 
 
    # print(voicedParts)
    ## Speaking rate made out to be in words per second
    # print(boolVoiced)     
    speakingRate = 1 / (np.mean(voicedParts))
    # print("Speaking rate :", speakingRate)
    ## Feature 12
    pitchFeatureVector = np.append(pitchFeatureVector, speakingRate)
    #------------------------------------------------------SPEAKING RATE------------------------------------------------------
    '''

    return pitchFeatureVector