示例#1
0
def _get_mfcc_log_spec_and_log_mel_spec(wav, preemphasis_coeff, n_fft, win_length, hop_length):
    """Compute MFCCs, the log magnitude spectrogram and the log mel spectrogram.

    Args:
        wav: 1-D waveform array.
        preemphasis_coeff: pre-emphasis filter coefficient.
        n_fft: FFT size for the STFT.
        win_length: STFT window length in samples.
        hop_length: STFT hop length in samples.

    Returns:
        Tuple (mfccs, log_mag, log_mel), time-major:
        (t, n_mfcc), (t, 1 + n_fft // 2), (t, n_mels).
    """
    # Pre-emphasis
    y_preem = preemphasis(wav, coeff=preemphasis_coeff)

    # Magnitude spectrogram
    D = librosa.stft(y=y_preem, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    mag = np.abs(D)

    # Mel spectrogram.
    # NOTE(review): the filter bank is built with hp_default.n_fft while the STFT
    # above uses the n_fft argument — these must agree or the dot product fails.
    mel_basis = librosa.filters.mel(hp_default.sr, hp_default.n_fft, hp_default.n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, t) # mel spectrogram (dropped the no-op `** 1`)

    # Get mfccs: MFCCs are the DCT of the *dB-scaled* mel spectrogram.
    # The original applied the DCT to the linear `mel` and discarded `db`,
    # which is not the MFCC definition. amplitude_to_db (not power_to_db)
    # matches the magnitude-based mel computed above.
    db = librosa.amplitude_to_db(mel)
    mfccs = np.dot(librosa.filters.dct(hp_default.n_mfcc, db.shape[0]), db)

    # Log compression; epsilon avoids log(0).
    mag = np.log(mag + sys.float_info.epsilon)
    mel = np.log(mel + sys.float_info.epsilon)

    # Normalization
    # self.y_log_spec = (y_log_spec - hp.mean_log_spec) / hp.std_log_spec
    # self.y_log_spec = (y_log_spec - hp.min_log_spec) / (hp.max_log_spec - hp.min_log_spec)

    return mfccs.T, mag.T, mel.T  # (t, n_mfccs), (t, 1+n_fft/2), (t, n_mels)
示例#2
0
def _get_mfcc_log_spec_and_log_mel_spec(wav, preemphasis_coeff, n_fft, win_length, hop_length):
    """Extract MFCCs, log magnitude spectrogram and log mel spectrogram.

    Returns a time-major tuple (mfccs, log_mag, log_mel) with shapes
    (t, n_mfcc), (t, 1 + n_fft // 2), (t, n_mels).
    """
    # Pre-emphasize the waveform before spectral analysis.
    emphasized = preemphasis(wav, coeff=preemphasis_coeff)

    # Magnitude spectrogram of the STFT.
    mag = np.abs(librosa.stft(y=emphasized, n_fft=n_fft, hop_length=hop_length, win_length=win_length))

    # Project onto the mel filter bank: (n_mels, 1+n_fft//2) @ (1+n_fft//2, t).
    filter_bank = librosa.filters.mel(hp_default.sr, hp_default.n_fft, hp_default.n_mels)
    mel = filter_bank @ mag  # (n_mels, t)

    # MFCCs: DCT applied to the dB-scaled mel spectrogram.
    db = librosa.amplitude_to_db(mel)
    mfccs = librosa.filters.dct(hp_default.n_mfcc, db.shape[0]) @ db

    # Log-compress the linear spectrograms; epsilon guards against log(0).
    eps = sys.float_info.epsilon
    mag = np.log(mag + eps)
    mel = np.log(mel + eps)

    # Normalization (intentionally disabled):
    # self.y_log_spec = (y_log_spec - hp.mean_log_spec) / hp.std_log_spec
    # self.y_log_spec = (y_log_spec - hp.min_log_spec) / (hp.max_log_spec - hp.min_log_spec)

    return mfccs.T, mag.T, mel.T  # (t, n_mfccs), (t, 1+n_fft/2), (t, n_mels)
    def generator():
        """Yield (spectrogram.T, emotion_label) pairs for every entry in `data`.

        Closure variables: `data`, `wav_files`, `abbr2emo`, `hp`, `utils`.
        """
        for file_key, emo_abbr in data:
            # Resolve the file key to a path, load and pre-process the audio.
            audio, _ = librosa.load(wav_files[file_key], sr=hp.sr)
            spec = utils.spectrogram(utils.preemphasis(audio))

            # Map the emotion abbreviation to its integer class id.
            label = utils.emo2int[abbr2emo[emo_abbr]]

            yield (spec.T, label)
def listen_for_speech(num_phrases=-1):
    """
    Listens to the microphone, extracts phrases from it and runs emotion
    inference on each phrase. A "phrase" is sound surrounded by silence
    (according to THRESHOLD). num_phrases controls how many phrases to
    process before finishing the listening process (-1 for infinite).

    Args:
        num_phrases: number of phrases to capture before returning;
            -1 (the default, matching the old behavior) listens forever.
            The original docstring promised this knob but the parameter
            was missing, leaving the cleanup code below unreachable.

    Returns:
        list: predicted emotion strings, one per captured phrase.
    """
    infer_model = Model('infer', logdir=None)
    infer_model.load_model('logdir/18-07-02T18-30-25/model.ckpt-444')

    # Open stream
    p = pyaudio.PyAudio()

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("* Listening mic. ")
    audio2send = []
    rel = RATE // CHUNK  # chunks per second
    # Sliding window of recent loudness values; a phrase ends once every
    # value in the window (SILENCE_LIMIT seconds) is below THRESHOLD.
    slid_win = deque(maxlen=int(SILENCE_LIMIT * rel))
    # Prepend audio from PREV_AUDIO seconds before noise was detected.
    prev_audio = deque(maxlen=int(PREV_AUDIO * rel))
    started = False
    n = 0
    response = []

    # try/finally guarantees the stream is released even on KeyboardInterrupt;
    # in the original the cleanup after `while True` was dead code.
    try:
        while num_phrases < 0 or n < num_phrases:
            data = stream.read(CHUNK, exception_on_overflow=False)
            # Loudness estimate of this chunk (sqrt of |mean sample value|).
            slid_win.append(math.sqrt(abs(audioop.avg(data, 4))))
            if any(x > THRESHOLD for x in slid_win):
                if not started:
                    print()
                    print("Starting record of phrase")
                    started = True
                audio2send.append(data)
            elif started is True:
                # The silence limit was reached: finish capture and classify.
                wf = save_speech(list(prev_audio) + audio2send, n, p)

                wav, _ = librosa.load(wf, sr=RATE)
                wav = utils.preemphasis(wav)
                S = utils.spectrogram(wav)
                pred = infer_model.infer(np.array([S.T]))
                emotion = utils.int2emo[pred[0]]
                print(emotion)
                # Collect the prediction — `response` was returned but never
                # populated in the original.
                response.append(emotion)

                started = False
                slid_win.clear()
                prev_audio.clear()
                audio2send = []
                n += 1
                print("Listening ...")
                print()
            else:
                prev_audio.append(data)

        print("* Done recording")
    finally:
        stream.close()
        p.terminate()

    return response