def demo_human_cochleagram(signal=None, sr=None, n=None):
  """Demo that generates human cochleagrams, displaying various nonlinearity
  and downsampling options. If a signal is not provided, a tone synthesized
  from 40 harmonics with an f0 of 100 Hz will be used.

  Args:
    signal (array, optional): Signal containing waveform data.
    sr (int, optional): Sampling rate of the input signal.
    n (int, optional): Number of filters to use in the filterbank.

  Returns:
    None
  """
  # get a signal if one isn't provided
  if signal is None:
    signal, signal_params = make_harmonic_stack()
    sr = signal_params['sr']
    n = signal_params['n']
  else:
    assert sr is not None
    assert n is not None

  ### Demo Cochleagram Generation with Predefined Nonlinearities ###
  # no nonlinearity
  coch = demo_human_cochleagram_helper(signal, sr, n, nonlinearity=None)
  # convert to decibels
  coch_log = demo_human_cochleagram_helper(signal, sr, n, nonlinearity='db')
  # 3/10 power compression
  coch_pow = demo_human_cochleagram_helper(signal, sr, n, nonlinearity='power')

  plt.subplot(321)
  plt.title('Signal waveform')
  plt.plot(signal)
  plt.ylabel('amplitude')
  plt.xlabel('time')

  plt.subplot(323)
  plt.title('Signal Frequency Content')
  f, Pxx_den = welch(signal.flatten(), sr, nperseg=1024)
  plt.semilogy(f, Pxx_den)
  plt.xlabel('frequency [Hz]')
  plt.ylabel('PSD [V**2/Hz]')

  plt.subplot(322)
  plt.title('Cochleagram with no nonlinearity')
  plt.ylabel('filter #')
  plt.xlabel('time')
  utils.cochshow(np.flipud(coch), interact=False)
  plt.gca().invert_yaxis()

  plt.subplot(324)
  plt.title('Cochleagram with nonlinearity: "db"')
  plt.ylabel('filter #')
  plt.xlabel('time')
  utils.cochshow(np.flipud(coch_log), interact=False)
  plt.gca().invert_yaxis()

  plt.subplot(326)
  plt.title('Cochleagram with nonlinearity: "power"')
  plt.ylabel('filter #')
  plt.xlabel('time')
  utils.cochshow(np.flipud(coch_pow), interact=False)
  plt.gca().invert_yaxis()
  plt.tight_layout()

  ### Demo Cochleagram Generation with Downsampling ###
  plt.figure()
  # no downsampling
  # cochd = demo_human_cochleagram_helper(signal, sr, n, downsample=None)
  # predefined polyphase resampling with upsample factor = 10000,
  # downsample factor = `sr`
  cochd_poly = demo_human_cochleagram_helper(signal, sr, n, downsample=10000)
  # custom downsampling function using decimate with a downsampling factor of 2
  custom_downsample_fx = lambda x: decimate(x, 2, axis=1, ftype='fir',
                                            zero_phase=True)
  cochd_decimate = demo_human_cochleagram_helper(signal, sr, n,
                                                 downsample=custom_downsample_fx)

  plt.subplot(221)
  plt.title('Signal waveform')
  plt.plot(signal)
  plt.ylabel('amplitude')
  plt.xlabel('time')

  plt.subplot(223)
  plt.title('Signal Frequency Content')
  f, Pxx_den = welch(signal.flatten(), sr, nperseg=1024)
  plt.semilogy(f, Pxx_den)
  plt.xlabel('frequency [Hz]')
  plt.ylabel('PSD [V**2/Hz]')

  plt.subplot(222)
  plt.title('Cochleagram with 2x default\n(polyphase) downsampling')
  plt.ylabel('filter #')
  plt.xlabel('time')
  utils.cochshow(np.flipud(cochd_poly), interact=False)
  plt.gca().invert_yaxis()

  plt.subplot(224)
  plt.title('Cochleagram with 2x custom\n(decimate) downsampling')
  plt.ylabel('filter #')
  plt.xlabel('time')
  utils.cochshow(np.flipud(cochd_decimate), interact=False)
  plt.gca().invert_yaxis()

  plt.tight_layout()
  plt.show()
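# Usage sketch (not part of the original demos): one way to call
# demo_human_cochleagram with your own signal. The 440 Hz tone, sample rate,
# and filter count below are illustrative assumptions; the sample rate is
# chosen high enough that the default hi_lim of 20000 Hz stays below sr/2.
def _sketch_demo_human_cochleagram():
  import numpy as np
  sr = 44100
  t = np.arange(sr) / sr  # 1 s of time samples
  signal = np.sin(2 * np.pi * 440 * t)  # a plain 440 Hz tone
  demo_human_cochleagram(signal, sr=sr, n=38)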
def evaluate_cochleogram(model, n_gamma_model, n_gama_coh, low_lim, hi_lim,
                         sample_rate, eval_filename, librosa_filename, fr_len,
                         synthesis_file):
  """Evaluate a trained model against FFT-based cochleagrams of a test file,
  plot the comparisons, and write the re-synthesized audio to
  `synthesis_file`."""
  # put the model in evaluation mode
  model.eval()
  eval_filename_spect_only = eval_filename + '_spect_only.png'
  eval_filename_all = eval_filename + '_time_spect.png'

  audio, sr = librosa.load('got_s2e9_cake.wav')
  #audio, sr = librosa.load('abba.wav')
  duration = librosa.get_duration(y=audio, sr=sr)
  n_samples = audio.size  # renamed from `len` to avoid shadowing the builtin
  nb_frames = n_samples // fr_len
  new_array = np.resize(audio, [nb_frames, fr_len])

  # frame-based cochleagram: one cochleagram per frame, stitched along time
  coch_pow_batches = cgram.cochleagram(signal=new_array, sr=sample_rate,
                                       n=n_gama_coh, low_lim=low_lim,
                                       hi_lim=hi_lim, sample_factor=2,
                                       padding_size=None, downsample=None,
                                       nonlinearity='power', fft_mode='auto',
                                       ret_mode='envs', strict=False)
  coch_pow = coch_pow_batches[0, :, :]
  for fr in range(1, coch_pow_batches.shape[0]):
    coch_pow = np.append(coch_pow, coch_pow_batches[fr, :, :], axis=1)

  # reference cochleagram computed over the whole signal at once
  coch_pow_all_at_once = cgram.cochleagram(signal=audio, sr=sample_rate,
                                           n=n_gama_coh, low_lim=low_lim,
                                           hi_lim=hi_lim, sample_factor=2,
                                           padding_size=None, downsample=None,
                                           nonlinearity='power',
                                           fft_mode='auto', ret_mode='envs',
                                           strict=False)
  analytic_subband_signal, env_sb = cgram.cochleagram(
      signal=audio, sr=sample_rate, n=n_gama_coh, low_lim=low_lim,
      hi_lim=hi_lim, sample_factor=2, padding_size=None, downsample=None,
      nonlinearity=None, fft_mode='auto', ret_mode='analytic', strict=False)
  new_env = cgram.apply_envelope_nonlinearity(analytic_subband_signal,
                                              nonlinearity='power')

  img = np.flipud(coch_pow)  # the cochleagram is upside down (i.e., in image coordinates)
  signal = audio

  plt.figure(figsize=(8, 5))
  plt.subplot(221)
  plt.title('Input Time Signal')
  plt.plot(signal)
  plt.ylabel('Amplitude')
  plt.xlabel('Time (Samples)')

  plt.subplot(222)
  plt.title('Cochleagram (FFT-based)')
  plt.ylabel('Filter Nb')
  plt.xlabel('Time (Samples)')
  utils.cochshow(np.flipud(coch_pow_all_at_once), interact=False)
  plt.gca().invert_yaxis()
  plt.tight_layout()

  # convert to tensor to call the model
  # ===================================
  # `device` is assumed to be defined at module level (e.g. torch.device(...))
  audio_or = audio
  audio = torch.FloatTensor(audio)
  audio = audio.unsqueeze(0)
  audio = audio.to(device)
  (output_real, output_imag, output_eval, power_frames_eval,
   recovered_signal) = model(audio)
  p_frames = power_frames_eval.cpu().data.numpy()

  plt.subplot(224)
  plt.title('Cochleagram (Model Output)')
  plt.ylabel('Filter Nb')
  plt.xlabel('Time (Samples)')
  utils.cochshow(np.flipud(p_frames), interact=False)
  plt.gca().invert_yaxis()
  plt.tight_layout()

  syn_frames = recovered_signal.cpu().data.numpy()
  write(synthesis_file, rate=sample_rate, data=syn_frames)

  plt.subplot(223)
  plt.title('Re-Synthesized Signal')
  plt.plot(syn_frames)
  plt.ylabel('Amplitude')
  plt.xlabel('Time (Samples)')
  plt.savefig(eval_filename_all)

  plt.figure(figsize=(10, 4))
  plt.subplot(211)
  plt.title('Cochleagram (FFT-based)')
  plt.ylabel('Filter Nb')
  plt.xlabel('Time (Samples)')
  utils.cochshow(np.flipud(coch_pow_all_at_once), interact=False)
  plt.gca().invert_yaxis()
  plt.tight_layout()

  plt.subplot(212)
  plt.title('Cochleagram (Model Output)')
  plt.ylabel('Filter Nb')
  plt.xlabel('Time (Samples)')
  utils.cochshow(np.flipud(p_frames), interact=False)
  plt.gca().invert_yaxis()
  plt.tight_layout()
  plt.savefig(eval_filename_spect_only)

  # mean squared error between the model output and the FFT-based references
  cmp_len = min(p_frames.shape[1], coch_pow_all_at_once.shape[1])
  coc_power_err = np.mean(
      (p_frames[:, :cmp_len] - coch_pow_all_at_once[:, :cmp_len])**2.0)
  print(f' Difference between cochleagrams (all at once) = {coc_power_err}')

  cmp_len = min(p_frames.shape[1], coch_pow.shape[1])
  coc_power_err2 = np.mean(
      (p_frames[:, :cmp_len] - coch_pow[:, :cmp_len])**2.0)
  print(f' Difference between cochleagrams (frame based) = {coc_power_err2}')

  cmp_len = min(syn_frames.shape[0], audio_or.shape[0])
  print(f' shape {syn_frames.shape}, {audio_or.shape}')
  reconst_time_err = np.mean((syn_frames[:cmp_len] - audio_or[:cmp_len])**2.0)
  print(f' reconstruction error = {reconst_time_err}')

  plt.figure(figsize=(10, 4))
  plt.subplot(1, 2, 1)
  utils.cochshow(coch_pow, interact=False)
  plt.colorbar()
  plt.title('pycochleagram - Power')

  # plot the model output cochleagram
  plt.subplot(1, 2, 2)
  utils.cochshow(p_frames, interact=False)
  plt.colorbar()
  plt.title('Model Out')
  plt.savefig(librosa_filename)
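# Sketch (assumptions labeled): the frame-based cochleagram pattern from
# evaluate_cochleogram in isolation, without the model or plotting. The
# white-noise input and all parameter values are illustrative, not from the
# original.
def _sketch_frame_based_cochleagram(sr=16000, fr_len=512, n=20):
  import numpy as np
  from pycochleagram import cochleagram as cgram
  audio = np.random.uniform(-1, 1, sr)  # 1 s of noise as a stand-in signal
  frames = np.resize(audio, (audio.size // fr_len, fr_len))
  batched = cgram.cochleagram(signal=frames, sr=sr, n=n, low_lim=50,
                              hi_lim=sr // 2, sample_factor=2,
                              padding_size=None, downsample=None,
                              nonlinearity='power', fft_mode='auto',
                              ret_mode='envs', strict=False)
  # one np.concatenate along the time axis replaces the repeated np.append
  return np.concatenate(list(batched), axis=1)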
def demo_invert_cochleagram(signal=None, sr=None, n=None, playback=False):
  """Demo that will generate a cochleagram from a signal, then invert this
  cochleagram to produce a waveform signal.

  Args:
    signal (array, optional): Signal containing waveform data.
    sr (int, optional): Sampling rate of the input signal.
    n (int, optional): Number of filters to use in the filterbank.
    playback (bool, optional): Determines if audio signals will be played
      (using pyaudio). If False, only plots will be created. If True, the
      original signal and inverted cochleagram signal will be played. NOTE:
      Be careful with the volume when using playback, things can get
      *very loud*.

  Returns:
    None
  """
  # get a signal if one isn't provided
  if signal is None:
    signal, signal_params = make_harmonic_stack()
    sr = signal_params['sr']
    n = signal_params['n']
    low_lim = signal_params['low_lim']
    hi_lim = signal_params['hi_lim']
  else:
    assert sr is not None
    assert n is not None
    low_lim = 50  # this is the default for cochleagram.human_cochleagram
    hi_lim = 20000  # this is the default for cochleagram.human_cochleagram

  # generate a cochleagram from the signal
  sample_factor = 2  # this is the default for cochleagram.human_cochleagram
  coch = demo_human_cochleagram_helper(signal, sr, n,
                                       sample_factor=sample_factor)
  print('Generated cochleagram with shape: ', coch.shape)

  # invert the cochleagram to get a signal
  coch = np.flipud(coch)  # the output of demo_human_cochleagram_helper is flipped
  inv_coch_sig, inv_coch = cgram.invert_cochleagram(coch, sr, n, low_lim,
                                                    hi_lim, sample_factor,
                                                    n_iter=10, strict=False)
  print('Generated inverted cochleagram')
  print('Original signal shape: %s, Inverted cochleagram signal shape: %s' %
        (signal.shape, inv_coch_sig.shape))

  plt.subplot(211)
  plt.title('Cochleagram of original signal')
  utils.cochshow(coch, interact=False)  # this signal is already flipped
  plt.ylabel('filter #')
  plt.xlabel('time')
  plt.gca().invert_yaxis()

  plt.subplot(212)
  plt.title('Cochleagram of inverted signal')
  utils.cochshow(inv_coch, interact=False)  # this signal needs to be flipped
  plt.ylabel('filter #')
  plt.xlabel('time')
  plt.gca().invert_yaxis()
  plt.tight_layout()
  plt.show()

  if playback:
    print('playing original signal...')
    utils.play_array(signal, pyaudio_params={'rate': sr}, ignore_warning=True)
    sleep(1)
    print('playing inverted cochleagram signal...')
    utils.play_array(inv_coch_sig, pyaudio_params={'rate': sr},
                     ignore_warning=True)
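# Sketch (assumptions labeled): a direct cochleagram -> invert_cochleagram
# round trip without the plotting and playback above, mirroring the
# invert_cochleagram call used in demo_invert_cochleagram. The tone and all
# filterbank settings are illustrative; hi_lim is kept at sr/2 so the
# filterbank remains valid at this sample rate.
def _sketch_cochleagram_round_trip():
  import numpy as np
  from pycochleagram import cochleagram as cgram
  sr, n, low_lim, hi_lim, sample_factor = 16000, 38, 50, 8000, 2
  t = np.arange(sr) / sr
  signal = np.sin(2 * np.pi * 440 * t)  # 1 s, 440 Hz tone
  coch = cgram.human_cochleagram(signal, sr, n=n, low_lim=low_lim,
                                 hi_lim=hi_lim, sample_factor=sample_factor,
                                 downsample=None, nonlinearity=None,
                                 strict=False)
  inv_signal, inv_coch = cgram.invert_cochleagram(coch, sr, n, low_lim,
                                                  hi_lim, sample_factor,
                                                  n_iter=10, strict=False)
  return inv_signal, inv_coch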
def main(ignore_playback_warning=False, mode='rand_sound'):
  """Run all demo functions.

  Args:
    ignore_playback_warning (bool, optional): To use audio playback, you must
      acknowledge that things can get *very loud* by setting
      `ignore_playback_warning` to True.
    mode ({'rand_sound', 'batch', 'naive_batch', 'one_frame', other}): Set the
      mode for the demo. If this is 'rand_sound', a sound from the demo_stim/
      directory will be chosen at random and used for the demos. 'batch' and
      'naive_batch' time batched vs. one-at-a-time cochleagram generation on a
      precomputed speech array and return early. 'one_frame' uses a single
      512-sample random frame. If this is anything else, a harmonic stack of
      40 harmonics and an f0=100Hz will be generated and used.

  Returns:
    None
  """
  mode = mode.lower()
  from os.path import dirname, join, realpath
  DEMO_PATH = join(dirname(realpath(__file__)), 'demo_stim')
  nb_filters = 20

  if mode == 'rand_sound':
    rfn = choice([
        os.path.join(DEMO_PATH, f) for f in os.listdir(DEMO_PATH)
        if f.endswith('.wav')
    ])
    print(os.listdir(DEMO_PATH))
    # debug override of the random choice above (may pick a non-wav file):
    # rfn = [os.path.join(DEMO_PATH, f) for f in os.listdir(DEMO_PATH)][2]
    print('Running demo with sound file: %s ' % rfn)
    demo_stim, demo_sr = utils.wav_to_array(rfn)
    demo_n = nb_filters  # default filter for low_lim=50 hi_lim=20000
  elif mode == 'batch':
    demo_stim = np.load('demo_stim/wavs_speech_n10_2s_16k.npy')
    demo_sr = 16000
    demo_n = nb_filters  # default filter for low_lim=50 hi_lim=20000
    start_time = time()
    demo_human_cochleagram_helper(demo_stim, demo_sr, demo_n, downsample=200,
                                  nonlinearity='power')
    total_time = time() - start_time
    # the batch file holds 10 stimuli, hence the division by 10
    print('Improved Batch --> %s, %ss per coch' %
          (total_time, total_time / 10))
    return
  elif mode == 'naive_batch':
    demo_stim = np.load('demo_stim/wavs_speech_n10_2s_16k.npy')
    demo_sr = 16000
    demo_n = nb_filters  # default filter for low_lim=50 hi_lim=20000
    start_time = time()
    for i in range(demo_stim.shape[0]):
      # print('%s/%s' % (i+1, demo_stim.shape[0]))
      temp_signal = demo_stim[i]
      demo_human_cochleagram_helper(temp_signal, demo_sr, demo_n,
                                    downsample=200, nonlinearity='power')
    total_time = time() - start_time
    print('Naive Batch --> %s, %ss per coch' % (total_time, total_time / 10))
    return
  elif mode == 'one_frame':
    demo_stim = np.random.uniform(-1, 1, (1, 512))
    demo_sr = 16000
    demo_n = nb_filters
    print(f'shape of demo stim = {demo_stim.shape}')
  else:
    # NOTE: the code below assumes `demo_stim` is an array; the None fallback
    # only works with demo_human_cochleagram(), which is commented out.
    demo_stim, demo_sr, demo_n = None, None, None

  print('\n### DEMO: COCHLEAGRAM GENERATION ###')
  print('====================================')
  # demo_human_cochleagram(demo_stim, demo_sr, demo_n)

  # call cochleagram directly with the correct parameters.
  #coch_pow = cgram.human_cochleagram(signal=demo_stim, sr=demo_sr, n=demo_n, sample_factor=2, downsample=None, nonlinearity='power', strict=False)
  n_samples = demo_stim.size  # renamed from `len` to avoid shadowing the builtin
  fr_len = 512
  nb_frames = n_samples // fr_len
  segment = demo_stim[0:fr_len]
  new_array = np.resize(demo_stim, [nb_frames, fr_len])
  coch_pow_batches = cgram.cochleagram(signal=new_array, sr=demo_sr, n=demo_n,
                                       low_lim=50, hi_lim=demo_sr // 2,
                                       sample_factor=2, padding_size=None,
                                       downsample=None, nonlinearity='power',
                                       fft_mode='auto', ret_mode='envs',
                                       strict=False)
  #print(f'size of coh = {coch_pow_batches.shape}, first dimension batches = {coch_pow_batches.shape[0]}')
  coch_pow = coch_pow_batches[0, :, :]
  #print(f'before: size of coch_pow = {coch_pow.shape}')
  for fr in range(1, coch_pow_batches.shape[0]):
    coch_pow = np.append(coch_pow, coch_pow_batches[fr, :, :], axis=1)
  #print(f'size of coch_pow = {coch_pow.shape}')
  wait = input('PRESS ENTER TO CONTINUE.')

  real_subband_signal = cgram.cochleagram(signal=demo_stim, sr=demo_sr,
                                          n=demo_n, low_lim=50,
                                          hi_lim=demo_sr // 2,
                                          sample_factor=2, padding_size=None,
                                          downsample=None, nonlinearity=None,
                                          fft_mode='auto', ret_mode='subband',
                                          strict=False)
  analytic_subband_signal = cgram.cochleagram(signal=demo_stim, sr=demo_sr,
                                              n=demo_n, low_lim=50,
                                              hi_lim=demo_sr // 2,
                                              sample_factor=2,
                                              padding_size=None,
                                              downsample=None,
                                              nonlinearity=None,
                                              fft_mode='auto',
                                              ret_mode='analytic',
                                              strict=False)
  new_env = cgram.apply_envelope_nonlinearity(analytic_subband_signal,
                                              nonlinearity='power')
  print(f'size of real subband signal = {real_subband_signal.shape}')
  print(f'size of analytic subband signal = {analytic_subband_signal.shape}, '
        f'is real = {np.isrealobj(analytic_subband_signal)}')
  print(f'size of new env = {new_env.shape}')

  img = np.flipud(coch_pow)  # the cochleagram is upside down (i.e., in image coordinates)
  signal = demo_stim
  sr = demo_sr

  plt.figure()
  plt.subplot(321)
  plt.title('Signal waveform')
  plt.plot(signal)
  plt.ylabel('amplitude')
  plt.xlabel('time')

  plt.subplot(323)
  plt.title('Signal Frequency Content')
  f, Pxx_den = welch(signal.flatten(), sr, nperseg=1024)
  plt.semilogy(f, Pxx_den)
  plt.xlabel('frequency [Hz]')
  plt.ylabel('PSD [V**2/Hz]')

  plt.subplot(326)
  plt.title('Cochleagram with nonlinearity: "power"')
  plt.ylabel('filter #')
  plt.xlabel('time')
  print(f' shape of coch_pow is {coch_pow.shape}')
  utils.cochshow(np.flipud(coch_pow), interact=False)
  plt.gca().invert_yaxis()
  plt.tight_layout()

  plt.figure()
  plt.subplot(321)
  plt.title('subband 5, real signal')
  plt.plot(real_subband_signal[5])
  plt.subplot(322)
  plt.title('subband 5, analytic real signal')
  plt.plot(analytic_subband_signal[5].real)
  plt.subplot(323)
  plt.title('subband 5, analytic imag signal')
  plt.plot(analytic_subband_signal[5].imag)
  plt.subplot(324)
  plt.title('subband 5, env of signal')
  plt.plot(new_env[5])
  plt.show()

  print('\n### DEMO: AUDIO PLAYBACK ###')
  print('============================')
  #demo_playback(demo_stim, demo_sr, ignore_warning=ignore_playback_warning)

  print('\n### DEMO: COCHLEAGRAM INVERSION ###\n')
  print('===================================')
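# Entry-point sketch (assumed; the original section does not show how main()
# is invoked). The chosen mode is illustrative.
if __name__ == '__main__':
  main(ignore_playback_warning=False, mode='one_frame')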