Example #1
# Imports assumed by the demo snippets below (following pycochleagram's
# demo.py); make_harmonic_stack and demo_human_cochleagram_helper are helper
# functions defined elsewhere in that demo module.
import os
from random import choice
from time import sleep, time

import librosa
import matplotlib.pyplot as plt
import numpy as np
import torch
from scipy.io.wavfile import write
from scipy.signal import decimate, welch

from pycochleagram import cochleagram as cgram
from pycochleagram import utils

# torch device used by the model-evaluation example (assumed module-level)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def demo_human_cochleagram(signal=None, sr=None, n=None):
    """Demo to generate the human cochleagrams, displaying various nonlinearity
  and downsampling options. If a signal is not provided, a tone synthesized
  with 40 harmonics and an f0=100 will be used.

  Args:
    signal (array, optional): Signal containing waveform data.
    sr (int, optional): Sampling rate of the input signal.
    n (int, optional): Number of filters to use in the filterbank.

  Returns:
    None
  """
    # get a signal if one isn't provided
    if signal is None:
        signal, signal_params = make_harmonic_stack()
        sr = signal_params['sr']
        n = signal_params['n']
    else:
        assert sr is not None
        assert n is not None

    ### Demo Cochleagram Generation with Predefined Nonlinearities ###
    # no nonlinearity
    coch = demo_human_cochleagram_helper(signal, sr, n, nonlinearity=None)
    # convert to decibel
    coch_log = demo_human_cochleagram_helper(signal, sr, n, nonlinearity='db')
    # 3/10 power compression
    coch_pow = demo_human_cochleagram_helper(signal,
                                             sr,
                                             n,
                                             nonlinearity='power')

    plt.subplot(321)
    plt.title('Signal waveform')
    plt.plot(signal)
    plt.ylabel('amplitude')
    plt.xlabel('time')

    plt.subplot(323)
    plt.title('Signal Frequency Content')
    f, Pxx_den = welch(signal.flatten(), sr, nperseg=1024)
    plt.semilogy(f, Pxx_den)
    plt.xlabel('frequency [Hz]')
    plt.ylabel('PSD [V**2/Hz]')

    plt.subplot(322)
    plt.title('Cochleagram with no nonlinearity')
    plt.ylabel('filter #')
    plt.xlabel('time')
    utils.cochshow(np.flipud(coch), interact=False)
    plt.gca().invert_yaxis()

    plt.subplot(324)
    plt.title('Cochleagram with nonlinearity: "db"')
    plt.ylabel('filter #')
    plt.xlabel('time')
    utils.cochshow(np.flipud(coch_log), interact=False)
    plt.gca().invert_yaxis()

    plt.subplot(326)
    plt.title('Cochleagram with nonlinearity: "power"')
    plt.ylabel('filter #')
    plt.xlabel('time')
    utils.cochshow(np.flipud(coch_pow), interact=False)
    plt.gca().invert_yaxis()
    plt.tight_layout()

    ### Demo Cochleagram Generation with Downsampling ###
    plt.figure()
    # no downsampling
    # cochd = demo_human_cochleagram_helper(signal, sr, n, downsample=None)
    # predefined polyphase resampling with upsample factor = 10000, downsample factor = `sr`
    cochd_poly = demo_human_cochleagram_helper(signal, sr, n, downsample=10000)
    # custom downsampling function to use decimate with a downsampling factor of 2
    custom_downsample_fx = lambda x: decimate(
        x, 2, axis=1, ftype='fir', zero_phase=True)
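    # (decimating by q=2 halves the envelope sampling rate; zero_phase=True
    # uses forward-backward filtering, so the envelopes are not time-shifted)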
    cochd_decimate = demo_human_cochleagram_helper(
        signal, sr, n, downsample=custom_downsample_fx)

    plt.subplot(221)
    plt.title('Signal waveform')
    plt.plot(signal)
    plt.ylabel('amplitude')
    plt.xlabel('time')

    plt.subplot(223)
    plt.title('Signal Frequency Content')
    f, Pxx_den = welch(signal.flatten(), sr, nperseg=1024)
    plt.semilogy(f, Pxx_den)
    plt.xlabel('frequency [Hz]')
    plt.ylabel('PSD [V**2/Hz]')

    plt.subplot(222)
    plt.title('Cochleagram with 2x default\n(polyphase) downsampling')
    plt.ylabel('filter #')
    plt.xlabel('time')
    utils.cochshow(np.flipud(cochd_poly), interact=False)
    plt.gca().invert_yaxis()

    plt.subplot(224)
    plt.title('Cochleagram with 2x custom\n(decimate) downsampling')
    plt.ylabel('filter #')
    plt.xlabel('time')
    utils.cochshow(np.flipud(cochd_decimate), interact=False)
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
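For orientation, the two predefined nonlinearities used above are simple pointwise compressions of the subband envelopes. Below is a minimal plain-NumPy sketch of both; the exact floor and scaling constants are assumptions for illustration, not pycochleagram's guaranteed behavior.

def db_nonlinearity_sketch(envs, floor_db=-60.0):
    # envelopes in dB relative to the maximum, clipped at floor_db
    # (the -60 dB floor is an assumption)
    envs_db = 20.0 * np.log10(envs / np.max(envs) + 1e-12)  # eps avoids log(0)
    return np.maximum(envs_db, floor_db)


def power_nonlinearity_sketch(envs, exponent=3.0 / 10.0):
    # 3/10 power compression, per the comment in the demo above
    return envs ** exponent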
Example #2
def evaluate_cochleogram(model, n_gamma_model, n_gama_coh, low_lim, hi_lim,
                         sample_rate, eval_filename, librosa_filename, fr_len,
                         synthesis_file):
    # switch the model to evaluation (inference) mode
    model.eval()

    eval_filename_spect_only = eval_filename + '_spect_only.png'
    eval_filename_all = eval_filename + '_time_spect.png'

    # resample on load so sr matches the cochleagram's sample_rate
    audio, sr = librosa.load('got_s2e9_cake.wav', sr=sample_rate)
    #audio, sr = librosa.load('abba.wav', sr=sample_rate)
    duration = librosa.get_duration(y=audio, sr=sr)

    n_samples = audio.size
    nb_frames = n_samples // fr_len
    new_array = np.resize(audio, [nb_frames, fr_len])
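    # nb_frames * fr_len <= audio.size, so np.resize just truncates the tail
    # (equivalent to audio[:nb_frames * fr_len].reshape(nb_frames, fr_len))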

    coch_pow_batches = cgram.cochleagram(signal=new_array,
                                         sr=sample_rate,
                                         n=n_gama_coh,
                                         low_lim=low_lim,
                                         hi_lim=hi_lim,
                                         sample_factor=2,
                                         padding_size=None,
                                         downsample=None,
                                         nonlinearity='power',
                                         fft_mode='auto',
                                         ret_mode='envs',
                                         strict=False)

    # stitch the per-frame cochleagrams back together along the time axis
    coch_pow = np.concatenate(coch_pow_batches, axis=1)

    coch_pow_all_at_once = cgram.cochleagram(signal=audio,
                                             sr=sample_rate,
                                             n=n_gama_coh,
                                             low_lim=low_lim,
                                             hi_lim=hi_lim,
                                             sample_factor=2,
                                             padding_size=None,
                                             downsample=None,
                                             nonlinearity='power',
                                             fft_mode='auto',
                                             ret_mode='envs',
                                             strict=False)

    analytic_subband_signal = cgram.cochleagram(signal=audio,
                                                sr=sample_rate,
                                                n=n_gama_coh,
                                                low_lim=low_lim,
                                                hi_lim=hi_lim,
                                                sample_factor=2,
                                                padding_size=None,
                                                downsample=None,
                                                nonlinearity=None,
                                                fft_mode='auto',
                                                ret_mode='analytic',
                                                strict=False)

    # the envelope is the magnitude of the analytic signal, so apply the
    # compressive nonlinearity to np.abs(...) rather than the complex subbands
    new_env = cgram.apply_envelope_nonlinearity(np.abs(analytic_subband_signal),
                                                nonlinearity='power')

    img = np.flipud(
        coch_pow
    )  # the cochleagram is upside down (i.e., in image coordinates)

    signal = audio

    plt.figure(figsize=(8, 5))
    plt.subplot(221)
    plt.title('Input Time Signal')
    plt.plot(signal)
    plt.ylabel('Amplitude')
    plt.xlabel('Time (Samples)')

    plt.subplot(222)
    plt.title('Cochleagram (FFT-based)')
    plt.ylabel('Filter Nb')
    plt.xlabel('Time (Samples)')
    utils.cochshow(np.flipud(coch_pow_all_at_once), interact=False)
    plt.gca().invert_yaxis()
    plt.tight_layout()

    # convert to tensor to call model
    # ===============================
    audio_or = audio
    audio = torch.FloatTensor(audio)
    audio = audio.unsqueeze(0)
    audio = audio.to(device)

    output_real, output_imag, output_eval, power_frames_eval, recovered_signal = model(
        audio)

    p_frames = power_frames_eval.cpu().data.numpy()

    plt.subplot(224)
    plt.title('Cochleagram (Model Output)')
    plt.ylabel('Filter Nb')
    plt.xlabel('Time (Samples)')
    utils.cochshow(np.flipud(p_frames), interact=False)
    plt.gca().invert_yaxis()
    plt.tight_layout()

    syn_frames = recovered_signal.cpu().data.numpy()
    write(synthesis_file, rate=sample_rate, data=syn_frames)

    plt.subplot(223)
    plt.title('Re-Synthesized Signal')
    plt.plot(syn_frames)
    plt.ylabel('Amplitude')
    plt.xlabel('Time (Samples)')

    plt.savefig(eval_filename_all)

    plt.figure(figsize=(10, 4))
    plt.subplot(211)
    plt.title('Cochleagram (FFT-based)')
    plt.ylabel('Filter Nb')
    plt.xlabel('Time (Samples)')
    utils.cochshow(np.flipud(coch_pow_all_at_once), interact=False)
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.subplot(212)
    plt.title('Cochleagram (Model Output)')
    plt.ylabel('Filter Nb')
    plt.xlabel('Time (Samples)')
    utils.cochshow(np.flipud(p_frames), interact=False)
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig(eval_filename_spect_only)

    n = min(p_frames.shape[1], coch_pow_all_at_once.shape[1])
    coc_power_err = np.mean(
        (p_frames[:, :n] - coch_pow_all_at_once[:, :n])**2.0)
    print(f' MSE between model output and all-at-once cochleagram = {coc_power_err}')

    n = min(p_frames.shape[1], coch_pow.shape[1])
    coc_power_err2 = np.mean((p_frames[:, :n] - coch_pow[:, :n])**2.0)
    print(f' MSE between model output and frame-based cochleagram = {coc_power_err2}')

    n = min(syn_frames.shape[0], audio_or.shape[0])
    print(f' shapes: synthesized {syn_frames.shape}, original {audio_or.shape}')
    reconst_time_err = np.mean((syn_frames[:n] - audio_or[:n])**2.0)
    print(f' time-domain reconstruction error (MSE) = {reconst_time_err}')

    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    utils.cochshow((coch_pow), interact=False)
    plt.colorbar()
    plt.title('pycochleagram - Power')
    # plot the model-output cochleagram alongside
    plt.subplot(1, 2, 2)
    utils.cochshow((p_frames), interact=False)
    plt.colorbar()
    plt.title('Model output')
    plt.savefig(librosa_filename)
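Example #2 condenses the frame-based vs. all-at-once comparison into a single MSE number. Since each frame is filtered independently, the disagreement tends to concentrate near frame boundaries; the hypothetical helper below (a sketch reusing coch_pow, coch_pow_all_at_once, and fr_len from above) plots where the error actually lives.

def plot_framewise_error(coch_frames, coch_full, fr_len):
    # per-sample squared error, averaged across filterbank channels
    n = min(coch_frames.shape[1], coch_full.shape[1])
    err = np.mean((coch_frames[:, :n] - coch_full[:, :n]) ** 2.0, axis=0)
    plt.figure()
    plt.plot(err)
    for b in range(fr_len, n, fr_len):  # mark the frame boundaries
        plt.axvline(b, color='r', alpha=0.3)
    plt.xlabel('Time (Samples)')
    plt.ylabel('MSE across channels')
    plt.show()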
Example #3
def demo_invert_cochleagram(signal=None, sr=None, n=None, playback=False):
    """Demo that will generate a cochleagram from a signal, then invert this
  cochleagram to produce a waveform signal.

  Args:
    signal (array, optional): Signal containing waveform data.
    sr (int, optional): Sampling rate of the input signal.
    n (int, optional): Number of filters to use in the filterbank.
    playback (bool, optional): Determines if audio signals will be played
      (using pyaudio). If False, only plots will be created. If True, the
      original signal and inverted cochleagram signal will be played. NOTE:
      Be careful with the volume when using playback, things can get
      *very loud*.

  Returns:
    None
  """
    # get a signal if one isn't provided
    if signal is None:
        signal, signal_params = make_harmonic_stack()
        sr = signal_params['sr']
        n = signal_params['n']
        low_lim = signal_params['low_lim']
        hi_lim = signal_params['hi_lim']
    else:
        assert sr is not None
        assert n is not None
        low_lim = 50  # this is the default for cochleagram.human_cochleagram
        hi_lim = 20000  # this is the default for cochleagram.human_cochleagram

    # generate a cochleagram from the signal
    sample_factor = 2  # this is the default for cochleagram.human_cochleagram
    coch = demo_human_cochleagram_helper(signal,
                                         sr,
                                         n,
                                         sample_factor=sample_factor)
    print('Generated cochleagram with shape: ', coch.shape)

    # invert the cochleagram to get a signal
    coch = np.flipud(
        coch)  # the output of demo_human_cochleagram_helper is flipped
    inv_coch_sig, inv_coch = cgram.invert_cochleagram(coch,
                                                      sr,
                                                      n,
                                                      low_lim,
                                                      hi_lim,
                                                      sample_factor,
                                                      n_iter=10,
                                                      strict=False)

    print('Generated inverted cochleagram')
    print('Original signal shape: %s, Inverted cochleagram signal shape: %s' %
          (signal.shape, inv_coch_sig.shape))

    plt.subplot(211)
    plt.title('Cochleagram of original signal')
    utils.cochshow(coch, interact=False)  # this signal is already flipped
    plt.ylabel('filter #')
    plt.xlabel('time')
    plt.gca().invert_yaxis()

    plt.subplot(212)
    plt.title('Cochleagram of inverted signal')
    utils.cochshow(inv_coch, interact=False)  # this signal needs to be flipped
    plt.ylabel('filter #')
    plt.xlabel('time')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    if playback:
        print('playing original signal...')
        utils.play_array(signal,
                         pyaudio_params={'rate': sr},
                         ignore_warning=True)
        sleep(1)
        print('playing inverted cochleagram signal...')
        utils.play_array(inv_coch_sig,
                         pyaudio_params={'rate': sr},
                         ignore_warning=True)
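The demo above only reports shapes for the inverted signal. A small numeric sanity check that could follow it is sketched below: trim both waveforms to a common length and compute their cosine similarity, which is less sensitive than raw MSE to any overall scale difference in the inverted signal.

def inversion_similarity(signal, inv_signal):
    # cosine similarity of the two waveforms over their common length
    n = min(signal.size, inv_signal.size)
    a = np.ravel(signal)[:n]
    b = np.ravel(inv_signal)[:n]
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))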
Example #4
def main(ignore_playback_warning=False, mode='rand_sound'):
    """Run all demo functions.

  Args:
    ignore_playback_warning (bool, optional): To use audio playback, you must
      acknowledge that things can get *very loud* by setting
      `ignore_playback_warning` to True.
    mode ({'rand_sound', other}): Set the mode for the demo. If this is
      'rand_sound', a sound from the demo_stim/ directory will be chosen
      at random and used for the demos. If this is anything else, a harmonic
      stack of 40 harmonics and an f0=100Hz will be generated and used.

  Returns:
    None
  """
    mode = mode.lower()
    from os.path import dirname, join, realpath
    DEMO_PATH = join(dirname(realpath(__file__)), 'demo_stim')

    nb_filters = 20
    if mode == 'rand_sound':
        rfn = choice([
            os.path.join(DEMO_PATH, f) for f in os.listdir(DEMO_PATH)
            if f.endswith('.wav')
        ])
        print('Running demo with sound file: %s ' % rfn)
        demo_stim, demo_sr = utils.wav_to_array(rfn)
        demo_n = nb_filters  # number of filters (for low_lim=50, hi_lim=20000)
    elif mode == 'batch':
        demo_stim = np.load('demo_stim/wavs_speech_n10_2s_16k.npy')
        demo_sr = 16000
        demo_n = nb_filters  # number of filters (for low_lim=50, hi_lim=20000)
        start_time = time()
        demo_human_cochleagram_helper(demo_stim,
                                      demo_sr,
                                      demo_n,
                                      downsample=200,
                                      nonlinearity='power')
        total_time = time() - start_time
        print('Improved Batch --> %s, %ss per coch' %
              (total_time, total_time / demo_stim.shape[0]))
        return
    elif mode == 'naive_batch':
        demo_stim = np.load('demo_stim/wavs_speech_n10_2s_16k.npy')
        demo_sr = 16000
        demo_n = nb_filters  # number of filters (for low_lim=50, hi_lim=20000)
        start_time = time()
        for i in range(demo_stim.shape[0]):
            # print('%s/%s' % (i+1, demo_stim.shape[0]))
            temp_signal = demo_stim[i]
            demo_human_cochleagram_helper(temp_signal,
                                          demo_sr,
                                          demo_n,
                                          downsample=200,
                                          nonlinearity='power')
        total_time = time() - start_time
        print('Naive Batch --> %s, %ss per coch' %
              (total_time, total_time / demo_stim.shape[0]))
        return

    elif mode == 'one_frame':
        demo_stim = np.random.uniform(-1, 1, (1, 512))
        demo_sr = 16000
        demo_n = nb_filters
        print(f'shape of demo stim = {demo_stim.shape}')
    else:
        # fall back to the synthesized harmonic stack described in the docstring
        demo_stim, signal_params = make_harmonic_stack()
        demo_sr = signal_params['sr']
        demo_n = signal_params['n']

    print('\n### DEMO: COCHLEAGRAM GENERATION ###')
    print('====================================')
    # demo_human_cochleagram(demo_stim, demo_sr, demo_n)
    # call the cochleagram directly with the correct parameters.

    #coch_pow = cgram.human_cochleagram(signal=demo_stim, sr=demo_sr, n=demo_n, sample_factor=2, downsample=None, nonlinearity='power', strict=False)

    n_samples = demo_stim.size
    fr_len = 512

    nb_frames = n_samples // fr_len
    new_array = np.resize(demo_stim, [nb_frames, fr_len])
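    # as in Example #2, np.resize truncates the tail and reshapes the signal
    # into (nb_frames, fr_len) frames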

    coch_pow_batches = cgram.cochleagram(signal=new_array,
                                         sr=demo_sr,
                                         n=demo_n,
                                         low_lim=50,
                                         hi_lim=demo_sr // 2,
                                         sample_factor=2,
                                         padding_size=None,
                                         downsample=None,
                                         nonlinearity='power',
                                         fft_mode='auto',
                                         ret_mode='envs',
                                         strict=False)
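    # with batched input of shape (nb_frames, fr_len), the returned
    # cochleagram has shape (nb_frames, channels, time); each frame is
    # filtered independently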

    #print(f'size of coh  = {coch_pow_batches.shape}, first dimension batches = {coch_pow_batches.shape[0]} ')

    # stitch the per-frame cochleagrams back together along the time axis
    coch_pow = np.concatenate(coch_pow_batches, axis=1)
    #print(f'size of coch_pow = {coch_pow.shape}')
    wait = input("PRESS ENTER TO CONTINUE.")

    real_subband_signal = cgram.cochleagram(signal=demo_stim,
                                            sr=demo_sr,
                                            n=demo_n,
                                            low_lim=50,
                                            hi_lim=demo_sr // 2,
                                            sample_factor=2,
                                            padding_size=None,
                                            downsample=None,
                                            nonlinearity=None,
                                            fft_mode='auto',
                                            ret_mode='subband',
                                            strict=False)

    analytic_subband_signal = cgram.cochleagram(signal=demo_stim,
                                                sr=demo_sr,
                                                n=demo_n,
                                                low_lim=50,
                                                hi_lim=demo_sr // 2,
                                                sample_factor=2,
                                                padding_size=None,
                                                downsample=None,
                                                nonlinearity=None,
                                                fft_mode='auto',
                                                ret_mode='analytic',
                                                strict=False)

    # the envelope is the magnitude of the analytic signal, so apply the
    # compressive nonlinearity to np.abs(...) rather than the complex subbands
    new_env = cgram.apply_envelope_nonlinearity(np.abs(analytic_subband_signal),
                                                nonlinearity='power')

    print(f'size of real subband signal = {real_subband_signal.shape}')

    print(
        f'size of analytic subband signal = {analytic_subband_signal.shape}, is real = {np.isrealobj(analytic_subband_signal)}'
    )

    print(f'size of new env = {new_env.shape}')

    img = np.flipud(
        coch_pow
    )  # the cochleagram is upside down (i.e., in image coordinates)

    signal = demo_stim
    sr = demo_sr

    plt.figure()
    plt.subplot(321)
    plt.title('Signal waveform')
    plt.plot(signal)
    plt.ylabel('amplitude')
    plt.xlabel('time')

    plt.subplot(323)
    plt.title('Signal Frequency Content')
    f, Pxx_den = welch(signal.flatten(), sr, nperseg=1024)
    plt.semilogy(f, Pxx_den)
    plt.xlabel('frequency [Hz]')
    plt.ylabel('PSD [V**2/Hz]')

    plt.subplot(326)
    plt.title('Cochleagram with nonlinearity: "power"')
    plt.ylabel('filter #')
    plt.xlabel('time')
    print(f' shape of coch_pow is {coch_pow.shape}')
    utils.cochshow(np.flipud(coch_pow), interact=False)
    plt.gca().invert_yaxis()
    plt.tight_layout()

    plt.figure()
    plt.subplot(321)
    plt.title('subband 5: real subband signal')
    plt.plot(real_subband_signal[5])

    plt.subplot(322)
    plt.title('subband 5: analytic signal (real part)')
    plt.plot(analytic_subband_signal[5].real)

    plt.subplot(323)
    plt.title('subband 5: analytic signal (imaginary part)')
    plt.plot(analytic_subband_signal[5].imag)

    plt.subplot(324)
    plt.title('subband 5: compressed envelope')
    plt.plot(new_env[5])

    plt.show()

    print('\n### DEMO: AUDIO PLAYBACK ###')
    print('============================')
    #demo_playback(demo_stim, demo_sr, ignore_warning=ignore_playback_warning)

    print('\n### DEMO: COCHLEAGRAM INVERSION ###\n')
    print('===================================')