def generate_filter_banks(band_sizes):
    band_sizes = sorted(band_sizes)
    total_samples = band_sizes[-1]
    n_bands = [128] * 5
    n_taps = 256
    current_low_freq = 20

    for i, size in enumerate(band_sizes):
        ratio = (total_samples / size)
        new_sr = zounds.SampleRate(
            sr.frequency * ratio, sr.duration * ratio)

        if size == total_samples:
            freq_band = zounds.FrequencyBand(
                current_low_freq, new_sr.nyquist - 20)
        else:
            freq_band = zounds.FrequencyBand(current_low_freq, new_sr.nyquist)

        bandpass = firwin(
            n_taps,
            [int(new_sr) // 4, (int(new_sr) // 2) - 1],
            fs=int(new_sr),
            pass_zero=False).astype(np.float32)
        bandpass = torch.from_numpy(bandpass).to(device).view(1, 1, n_taps)

        scale = zounds.GeometricScale(
            freq_band.start_hz, freq_band.stop_hz, 0.05, n_bands[i])
        bank = zounds.learn.FilterBank(
            new_sr,
            n_taps,
            scale,
            # values close to zero get good frequency resolution; values
            # close to one get good time resolution
            0.25,
            normalize_filters=False,
            a_weighting=False).to(device)

        current_low_freq = freq_band.stop_hz

        yield bank, bandpass
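
# A minimal usage sketch for generate_filter_banks (illustrative only): the
# band sizes below are hypothetical, and it assumes that `sr`, `device`, and
# the `zounds`/`numpy`/`torch`/`scipy.signal.firwin` names used inside the
# generator are already defined at module level.
import torch
import torch.nn.functional as F

band_sizes = [1024, 2048, 4096, 8192, 16384]

for bank, bandpass in generate_filter_banks(band_sizes):
    # a dummy batch of audio shaped (batch, channels, samples), which is
    # what conv1d expects
    signal = torch.zeros(1, 1, band_sizes[-1], device=device)

    # the yielded bandpass kernel is a (1, 1, n_taps) tensor, so it can be
    # applied directly as a 1d convolution
    filtered = F.conv1d(signal, bandpass, padding=bandpass.shape[-1] // 2)

    # `bank` is an nn.Module (zounds.learn.FilterBank); presumably it is
    # applied to the same kind of tensor to produce a multi-band,
    # spectrogram-like representation

# Because band_sizes is sorted ascending, smaller sizes get lower sample
# rates and cover the lower frequency ranges; successive bands stack upward
# until the full-rate band reaches just below the Nyquist frequency.
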
import featureflow as ff
import numpy as np
import zounds
from torch import nn
from torch import optim
import argparse
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool

samplerate = zounds.SR11025()
BaseModel = zounds.stft(resample_to=samplerate, store_fft=True)

scale = zounds.GeometricScale(
    start_center_hz=300,
    stop_center_hz=3040,
    bandwidth_ratio=0.07496,
    n_bands=64)
scale.ensure_overlap_ratio(0.5)


@zounds.simple_lmdb_settings('speeches', map_size=1e10, user_supplied_id=True)
class Sound(BaseModel):
    """
    An audio processing pipeline that computes a frequency domain
    representation of the sound that follows a geometric scale
    """
    bark = zounds.ArrayWithUnitsFeature(
        zounds.BarkBands,
        samplerate=samplerate,
        stop_freq_hz=samplerate.nyquist,
        needs=BaseModel.fft,
        store=True)

    long_windowed = zounds.ArrayWithUnitsFeature(
start = zounds.Milliseconds(500)
end = start + zounds.Seconds(2)
snippet = sound.weighted[start:end, :]

# grab a subset of frequency information for the duration of the sound
freq_band = slice(zounds.Hertz(400), zounds.Hertz(500))
a440 = sound.mdct[:, freq_band]

# produce a new set of coefficients where only the 440hz sine wave is
# present
filtered = sound.mdct.zeros_like()
filtered[:, freq_band] = a440

# apply a geometric scale, which more closely matches human pitch
# perception, and apply it to the linear frequency axis
scale = zounds.GeometricScale(50, 4000, 0.05, 100)
log_coeffs = scale.apply(sound.mdct, zounds.HanningWindowingFunc())

# reconstruct audio from the MDCT coefficients
mdct_synth = zounds.MDCTSynthesizer()
reconstructed = mdct_synth.synthesize(sound.mdct)
filtered_reconstruction = mdct_synth.synthesize(filtered)

# start an in-browser REPL that will allow you to listen to and visualize
# the variables defined above (and any new ones you create in the session)
app = zounds.ZoundsApp(
    model=Sound,
    audio_feature=Sound.ogg,
    visualization_feature=Sound.weighted,
    globals=globals(),
    locals=locals())
app.start(9999)
""" import numpy as np import zounds from zounds.spectral import apply_scale samplerate = zounds.SR11025() BaseModel = zounds.resampled(resample_to=samplerate, store_resampled=True) scale_bands = 96 spectrogram_duration = 64 anchor_slice = slice(spectrogram_duration, spectrogram_duration * 2) scale = zounds.GeometricScale(start_center_hz=50, stop_center_hz=samplerate.nyquist, bandwidth_ratio=0.115, n_bands=scale_bands) scale.ensure_overlap_ratio() spectrogram_duration = 64 windowing_scheme = zounds.HalfLapped() spectrogram_sample_rate = zounds.SampleRate( frequency=windowing_scheme.frequency * (spectrogram_duration // 2), duration=windowing_scheme.frequency * spectrogram_duration) def spectrogram(x): x = apply_scale(np.abs(x.real), scale, window=zounds.OggVorbisWindowingFunc())
See section 3.3 Setting MDCT Sizes for information about what we're
fudging/glossing over in this implementation.  We instead use the DCT2
transform, which makes inversion easier, at the cost of more redundancy.
"""

from __future__ import division

import zounds
import scipy

samplerate = zounds.SR11025()
BaseModel = zounds.stft(resample_to=samplerate)

windowing_func = zounds.OggVorbisWindowingFunc()
scale = zounds.GeometricScale(300, 3030, 0.05, 100)


@zounds.simple_in_memory_settings
class Document(BaseModel):
    bark = zounds.ArrayWithUnitsFeature(
        zounds.BarkBands,
        samplerate=samplerate,
        stop_freq_hz=samplerate.nyquist,
        needs=BaseModel.fft,
        store=True)

    long_windowed = zounds.ArrayWithUnitsFeature(
        zounds.SlidingWindow,
        wscheme=zounds.SampleRate(
            frequency=zounds.Milliseconds(500),
            duration=zounds.Seconds(1)),
        wfunc=windowing_func,
@classmethod
def from_audio(cls, samples, samplerate):
    coeffs = cls.batch_stft(samples)
    mag = np.abs(coeffs)
    coeffs = cls._embed(mag)
    coeffs = coeffs.transpose((0, 2, 1))
    coeffs = np.log(coeffs + 1e-12)
    coeffs = cls._postprocess_coeffs(coeffs)
    return cls(coeffs, samplerate)


sr = zounds.SR11025()
n_bands = 256

mel_scale = zounds.MelScale(
    zounds.FrequencyBand(20, sr.nyquist - 20), n_bands)
geom_scale = zounds.GeometricScale(20, sr.nyquist - 20, 0.05, n_bands)
linear_scale = zounds.LinearScale(zounds.FrequencyBand(0, sr.nyquist), 513)

mel_scale_basis = mel_scale._basis(
    linear_scale, zounds.HanningWindowingFunc())
geom_scale_basis = geom_scale._basis(
    linear_scale, zounds.HanningWindowingFunc())


class MelScalePhaseRecover(BasePhaseRecovery):
    basis = mel_scale_basis

    def __init__(self, data, samplerate):
        super().__init__(data, samplerate)


class GeometricScalePhaseRecover(BasePhaseRecovery):
    basis = geom_scale_basis
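
# A rough usage sketch (illustrative only): it assumes BasePhaseRecovery,
# defined elsewhere, provides the batch_stft, _embed, and
# _postprocess_coeffs helpers referenced in from_audio above; the
# (batch, samples) layout of `samples` is a guess.
import numpy as np
import zounds

sr = zounds.SR11025()

# one second of noise as a stand-in for real audio
samples = np.random.normal(0, 1, (1, int(sr))).astype(np.float32)

mel_repr = MelScalePhaseRecover.from_audio(samples, sr)
geom_repr = GeometricScalePhaseRecover.from_audio(samples, sr)
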