def __init__(self):
    """Configure an experiment built around learned filter banks.

    A ``ResidualStackFilterBankGenerator`` is paired with a
    ``FilterBankDiscriminator``.  Each receives its own
    ``zounds.learn.FilterBank`` built from identical settings, and both
    are handed to the base-class constructor together with the loss
    functions, weight initialisers and feature functions listed below.
    """
    n_mels = 128
    feature_size = 32
    total_samples = 8192
    sr = zounds.SR22050()

    # Settings shared by the generator's and the discriminator's bank.
    n_filters = 128
    filter_taps = 511
    freq_band = zounds.FrequencyBand(20, sr.nyquist - 20)

    def build_filter_bank():
        # A fresh scale and a fresh bank on every call, so the generator
        # and the discriminator never share a FilterBank instance.
        scale = zounds.LinearScale(freq_band, n_filters)
        return zounds.learn.FilterBank(
            sr,
            filter_taps,
            scale,
            0.9,
            normalize_filters=True,
            a_weighting=False)

    gen_filter_bank = build_filter_bank()
    disc_filter_bank = build_filter_bank()

    super().__init__(
        generator=ResidualStackFilterBankGenerator(
            gen_filter_bank,
            feature_size,
            total_samples,
            n_mels,
            add_weight_norm=True),
        discriminator=FilterBankDiscriminator(
            disc_filter_bank,
            total_samples,
            conditioning_channels=n_mels),
        learning_rate=1e-4,
        feature_size=feature_size,
        audio_repr_class=RawAudio,
        generator_loss=mel_gan_gen_loss,
        sub_gen_loss=least_squares_generator_loss,
        discriminator_loss=mel_gan_disc_loss,
        sub_disc_loss=least_squares_disc_loss,
        g_init=weights_init,
        d_init=weights_init,
        feature_funcs={
            'audio': (audio, (sr, )),
            'spectrogram': (spectrogram, (sr, ))
        },
        total_samples=total_samples,
        feature_channels=n_mels,
        samplerate=sr,
        inference_sequence_factor=4)
def make_filter_bank(cls, samplerate):
    """Return a 128-band, 511-tap linear filter bank for ``samplerate``.

    The bands are laid out on a ``zounds.LinearScale`` running from 20 up
    to 20 below ``samplerate.nyquist``.

    Args:
        samplerate: object exposing a ``nyquist`` attribute; it is also
            forwarded unchanged to ``zounds.learn.FilterBank``.

    Returns:
        The newly constructed ``zounds.learn.FilterBank``.
    """
    band = zounds.FrequencyBand(20, samplerate.nyquist - 20)
    return zounds.learn.FilterBank(
        samplerate,
        511,
        zounds.LinearScale(band, 128),
        0.9,
        normalize_filters=True,
        a_weighting=False)
def make_filter_banks(taps, bands, sr, size, full_band_size=8192):
    """Build one filter bank per ``(tap, band)`` pair, keyed by size.

    Iteration starts at ``size`` / ``sr``.  After each bank is built,
    ``size`` is floor-halved and ``sr`` is replaced by ``sr * 2`` for the
    next pair.

    Args:
        taps: iterable of tap counts, one per bank.
        bands: iterable of band counts, one per bank.  It is zipped with
            ``taps``, so the shorter of the two decides how many banks
            are built.
        sr: starting samplerate object; must expose ``nyquist`` and
            support multiplication by an int.
        size: dictionary key for the first bank.
        full_band_size: the size whose bank starts at 0 instead of at
            half of nyquist.  Defaults to 8192, the value that used to be
            hard-coded here.

    Returns:
        dict mapping each visited size to its ``zounds.learn.FilterBank``.
    """
    out = {}
    for tap, band in zip(taps, bands):
        # Only the bank at ``full_band_size`` spans 0..nyquist; every
        # other bank covers just the upper half, nyquist // 2..nyquist.
        start = 0 if size == full_band_size else sr.nyquist // 2
        fb = zounds.FrequencyBand(start, sr.nyquist)
        out[size] = zounds.learn.FilterBank(
            sr,
            tap,
            zounds.LinearScale(fb, band),
            0.05,
            normalize_filters=True,
            a_weighting=False)
        # Diagnostic output kept as-is: callers may rely on seeing it.
        print(size, sr, out[size].scale)

        size = size // 2
        # NOTE(review): presumably ``sr * 2`` yields the samplerate that
        # matches the halved size -- confirm against zounds' SampleRate.
        sr = sr * 2
    return out
def __init__(self):
    """Configure an experiment pairing a DDSP generator with a
    multi-scale, multi-resolution discriminator.

    The generator receives a ``zounds.LinearScale`` of ``n_osc`` bands
    between 20 and 20 below nyquist.  Everything else (losses, weight
    initialisers, feature functions) is forwarded to the base-class
    constructor.
    """
    n_mels = 128
    feature_size = 32
    total_samples = 8192
    samplerate = zounds.SR22050()

    n_osc = 128
    scale = zounds.LinearScale(
        zounds.FrequencyBand(20, samplerate.nyquist - 20),
        n_osc)

    super().__init__(
        generator=DDSPGenerator(
            n_osc,
            feature_size,
            n_mels,
            total_samples,
            scale,
            samplerate),
        discriminator=MultiScaleMultiResDiscriminator(
            total_samples,
            flatten_multiscale_features=False,
            channel_judgements=True,
            conditioning_channels=n_mels,
            decompose=True),
        learning_rate=1e-4,
        feature_size=feature_size,
        audio_repr_class=RawAudio,
        generator_loss=mel_gan_gen_loss,
        sub_gen_loss=least_squares_generator_loss,
        discriminator_loss=mel_gan_disc_loss,
        sub_disc_loss=least_squares_disc_loss,
        g_init=weights_init,
        d_init=weights_init,
        feature_funcs={
            'audio': (audio, (samplerate, )),
            'spectrogram': (spectrogram, (samplerate, ))
        },
        total_samples=total_samples,
        feature_channels=n_mels,
        samplerate=samplerate,
        inference_sequence_factor=4)
def _scale(self, samplerate, bands, zero_start=False):
    """Return a ``zounds.LinearScale`` of ``bands`` bands ending at nyquist.

    Args:
        samplerate: object exposing a ``nyquist`` attribute.
        bands: number of bands handed to ``zounds.LinearScale``.
        zero_start: when true the scale begins at 0; otherwise it begins
            at half of ``samplerate.nyquist``.
    """
    if zero_start:
        low = 0
    else:
        low = samplerate.nyquist / 2
    band = zounds.FrequencyBand(low, samplerate.nyquist)
    return zounds.LinearScale(band, bands)
# NOTE(review): ``from_audio`` is a classmethod whose enclosing class header
# lies outside this chunk; re-indent it to match that class when applying.
@classmethod
def from_audio(cls, samples, samplerate):
    """Alternate constructor: build an instance from raw audio samples.

    The samples are run through ``cls.batch_stft``; the magnitudes of the
    result go through ``cls._embed``, the last two of the three axes are
    swapped, the log is taken (offset by 1e-12 so zeros stay finite), and
    ``cls._postprocess_coeffs`` is applied before calling ``cls``.
    """
    magnitudes = np.abs(cls.batch_stft(samples))
    embedded = cls._embed(magnitudes).transpose((0, 2, 1))
    log_coeffs = np.log(embedded + 1e-12)
    return cls(cls._postprocess_coeffs(log_coeffs), samplerate)


# Module-level scales and the bases derived from them, shared by the
# phase-recovery classes below.
sr = zounds.SR11025()
n_bands = 256

mel_scale = zounds.MelScale(
    zounds.FrequencyBand(20, sr.nyquist - 20),
    n_bands)
geom_scale = zounds.GeometricScale(20, sr.nyquist - 20, 0.05, n_bands)
linear_scale = zounds.LinearScale(
    zounds.FrequencyBand(0, sr.nyquist),
    513)

# Each basis is computed against the 513-band linear scale with its own
# Hanning windowing function instance.
mel_scale_basis = mel_scale._basis(
    linear_scale, zounds.HanningWindowingFunc())
geom_scale_basis = geom_scale._basis(
    linear_scale, zounds.HanningWindowingFunc())


class MelScalePhaseRecover(BasePhaseRecovery):
    """``BasePhaseRecovery`` variant whose ``basis`` is the mel-scale basis."""

    basis = mel_scale_basis

    def __init__(self, data, samplerate):
        super().__init__(data, samplerate)


class GeometricScalePhaseRecover(BasePhaseRecovery):
    """``BasePhaseRecovery`` variant whose ``basis`` is the geometric-scale basis."""

    basis = geom_scale_basis