Example #1
    def __init__(self):
        feature_channels = 128
        feature_size = 32
        samplerate = zounds.SR22050()
        n_fft = 1024
        hop = 256
        total_samples = 8192

        super().__init__(generator=GroupedMDCTGenerator(feature_channels),
                         discriminator=MDCTDiscriminator(
                             MDCT.mdct_bins(),
                             feature_size,
                             conditioning_channels=feature_channels),
                         learning_rate=1e-4,
                         feature_size=feature_size,
                         audio_repr_class=MDCT,
                         generator_loss=mel_gan_gen_loss,
                         sub_gen_loss=least_squares_generator_loss,
                         discriminator_loss=mel_gan_disc_loss,
                         sub_disc_loss=least_squares_disc_loss,
                         g_init=weights_init,
                         d_init=weights_init,
                         feature_funcs={
                             'audio': (audio, (samplerate, )),
                             'spectrogram': (spectrogram, (samplerate, ))
                         },
                         total_samples=total_samples,
                         feature_channels=feature_channels,
                         samplerate=samplerate,
                         inference_sequence_factor=4)
Example #2
    def __init__(self):
        total_samples = 8192
        samplerate = zounds.SR22050()
        n_fft = 1024
        hop = 256
        n_mels = 128
        feature_size = total_samples // hop

        super().__init__(generator=MelGanGenerator(feature_size, n_mels),
                         discriminator=MelGanDiscriminator(),
                         learning_rate=1e-4,
                         feature_size=feature_size,
                         audio_repr_class=RawAudio,
                         generator_loss=mel_gan_gen_loss,
                         discriminator_loss=mel_gan_disc_loss,
                         g_init=weights_init,
                         d_init=weights_init,
                         feature_funcs={
                             'audio': (audio, (samplerate, )),
                             'spectrogram':
                             (spectrogram, (samplerate, n_fft, hop, n_mels))
                         },
                         total_samples=total_samples,
                         feature_channels=n_mels,
                         samplerate=samplerate)
Example #3
    def __init__(self):
        n_mels = 128
        size = 32
        samplerate = zounds.SR22050()
        n_fft = 1024
        hop = 256
        total_samples = 8192

        super().__init__(Generator(n_mels, size, n_residual_layers=3),
                         Discriminator(num_D=3,
                                       ndf=16,
                                       n_layers=4,
                                       downsampling_factor=4),
                         learning_rate=1e-4,
                         feature_size=size,
                         audio_repr_class=RawAudio,
                         generator_loss=mel_gan_gen_loss,
                         discriminator_loss=mel_gan_disc_loss,
                         g_init=weights_init,
                         d_init=weights_init,
                         feature_funcs={
                             'audio': (audio, (samplerate, )),
                             'spectrogram': (spectrogram, (samplerate, ))
                         },
                         total_samples=total_samples,
                         feature_channels=n_mels,
                         samplerate=samplerate,
                         inference_sequence_factor=4)
Example #4
    def __init__(self):
        n_mels = 128
        feature_size = 32
        samplerate = zounds.SR22050()
        n_fft = 1024
        hop = 256
        total_samples = 8192

        super().__init__(generator=MultiScaleGenerator(n_mels,
                                                       feature_size,
                                                       total_samples,
                                                       transposed_conv=True,
                                                       recompose=True),
                         discriminator=ComplextSTFTDiscriminator(n_fft,
                                                                 hop,
                                                                 n_mels,
                                                                 do_fft=True),
                         learning_rate=1e-4,
                         feature_size=feature_size,
                         audio_repr_class=RawAudio,
                         generator_loss=mel_gan_gen_loss,
                         sub_gen_loss=least_squares_generator_loss,
                         discriminator_loss=mel_gan_disc_loss,
                         sub_disc_loss=least_squares_disc_loss,
                         g_init=weights_init,
                         d_init=weights_init,
                         feature_funcs={
                             'audio': (audio, (samplerate, )),
                             'spectrogram': (spectrogram, (samplerate, ))
                         },
                         total_samples=total_samples,
                         feature_channels=n_mels,
                         samplerate=samplerate,
                         inference_sequence_factor=4)
Example #5
    def __init__(self):
        n_mels = 128
        feature_size = 32
        sr = zounds.SR22050()
        n_fft = 1024
        hop = 256
        total_samples = 8192

        freq_band = zounds.FrequencyBand(20, sr.nyquist - 20)
        n_filters = 128
        filter_taps = 511

        gen_scale = zounds.LinearScale(freq_band, n_filters)
        gen_filter_bank = zounds.learn.FilterBank(sr,
                                                  filter_taps,
                                                  gen_scale,
                                                  0.9,
                                                  normalize_filters=True,
                                                  a_weighting=False)

        disc_scale = zounds.LinearScale(freq_band, n_filters)
        disc_filter_bank = zounds.learn.FilterBank(sr,
                                                   filter_taps,
                                                   disc_scale,
                                                   0.9,
                                                   normalize_filters=True,
                                                   a_weighting=False)

        super().__init__(generator=ResidualStackFilterBankGenerator(
            gen_filter_bank,
            feature_size,
            total_samples,
            n_mels,
            add_weight_norm=True),
                         discriminator=FilterBankDiscriminator(
                             disc_filter_bank,
                             total_samples,
                             conditioning_channels=n_mels),
                         learning_rate=1e-4,
                         feature_size=feature_size,
                         audio_repr_class=RawAudio,
                         generator_loss=mel_gan_gen_loss,
                         sub_gen_loss=least_squares_generator_loss,
                         discriminator_loss=mel_gan_disc_loss,
                         sub_disc_loss=least_squares_disc_loss,
                         g_init=weights_init,
                         d_init=weights_init,
                         feature_funcs={
                             'audio': (audio, (sr, )),
                             'spectrogram': (spectrogram, (sr, ))
                         },
                         total_samples=total_samples,
                         feature_channels=n_mels,
                         samplerate=sr,
                         inference_sequence_factor=4)
Example #6
def stream(batch_size=64):
    path = '/hdd/musicnet/train_data'
    pattern = '*.wav'

    samplerate = zounds.SR22050()
    feature_spec = {'spectrogram': (256, 128)}

    feature_funcs = {'spectrogram': (spectrogram, (samplerate, ))}

    bs = batch_stream(path, pattern, batch_size, feature_spec, 'spectrogram',
                      feature_funcs)
    return bs
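
A minimal consumption sketch (not from the original), assuming batch_stream yields one-element tuples of numpy arrays for the requested 'spectrogram' feature:

if __name__ == '__main__':
    # Hypothetical usage; the exact array shape depends on how batch_stream
    # assembles the (256, 128) spectrogram chunks requested in feature_spec.
    for spec, in stream(batch_size=4):
        print(spec.shape)
        break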
Example #7
    def __init__(self):
        n_mels = 128
        n_fft = 1024
        hop = 256
        samplerate = zounds.SR22050()
        feature_size = 32
        total_samples = 8192

        n_osc = 128
        scale = zounds.MelScale(
            zounds.FrequencyBand(20, samplerate.nyquist - 20), n_osc)

        filter_bank = zounds.learn.FilterBank(samplerate,
                                              511,
                                              scale,
                                              0.9,
                                              normalize_filters=True,
                                              a_weighting=False)

        super().__init__(generator=DDSPGenerator(n_osc=n_osc,
                                                 input_size=feature_size,
                                                 in_channels=n_mels,
                                                 output_size=total_samples,
                                                 scale=scale,
                                                 samplerate=samplerate),
                         discriminator=MultiScaleMultiResDiscriminator(
                             total_samples,
                             flatten_multiscale_features=False,
                             decompose=True,
                             channel_judgements=True,
                             conditioning_channels=n_mels),
                         learning_rate=1e-4,
                         feature_size=feature_size,
                         audio_repr_class=RawAudio,
                         generator_loss=mel_gan_gen_loss,
                         sub_gen_loss=least_squares_generator_loss,
                         discriminator_loss=mel_gan_disc_loss,
                         sub_disc_loss=least_squares_disc_loss,
                         g_init=weights_init,
                         d_init=weights_init,
                         feature_funcs={
                             'audio': (audio, (samplerate, )),
                             'spectrogram': (spectrogram, (samplerate, ))
                         },
                         total_samples=total_samples,
                         feature_channels=n_mels,
                         inference_sequence_factor=4,
                         samplerate=samplerate)
Example #8
def stream(total_samples=8192, batch_size=32):
    path = '/hdd/musicnet/train_data'
    pattern = '*.wav'

    samplerate = zounds.SR22050()
    # total_samples = 8192
    feature_spec = {'audio': (total_samples, 1)}

    feature_funcs = {'audio': (audio, (samplerate, ))}

    # batch_size = 32
    bs = batch_stream(path, pattern, batch_size, feature_spec, 'audio',
                      feature_funcs)
    for batch, in bs:
        transformed = IdentityPhaseReovery.from_audio(batch, samplerate)
        yield batch, transformed
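
A minimal sketch of driving the generator above (hypothetical usage, not from the original); it pulls one batch of raw audio together with its transformed representation:

if __name__ == '__main__':
    # Hypothetical usage; array shapes and the transformed type are assumptions.
    for raw, transformed in stream(total_samples=8192, batch_size=2):
        print(raw.shape, type(transformed))
        break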
Example #9
class FilterBankMultiscaleExperiment(Experiment):

    AUDIO_REPR_CLASS = MultiScale
    SAMPLERATE = zounds.SR22050()
    N_MELS = 128
    feature_size = 32
    total_samples = 8192

    @classmethod
    def make_generator(cls):
        return FilterBankMultiScaleGenerator(cls.SAMPLERATE,
                                             cls.N_MELS,
                                             cls.feature_size,
                                             cls.total_samples,
                                             recompose=False)

    def __init__(self):
        super().__init__(generator=FilterBankMultiScaleGenerator(
            self.SAMPLERATE,
            self.N_MELS,
            self.feature_size,
            self.total_samples,
            recompose=False),
                         discriminator=FilterBankMultiScaleDiscriminator(
                             self.total_samples,
                             self.SAMPLERATE,
                             decompose=False,
                             conditioning_channels=self.N_MELS),
                         learning_rate=1e-4,
                         feature_size=self.feature_size,
                         audio_repr_class=self.AUDIO_REPR_CLASS,
                         generator_loss=mel_gan_gen_loss,
                         sub_gen_loss=least_squares_generator_loss,
                         discriminator_loss=mel_gan_disc_loss,
                         sub_disc_loss=least_squares_disc_loss,
                         g_init=weights_init,
                         d_init=weights_init,
                         feature_funcs={
                             'audio': (audio, (self.SAMPLERATE, )),
                             'spectrogram': (spectrogram, (self.SAMPLERATE, ))
                         },
                         total_samples=self.total_samples,
                         feature_channels=self.N_MELS,
                         samplerate=self.SAMPLERATE,
                         inference_sequence_factor=4)
Example #10
    def __init__(self):
        noise_dim = 128


        samplerate = zounds.SR22050()
        repr_class = PcaRepresentation
        vocoder = DeterministicVocoder(repr_class, samplerate)

        n_features = repr_class.pca.n_components

        def gen_loss(r_features, f_features, r_score, f_score, gan_loss):
            return least_squares_generator_loss(f_score)

        def disc_loss(r_score, f_score, gan_loss):
            return least_squares_disc_loss(r_score, f_score)

        disc_channels = 256

        super().__init__(
            vocoder=vocoder,
            feature_generator=PredictiveGenerator(),
            generator_init=weights_init,
            generator_loss=gen_loss,
            feature_disc=SpectrogramFeatureDiscriminator(
                n_features, disc_channels),
            disc_init=weights_init,
            disc_loss=disc_loss,
            feature_funcs={
                'audio': (audio, (samplerate,))
            },
            feature_spec={
                'audio': (2**16, 1)
            },
            audio_repr_class=PcaRepresentation,
            learning_rate=1e-4,
            condition_shape=(noise_dim, 1),
            samplerate=samplerate,
            anchor_feature='audio')
Example #11
    def __init__(self):
        n_mels = 128
        feature_size = 32
        samplerate = zounds.SR22050()
        n_fft = 1024
        hop = 256
        total_samples = 8192

        super().__init__(generator=MultiScaleGenerator(n_mels,
                                                       feature_size,
                                                       total_samples,
                                                       transposed_conv=True,
                                                       recompose=False,
                                                       kernel_size=8),
                         discriminator=MultiScaleMultiResDiscriminator(
                             total_samples,
                             flatten_multiscale_features=False,
                             channel_judgements=True,
                             conditioning_channels=n_mels,
                             decompose=False,
                             kernel_size=9),
                         learning_rate=1e-4,
                         feature_size=feature_size,
                         audio_repr_class=MultiScale,
                         generator_loss=mel_gan_gen_loss,
                         sub_gen_loss=least_squares_generator_loss,
                         discriminator_loss=mel_gan_disc_loss,
                         sub_disc_loss=least_squares_disc_loss,
                         g_init=weights_init,
                         d_init=weights_init,
                         feature_funcs={
                             'audio': (audio, (samplerate, )),
                             'spectrogram': (spectrogram, (samplerate, ))
                         },
                         total_samples=total_samples,
                         feature_channels=n_mels,
                         samplerate=samplerate,
                         inference_sequence_factor=4)
Example #12
class ComplexSTFTExperiment(Experiment):

    N_MELS = 128
    FEATURE_SIZE = 32
    SAMPLERATE = zounds.SR22050()
    N_FFT = 1024
    HOP = 256
    TOTAL_SAMPLES = 8192
    AUDIO_REPR_CLASS = ComplextSTFT

    @classmethod
    def make_generator(cls):
        return ComplextSTFTGenerator(cls.N_MELS, cls.N_FFT, cls.HOP)

    def __init__(self):
        super().__init__(generator=self.make_generator(),
                         discriminator=ComplextSTFTDiscriminator(
                             window_size=self.N_FFT,
                             hop=self.HOP,
                             conditioning_channels=self.N_MELS),
                         learning_rate=1e-4,
                         feature_size=self.FEATURE_SIZE,
                         audio_repr_class=self.AUDIO_REPR_CLASS,
                         generator_loss=mel_gan_gen_loss,
                         sub_gen_loss=least_squares_generator_loss,
                         discriminator_loss=mel_gan_disc_loss,
                         sub_disc_loss=least_squares_disc_loss,
                         g_init=weights_init,
                         d_init=weights_init,
                         feature_funcs={
                             'audio': (audio, (self.SAMPLERATE, )),
                             'spectrogram': (spectrogram, (self.SAMPLERATE, ))
                         },
                         total_samples=self.TOTAL_SAMPLES,
                         feature_channels=self.N_MELS,
                         samplerate=self.SAMPLERATE,
                         inference_sequence_factor=4)
Example #13
import argparse
import featureflow as ff
import zounds


class Settings(ff.PersistenceSettings):
    id_provider = ff.UuidProvider()
    key_builder = ff.StringDelimitedKeyBuilder()
    database = ff.LmdbDatabase(path='timbre', key_builder=key_builder)


windowing = zounds.HalfLapped()
STFT = zounds.stft(resample_to=zounds.SR22050(), wscheme=windowing)


class WithTimbre(STFT, Settings):
    bark = zounds.ConstantRateTimeSeriesFeature(
            zounds.BarkBands,
            needs=STFT.fft,
            store=True)

    bfcc = zounds.ConstantRateTimeSeriesFeature(
            zounds.BFCC,
            needs=bark,
            store=True)


@zounds.simple_settings
class BfccKmeans(ff.BaseModel):
    docs = ff.Feature(
            ff.IteratorNode,
Example #14
from featuresynth.data import batch_stream
from featuresynth.feature import audio
from featuresynth.audio.transform import fft_frequency_decompose, fft_resample
from featuresynth.audio import RawAudio
import zounds
import torch
import numpy as np
from matplotlib import pyplot as plt

path = '/hdd/musicnet/train_data'
pattern = '*.wav'
total_samples = 2**17

samplerate = zounds.SR22050()
feature_spec = {'audio': (total_samples, 1)}

feature_funcs = {'audio': (audio, (samplerate, ))}

batch_size = 1
bs = batch_stream(path, pattern, batch_size, feature_spec, 'audio',
                  feature_funcs)

if __name__ == '__main__':
    # app = zounds.ZoundsApp(locals=locals(), globals=globals())
    # app.start_in_thread(9999)
    # samples, = next(bs)
    # samples = torch.from_numpy(samples)
    # min_size = 2 ** (np.log2(total_samples) - 4)
    # bands = fft_frequency_decompose(samples, min_size)
    # samples = zounds.AudioSamples(samples.squeeze(), samplerate)
    # input('Waiting...')
    pass  # keeps the block syntactically valid while the exploratory code above stays commented out
Example #15
        real_part, imag_part = fft.unbind(-1)
        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        mel_output = torch.matmul(self.mel_basis, magnitude)
        log_mel_spec = torch.log10(torch.clamp(mel_output, min=1e-5))
        return log_mel_spec


data_cache = LmdbCollection('datacache')


@cache(data_cache)
def audio(file_chunk, samplerate):
    file_path, start, stop = file_chunk
    samples = zounds.AudioSamples.from_file(file_path).mono[start:stop]
    samples = librosa.resample(samples, int(samples.samplerate),
                               int(samplerate))
    samples = librosa.util.normalize(samples, axis=-1) * 0.95
    return samples.astype(np.float32)


audio_to_mel_22050 = Audio2Mel(1024, 256, 1024, int(zounds.SR22050()), 128)


@cache(data_cache)
def spectrogram(file_chunk, samplerate):
    print(file_chunk)
    samples = audio(file_chunk, samplerate)[:]
    spec = audio_to_mel_22050(samples)
    spec = spec.data.cpu().numpy().T.astype(np.float32)
    return spec
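
The cached audio and spectrogram functions above are what the experiments pass as feature_funcs; a minimal sketch of calling them directly, where the file path and chunk bounds are hypothetical placeholders (a file_chunk is the (file_path, start, stop) tuple unpacked in audio):

# Hypothetical direct call; the path and chunk bounds are placeholders.
chunk = ('/hdd/musicnet/train_data/example.wav', 0, 8192)
sr = zounds.SR22050()

samples = audio(chunk, sr)       # normalized float32 samples, cached in the LMDB collection
spec = spectrogram(chunk, sr)    # log-mel spectrogram computed from the cached samples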
Example #16
class FilterBankExperiment(Experiment):
    """
    This is probably the best audio quality yet.  The audio is relatively
    crisp, and its spectrograms are indistinguishable from those of real
    speech, although the words themselves are hard to understand.

    There are definite phase issues here and there after 12 hours.

    Overall, the texture of the speech is more realistic than what's produced
    by the basic MelGAN setup.
    """

    N_MELS = 128
    FEATURE_SIZE = 32
    SAMPLERATE = zounds.SR22050()
    N_FFT = 1024
    HOP = 256
    TOTAL_SAMPLES = 8192
    AUDIO_REPR_CLASS = RawAudio

    @classmethod
    def make_filter_bank(cls, samplerate):
        scale = zounds.LinearScale(
            zounds.FrequencyBand(20, samplerate.nyquist - 20), 128)
        filter_bank = zounds.learn.FilterBank(samplerate,
                                              511,
                                              scale,
                                              0.9,
                                              normalize_filters=True,
                                              a_weighting=False)
        return filter_bank

    @classmethod
    def make_generator(cls, filter_bank=None):
        filter_bank = filter_bank or cls.make_filter_bank(cls.SAMPLERATE)
        return FilterBankGenerator(filter_bank, cls.FEATURE_SIZE,
                                   cls.TOTAL_SAMPLES, cls.N_MELS)

    def __init__(self):
        filter_bank = self.make_filter_bank(self.SAMPLERATE)

        super().__init__(generator=self.make_generator(),
                         discriminator=FilterBankDiscriminator(
                             filter_bank, self.TOTAL_SAMPLES),
                         learning_rate=1e-4,
                         feature_size=self.FEATURE_SIZE,
                         audio_repr_class=self.AUDIO_REPR_CLASS,
                         generator_loss=mel_gan_gen_loss,
                         sub_gen_loss=least_squares_generator_loss,
                         discriminator_loss=mel_gan_disc_loss,
                         sub_disc_loss=least_squares_disc_loss,
                         g_init=weights_init,
                         d_init=weights_init,
                         feature_funcs={
                             'audio': (audio, (self.SAMPLERATE, )),
                             'spectrogram': (spectrogram, (self.SAMPLERATE, ))
                         },
                         total_samples=self.TOTAL_SAMPLES,
                         feature_channels=self.N_MELS,
                         samplerate=self.SAMPLERATE,
                         inference_sequence_factor=4)
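
Because the generator and filter bank are exposed as classmethods, they can be built outside of a full training run; a minimal sketch, assuming the featuresynth and zounds imports used by the class are in scope:

# Hypothetical usage sketch; it only touches the classmethods defined above.
gen = FilterBankExperiment.make_generator()

# A custom filter bank can also be supplied instead of the default one.
fb = FilterBankExperiment.make_filter_bank(FilterBankExperiment.SAMPLERATE)
gen = FilterBankExperiment.make_generator(filter_bank=fb)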
Example #17
    dct = zounds.ArrayWithUnitsFeature(zounds.DCT,
                                       scale_always_even=True,
                                       needs=long_windowed,
                                       store=True)

    mdct = zounds.FrequencyAdaptiveFeature(zounds.FrequencyAdaptiveTransform,
                                           transform=scipy.fftpack.idct,
                                           scale=scale,
                                           needs=dct,
                                           store=True)


if __name__ == '__main__':
    # generate some audio
    synth = zounds.TickSynthesizer(zounds.SR22050())
    orig_audio = synth.synthesize(zounds.Seconds(5), zounds.Milliseconds(200))

    # analyze the audio
    _id = Document.process(meta=orig_audio.encode())
    doc = Document(_id)

    synth = zounds.FrequencyAdaptiveDCTSynthesizer(scale, samplerate)
    recon_audio = synth.synthesize(doc.mdct)

    # get a rasterized visualization of the representation
    img = doc.mdct.square(100, do_overlap_add=True)

    app = zounds.ZoundsApp(model=Document,
                           audio_feature=Document.ogg,
                           visualization_feature=Document.bark,
Example #18
import librosa
# from featuresynth.data import DataStore
# from featuresynth.feature.spectrogram import FilterBankSpectrogram
# from featuresynth.audio import MelScalePhaseRecover, GeometricScalePhaseRecover
import time
from featuresynth.data.conjure import cache, LmdbCollection
from featuresynth.data.filesystem import iter_files
from featuresynth.audio import ComplextSTFT
import zounds
import torch
import numpy as np

if __name__ == '__main__':
    app = zounds.ZoundsApp(globals=globals(), locals=locals())
    app.start_in_thread(9999)

    sr = zounds.SR22050()
    # synth = zounds.SineSynthesizer(sr)
    # samples = synth.synthesize(
    #     zounds.Seconds(2), [110, 220, 440, 880]).astype(np.float32)
    file_path = next(iter_files('/hdd/LJSpeech-1.1', '*.wav'))
    samples = zounds.AudioSamples.from_file(file_path).astype(np.float32)

    r = ComplextSTFT.from_audio(samples[None, None, :], sr)
    phase = r.phase
    phase[:] = np.random.uniform(-np.pi, np.pi, phase.shape)
    recon = r.listen()

    scale = zounds.MelScale(zounds.FrequencyBand(20, sr.nyquist - 20), 256)
    filter_bank = zounds.learn.FilterBank(sr,
                                          1024,
                                          scale,