示例#1
0
def preProcessingSample(win_length,
                        window=False,
                        preEmphasis=False,
                        spec=False,
                        n_fft=None,
                        n_hop=None,
                        log=False,
                        power=1.0):
    """Build a Keras model that applies optional pre-processing to a frame.

    The model input has shape ``(win_length, 1)``. ``Models.Window`` and
    ``Models.PreEmphasis`` are applied, in that order, when the matching
    flag is set.

    When ``spec`` is false the processed signal is the only output. When it
    is true the model has two outputs: the ``Spectrogram`` of the processed
    signal, followed by the processed signal itself. ``n_fft``, ``n_hop``,
    ``log`` and ``power`` are only read in that case and are forwarded to the
    ``Spectrogram`` layer as ``n_dft``, ``n_hop``,
    ``return_decibel_spectrogram`` and ``power_spectrogram``.
    """
    frame_in = Input(shape=(win_length, 1))

    signal = frame_in
    if window:
        signal = Lambda((Models.Window), name='output')(signal)
    if preEmphasis:
        signal = Lambda((Models.PreEmphasis), name='preEmph')(signal)

    # Without a spectrogram branch the processed signal is the sole output.
    if not spec:
        return Model(inputs=[frame_in], outputs=[signal])

    stft_layer = Spectrogram(n_dft=n_fft,
                             n_hop=n_hop,
                             input_shape=(1, win_length),
                             return_decibel_spectrogram=log,
                             power_spectrogram=power,
                             trainable_kernel=False,
                             name='spec')

    # The Spectrogram layer is declared with input_shape (1, win_length), so
    # the signal goes through Models.toPermuteDimensions first.
    permuted = Lambda((Models.toPermuteDimensions), name='perm_mel')(signal)
    spectrum = stft_layer(permuted)
    return Model(inputs=[frame_in], outputs=[spectrum, signal])
示例#2
0
    def _test_stereo_same():
        """Check the Spectrogram output shape for stereo input, 'same' padding.

        A random two-channel signal is run through a model holding a single
        ``Spectrogram`` layer; the channel, frequency-bin and frame counts of
        the prediction are asserted for the active Keras image data format.
        """
        n_ch = 2
        n_dft, len_hop, nsp_src = 512, 256, 8000
        src = np.random.uniform(-1., 1., (n_ch, nsp_src))

        model = keras.models.Sequential()
        stft_layer = Spectrogram(n_dft=n_dft,
                                 n_hop=len_hop,
                                 padding='same',
                                 power_spectrogram=1.0,
                                 return_decibel_spectrogram=False,
                                 image_data_format='default',
                                 input_shape=(n_ch, nsp_src))
        model.add(stft_layer)
        batch_stft_kapre = model.predict(src[np.newaxis, :])

        # Axis 0 is the batch; the remaining axis order follows the data format.
        if image_data_format() == 'channels_last':
            ch_axis, freq_axis, frame_axis = 3, 1, 2
        else:
            ch_axis, freq_axis, frame_axis = 1, 2, 3

        out_shape = batch_stft_kapre.shape
        assert out_shape[ch_axis] == n_ch
        assert out_shape[freq_axis] == n_dft // 2 + 1
        assert out_shape[frame_axis] == _num_frame_same(nsp_src, len_hop)
示例#3
0
 def __spec_model(self, input_shape, decibel_gram):
     """Return a Sequential model: Spectrogram, then Normalization2D on 'freq'.

     ``input_shape`` is passed to the ``Spectrogram`` layer and
     ``decibel_gram`` becomes its ``return_decibel_spectrogram`` flag.
     """
     stft_layer = Spectrogram(input_shape=input_shape,
                              return_decibel_spectrogram=decibel_gram)
     pipeline = Sequential()
     for layer in (stft_layer, Normalization2D(str_axis='freq')):
         pipeline.add(layer)
     return pipeline
def assemble_model(
    src: np.ndarray,
    n_outputs: int,
    arch_layers: list,
    n_dft: int = 512,  # Orig:128
    n_hop: int = 256,  #  Orig:64
    data_format: str = "channels_first",
) -> keras.Model:
    """Assemble a spectrogram-front-end CNN with a sigmoid output layer.

    Args:
        src: Example input array; only ``src.shape`` is used, as the model's
            input shape.
        n_outputs: Number of units in the final sigmoid layer.
        arch_layers: Iterable of objects exposing ``filters``,
            ``window_size``, ``strides`` and ``activation``; one ``Conv2D``
            layer is created per entry, in order.
        n_dft: ``n_dft`` passed to the ``Spectrogram`` layer.
        n_hop: ``n_hop`` passed to the ``Spectrogram`` layer.
        data_format: Passed to both ``Spectrogram`` (as
            ``image_data_format``) and every ``Conv2D``.

    Returns:
        An uncompiled ``keras.Model``.
    """
    inputs = keras.Input(shape=src.shape, name="stft")

    # @paper: Spectrogram based CNN that receives the (log) spectrogram matrix as input

    # @kapre:
    # abs(Spectrogram) in a shape of 2D data, i.e.,
    # `(None, n_channel, n_freq, n_time)` if `'channels_first'`,
    # `(None, n_freq, n_time, n_channel)` if `'channels_last'`,
    stft_layer = Spectrogram(
        n_dft=n_dft,
        n_hop=n_hop,
        input_shape=src.shape,
        trainable_kernel=True,
        name="static_stft",
        image_data_format=data_format,
        return_decibel_spectrogram=True,
    )
    x = stft_layer(inputs)

    # Swaps the frequency and time axes to match the paper?
    # TODO: dig in to this (GPU only?)
    swap_freq_time = (1, 3, 2) if data_format == "channels_first" else (2, 1, 3)
    x = keras.layers.Permute(swap_freq_time)(x)

    for arch_layer in arch_layers:
        conv = keras.layers.Conv2D(
            arch_layer.filters,
            arch_layer.window_size,
            strides=arch_layer.strides,
            activation=arch_layer.activation,
            data_format=data_format,
        )
        x = conv(x)

    # Flatten down to a single dimension
    x = keras.layers.Flatten()(x)

    # @paper: sigmoid activations with binary cross entropy loss
    # @paper: FC-512
    x = keras.layers.Dense(512)(x)

    # @paper: FC-368(sigmoid)
    prediction_layer = keras.layers.Dense(n_outputs,
                                          activation="sigmoid",
                                          name="predictions")
    outputs = prediction_layer(x)

    return keras.Model(inputs=inputs, outputs=outputs)
示例#5
0
def stft_model(audio_len, normalize=True, **kwargs):
    """Assemble a Sequential STFT preprocessing model.

    The first layer is a ``Spectrogram`` with ``input_shape=(1, audio_len)``;
    extra keyword arguments are forwarded to it (see
    https://github.com/keunwoochoi/kapre/blob/master/kapre/time_frequency.py#L11).
    Unless ``normalize`` is false, a ``Normalization2D(str_axis='freq')``
    layer follows.
    """
    layers = [Spectrogram(input_shape=(1, audio_len), **kwargs)]
    if normalize:
        layers.append(Normalization2D(str_axis='freq'))
    return Sequential(layers)
def make_kapre_mag_maker(n_fft=1024, hop_length=128, audio_data_len=80000):
    """Return a Sequential model holding one fixed-kernel Spectrogram layer.

    The layer is named ``'stft'``, takes input of shape
    ``(1, audio_data_len)``, and is configured with
    ``power_spectrogram=2.0`` and no decibel conversion.
    """
    power_stft = Spectrogram(n_dft=n_fft,
                             n_hop=hop_length,
                             input_shape=(1, audio_data_len),
                             power_spectrogram=2.0,
                             return_decibel_spectrogram=False,
                             trainable_kernel=False,
                             name='stft')
    return keras.models.Sequential([power_stft])
示例#7
0
    def _test_correctness():
        """Compare kapre's power spectrogram with librosa and a stored reference.

        Two checks are made: the kapre output must be close to the
        precomputed array in ``tests/test_audio_stft_g0.npy``, and its dB
        representation must differ from librosa's by less than 1.5% of
        librosa's dB range on average.
        """
        audio_data = np.load('tests/speech_test_file.npz')['audio_data']

        hop_length = 128
        n_fft = 1024

        # Reference computed with librosa.
        stft_librosa = librosa.core.stft(audio_data,
                                         n_fft=n_fft,
                                         hop_length=hop_length)
        magnitudes_librosa = librosa.magphase(stft_librosa, power=2)[0]
        S_DB_librosa = librosa.power_to_db(magnitudes_librosa, ref=np.max)

        # Precomputed reference.
        magnitudes_expected = np.load('tests/test_audio_stft_g0.npy')

        # Input layout depends on the active Keras image data format.
        channels_last = image_data_format() == 'channels_last'
        if channels_last:
            layer_input_shape = (len(audio_data), 1)
        else:
            layer_input_shape = (1, len(audio_data))

        # Same quantity computed with kapre.
        model = tensorflow.keras.models.Sequential()
        model.add(
            Spectrogram(
                n_dft=n_fft,
                n_hop=hop_length,
                input_shape=layer_input_shape,
                power_spectrogram=2.0,
                return_decibel_spectrogram=False,
                trainable_kernel=False,
                name='stft',
            ))

        if channels_last:
            batch = audio_data.reshape(1, -1, 1)
        else:
            batch = audio_data.reshape(1, 1, -1)
        prediction = model.predict(batch)

        # Drop the batch and channel axes to get a 2D array.
        if channels_last:
            spec_kapre = prediction[0, :, :, 0]
        else:
            spec_kapre = prediction[0, 0]
        magnitudes_kapre = librosa.magphase(spec_kapre, power=1)[0]
        S_DB_kapre = librosa.power_to_db(magnitudes_kapre, ref=np.max)

        DB_scale = np.max(S_DB_librosa) - np.min(S_DB_librosa)
        S_DB_dif = np.abs(S_DB_kapre - S_DB_librosa) / DB_scale

        assert np.allclose(magnitudes_expected,
                           magnitudes_kapre,
                           rtol=1e-2,
                           atol=1e-8)
        assert np.mean(S_DB_dif) < 0.015
def test_plot():
    """Plot kapre spectrogram layers next to a librosa spectrogram.

    A trainable ``Melspectrogram`` model is built, checked and visualised
    first. A fixed-kernel ``Spectrogram`` model is then shown in the left
    subplot, beside a librosa-computed spectrogram of the first three seconds
    of a local recording in the right subplot. The function ends with a call
    to ``plt.show()``.
    """
    SR = 16000
    # Only the shape of this random array is used (as the layers' input shape).
    src = np.random.random((1, SR * 3))
    # NOTE(review): hard-coded absolute path to a developer-local file; this
    # only runs on a machine where that file exists.
    src_cute, _ = librosa.load(
        '/Users/admin/Dropbox/workspace/unet/data/audio/abjones_1_01.wav',
        sr=SR,
        mono=True)

    model = Sequential()
    model.add(
        Melspectrogram(sr=SR,
                       n_mels=128,
                       n_dft=512,
                       n_hop=256,
                       input_shape=src.shape,
                       return_decibel_melgram=True,
                       trainable_kernel=True,
                       name='melgram'))

    check_model(model)
    visualise_model(model)

    model = Sequential()
    model.add(
        Spectrogram(n_dft=512,
                    n_hop=256,
                    input_shape=src.shape,
                    return_decibel_spectrogram=False,
                    power_spectrogram=2.0,
                    trainable_kernel=False,
                    name='static_stft'))

    check_model(model)
    plt.figure(figsize=(14, 4))
    plt.subplot(1, 2, 1)
    plt.title('log-Spectrogram by Kapre')
    visualise_model(model, logam=True)
    plt.subplot(1, 2, 2)
    # n_fft / hop_length are passed by keyword: recent librosa releases make
    # every argument after the signal keyword-only.
    stft_librosa = librosa.stft(src_cute[:SR * 3], n_fft=512, hop_length=256)
    display.specshow(librosa.amplitude_to_db(np.abs(stft_librosa)**2,
                                             ref=1.0),
                     y_axis='linear',
                     sr=SR)
    plt.title('log-Spectrogram by Librosa')
    plt.show()
示例#9
0
 def build_CNN_model(self):
     """Build the STFT-front-end CNN classifier and store it on ``self.model``.

     Reads ``self.x_augmented_rolled`` (its shape without the first axis is
     the input shape), ``self.dropout``, ``self.n_classes`` and
     ``self.MODELNAME``. Sets ``self.model`` (uncompiled) and
     ``self.saved_model_name``, and prints the model summary.
     """
     print('Build model...')
     self.model = Sequential()
     self.model.add(Spectrogram(n_dft=128, n_hop=16,
                                input_shape=(self.x_augmented_rolled.shape[1:]),
                                return_decibel_spectrogram=False,
                                power_spectrogram=2.0,
                                trainable_kernel=False, name='static_stft'))
     self.model.add(Normalization2D(str_axis='freq'))

     # (filters, kernel_size, layer name) for the three convolutional blocks.
     conv_blocks = (
         (24, (12, 12), 'conv1'),
         (48, (8, 8), 'conv2'),
         (96, (4, 4), 'conv3'),
     )
     for filters, kernel_size, layer_name in conv_blocks:
         # `padding` is the Keras 2 name of the old `border_mode` argument;
         # the rest of this method already uses the Keras 2 signature.
         self.model.add(Conv2D(filters=filters, kernel_size=kernel_size,
                               strides=(1, 1), name=layer_name,
                               padding='same'))
         # NOTE(review): BatchNormalization uses axis=1 while the pooling
         # layers declare channels_last - confirm which axis holds channels.
         self.model.add(BatchNormalization(axis=1))
         self.model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                     padding='valid',
                                     data_format='channels_last'))
         self.model.add(Activation('relu'))
         self.model.add(Dropout(self.dropout))

     # Classifier head: one unit per class, softmax output.
     self.model.add(Flatten())
     self.model.add(Dense(self.n_classes))
     self.model.add(Activation('softmax'))

     # summary() prints the table itself and returns None, so it is not
     # wrapped in print().
     self.model.summary()
     self.saved_model_name = self.MODELNAME
def assemble_model(
    src: np.ndarray,
    arch_layers: list,
    n_dft: int = 128,
    n_hop: int = 64,
    data_format: str = 'channels_first',
    n_outputs: int = 368,
) -> keras.Model:
    """Assemble a spectrogram-front-end CNN with a sigmoid output layer.

    Args:
        src: Example input array; only ``src.shape`` is used, as the model's
            input shape.
        arch_layers: Iterable of objects exposing ``filters``,
            ``window_size``, ``strides`` and ``activation``; one ``Conv2D``
            layer is created per entry, in order.
        n_dft: ``n_dft`` passed to the ``Spectrogram`` layer.
        n_hop: ``n_hop`` passed to the ``Spectrogram`` layer.
        data_format: Passed to both ``Spectrogram`` (as
            ``image_data_format``) and every ``Conv2D``.
        n_outputs: Number of units in the final sigmoid layer. Defaults to
            368, the value that was previously hard-coded.

    Returns:
        An uncompiled ``keras.Model``.
    """
    inputs = keras.Input(shape=src.shape, name='stft')

    # @paper: Spectrogram based CNN that receives the (log) spectrogram matrix as input

    # @kapre:
    # abs(Spectrogram) in a shape of 2D data, i.e.,
    # `(None, n_channel, n_freq, n_time)` if `'channels_first'`,
    # `(None, n_freq, n_time, n_channel)` if `'channels_last'`,
    # `x` holds the layer's output tensor, not the Spectrogram layer itself.
    x = Spectrogram(
        n_dft=n_dft,
        n_hop=n_hop,
        input_shape=src.shape,
        trainable_kernel=True,
        name='static_stft',
        image_data_format=data_format,
        return_decibel_spectrogram=True,
    )(inputs)

    for arch_layer in arch_layers:
        x = keras.layers.Conv2D(
            arch_layer.filters,
            arch_layer.window_size,
            strides=arch_layer.strides,
            activation=arch_layer.activation,
            data_format=data_format,
        )(x)

    # NOTE(review): no Flatten precedes the dense layers, so they act on the
    # last axis only and the prediction tensor keeps its leading axes -
    # confirm this is intended.

    # @paper: sigmoid activations with binary cross entropy loss
    # @paper: FC-512
    x = keras.layers.Dense(512)(x)

    # @paper: FC-368(sigmoid)
    outputs = keras.layers.Dense(n_outputs,
                                 activation='sigmoid',
                                 name='predictions')(x)

    return keras.Model(inputs=inputs, outputs=outputs)
示例#11
0
def SqueezeNet(input_tensor=None,
               input_shape=(1, 44100 * 3),
               classes=len(classes)):
    """Build a SqueezeNet-style classifier on a decibel-spectrogram front end.

    The input goes through ``Spectrogram`` and ``AdditiveNoise``, a strided
    convolution, a stack of ``fire_module`` blocks joined by additive
    bypasses, and a 1x1 convolution with ``classes`` filters that is
    globally average-pooled and passed through softmax.

    ``input_tensor`` is accepted but not used by this function. The default
    for ``classes`` is taken from the module-level ``classes`` sequence when
    the function is defined.

    Returns the uncompiled ``Model`` named ``'squeezenet'``.
    """
    inputs = Input(shape=input_shape)

    # Front end: spectrogram, noise, then the stem convolution.
    net = Spectrogram(n_dft=512, return_decibel_spectrogram=True)(inputs)
    net = AdditiveNoise(power=0.3, random_gain=True)(net)
    stem_conv = Convolution2D(64, (3, 3),
                              strides=(2, 2),
                              padding='valid',
                              name='conv1')
    net = stem_conv(net)
    net = PReLU(name='prelu_conv1')(net)
    net = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool1')(net)

    # Bypass 1: fire2 + fire3.
    fire_a = fire_module(net, fire_id=2, squeeze=16, expand=64)
    fire_b = fire_module(fire_a, fire_id=3, squeeze=16, expand=64)
    net = add([fire_a, fire_b])
    net = fire_module(net, fire_id=4, squeeze=32, expand=128)

    # Bypass 2: pooled features + fire5.
    pooled = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool3')(net)
    fire_c = fire_module(pooled, fire_id=5, squeeze=32, expand=128)
    net = add([pooled, fire_c])

    # Bypass 3: fire6 + fire7.
    fire_d = fire_module(net, fire_id=6, squeeze=48, expand=192)
    fire_e = fire_module(fire_d, fire_id=7, squeeze=48, expand=192)
    net = add([fire_d, fire_e])
    net = fire_module(net, fire_id=8, squeeze=64, expand=256)
    pooled = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool5')(net)

    # Final bypass: pooled features + fire9.
    fire_f = fire_module(pooled, fire_id=9, squeeze=64, expand=256)
    net = add([pooled, fire_f])

    net = Dropout(0.5, name='drop9')(net)

    # Classification head.
    net = Convolution2D(classes, (1, 1), padding='valid', name='conv10')(net)
    net = PReLU(name='prelu_conv10')(net)
    net = GlobalAveragePooling2D()(net)
    out = Activation('softmax', name='loss')(net)

    return Model(inputs, out, name='squeezenet')
示例#12
0
def createCNN(networkType, numberOfChannels, frameSize, spectroWindowSize,
              spectroWindowShift, numberOfClasses):
    """Create the network selected by ``networkType`` on a Spectrogram layer.

    A ``Spectrogram`` layer with input shape
    ``(numberOfChannels, frameSize)`` is always constructed first and handed
    to the selected builder together with ``numberOfClasses``.

    ``CNN_SHALLOW`` and ``CNN_DEEP`` return the literal ``True``, not a
    model. Any other unrecognised value raises ``ValueError``.
    """
    spectrogramLayer = Spectrogram(
        input_shape=(numberOfChannels, frameSize),
        n_dft=spectroWindowSize,
        n_hop=spectroWindowShift,
        padding='same',
        power_spectrogram=1.0,
        return_decibel_spectrogram=True,
    )

    if networkType == NetworkType.CNN_PROPOSED_MASTER_THESIS:
        return createProposedNet(spectrogramLayer, numberOfClasses)
    if networkType == NetworkType.CNN_SHALLOW:
        return True
    if networkType == NetworkType.CNN_DEEP:
        return True
    if networkType == NetworkType.CNN_PROPOSED_SMALL:
        return createProposedSmall(spectrogramLayer, numberOfClasses)
    if networkType == NetworkType.CNN_RAW:
        return createRawNet(spectrogramLayer, numberOfClasses)

    raise ValueError("NetworkType not recognized! type: ", networkType)
示例#13
0
    def _test_mono_valid():
        """Check the Spectrogram output shape for mono input, 'valid' padding.

        A random single-channel signal is run through a model holding a
        single ``Spectrogram`` layer; the channel, frequency-bin and frame
        counts of the prediction are asserted for the active Keras image data
        format.
        """
        n_ch = 1
        n_dft, len_hop, nsp_src = 512, 256, 8000
        src = np.random.uniform(-1.0, 1.0, nsp_src)

        # The input layout and the output axis order both follow the format.
        channels_last = image_data_format() == 'channels_last'
        if channels_last:
            layer_input_shape = (nsp_src, n_ch)
        else:
            layer_input_shape = (n_ch, nsp_src)

        model = tensorflow.keras.models.Sequential()
        model.add(
            Spectrogram(
                n_dft=n_dft,
                n_hop=len_hop,
                padding='valid',
                power_spectrogram=1.0,
                return_decibel_spectrogram=False,
                image_data_format='default',
                input_shape=layer_input_shape,
            ))

        if channels_last:
            batch = src[np.newaxis, ..., np.newaxis]
        else:
            batch = src[np.newaxis, np.newaxis, ...]
        batch_stft_kapre = model.predict(batch)

        # Axis 0 is the batch.
        if channels_last:
            ch_axis, freq_axis, frame_axis = 3, 1, 2
        else:
            ch_axis, freq_axis, frame_axis = 1, 2, 3

        out_shape = batch_stft_kapre.shape
        assert out_shape[ch_axis] == n_ch
        assert out_shape[freq_axis] == n_dft // 2 + 1
        assert out_shape[frame_axis] == _num_frame_valid(
            nsp_src, n_dft, len_hop)
def MLP_model(input_shape, dropout=0.5, print_summary=False):
    """Build and compile an MLP classifier on top of an STFT front end.

    The hidden-layer width and count come from the module-level names
    ``neurons_per_layer`` and ``n_hidden_layers``.

    Args:
        input_shape: Input shape given to the ``Spectrogram`` layer.
        dropout: Currently unused - every ``Dropout`` layer uses a fixed
            rate of 0.2. Kept so existing callers keep working.
        print_summary: When true, print the Keras model summary.

    Returns:
        The ``Sequential`` model, compiled with categorical cross-entropy,
        the Adam optimizer and an accuracy metric.
    """
    model = Sequential()

    # Spectrogram creation using a fixed-kernel STFT.
    model.add(
        Spectrogram(n_dft=128,
                    n_hop=16,
                    input_shape=input_shape,
                    return_decibel_spectrogram=False,
                    power_spectrogram=2.0,
                    trainable_kernel=False,
                    name='static_stft'))
    model.add(Normalization2D(str_axis='freq'))
    model.add(Flatten())

    # First hidden layer; its input size is inferred from the Flatten output.
    model.add(Dense(neurons_per_layer, activation='relu'))
    model.add(Dropout(0.2))

    # Remaining hidden layers.
    for _ in range(n_hidden_layers - 1):
        model.add(Dense(neurons_per_layer, activation='relu'))
        model.add(Dropout(0.2))

    # Two-unit softmax output.
    model.add(Dense(2))
    model.add(Activation('softmax'))

    if print_summary:
        # summary() prints the table itself and returns None.
        model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model
示例#15
0
def get_network(args):
    """Build a residual-convolution classifier from a configuration dict.

    Keys read from ``args``: ``'shape'``, ``'n_dft'``, ``'conv'`` (with
    ``'n_blocks'``, ``'n_layers'``, ``'n_filters'``, ``'kernel_size'``),
    ``'pool_size'``, ``'dense'`` (with ``'n_layers'``, ``'n_units'``) and
    ``'n_genres'``.

    Returns the uncompiled ``Model``.
    """
    # Expected 2D array: (audio_channel, audio_length), TODO flip the dimensions!
    x_in = Input(shape=args['shape'])

    # The hop is half of the DFT size.
    stft = Spectrogram(n_dft=args['n_dft'], n_hop=int(args['n_dft'] / 2))
    x = stft(x_in)

    conv_cfg = args['conv']
    for _ in range(conv_cfg['n_blocks']):
        block = ResidualConvBlock(conv_cfg['n_layers'],
                                  conv_cfg['n_filters'],
                                  conv_cfg['kernel_size'])
        x = block(x)

        # Pool only while axes 1 and 2 are both larger than the pool window.
        curr_shape = K.int_shape(x)
        fits_first_axis = curr_shape[1] > args['pool_size'][0]
        if fits_first_axis and curr_shape[2] > args['pool_size'][1]:
            x = MaxPooling2D(pool_size=args['pool_size'])(x)

    dense_cfg = args['dense']
    for _ in range(dense_cfg['n_layers']):
        x = Dense(units=dense_cfg['n_units'], use_bias=False)(x)
        x = BatchNormalization()(x)
        x = LeakyReLU(alpha=0.3)(x)

    x_out = Dense(units=args['n_genres'], activation='softmax')(x)

    return Model(inputs=x_in, outputs=x_out)
示例#16
0
def construct_tiny_L3_audio_model():
    """
    Constructs a model that implements a small L3 audio subnetwork

    Returns
    -------
    model:  L3 CNN model named 'audio_model'
            (Type: keras.models.Model)
    inputs: Model input tensor, created with shape (1, 48000)
            (Type: Keras tensor)
    outputs: Flattened output tensor of the layer named 'embedding'
            (Type: Keras tensor)
    """
    weight_decay = 1e-5
    ####
    # Audio subnetwork
    ####
    n_dft = 512
    n_win = 480
    n_hop = n_win // 2
    asr = 48000
    audio_window_dur = 1
    # Number of identical conv / batch-norm / relu / max-pool stages.
    n_conv_stages = 3

    # INPUT
    x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32')

    # SPECTROGRAM PREPROCESSING
    y_a = Spectrogram(n_dft=n_dft,
                      n_win=n_win,
                      n_hop=n_hop,
                      return_decibel_spectrogram=True,
                      padding='valid')(x_a)

    for _ in range(n_conv_stages):
        y_a = Conv2D(10, (5, 5),
                     padding='valid',
                     strides=(1, 1),
                     kernel_initializer='he_normal',
                     kernel_regularizer=regularizers.l2(weight_decay))(y_a)
        y_a = BatchNormalization()(y_a)
        y_a = Activation('relu')(y_a)
        y_a = MaxPooling2D(pool_size=(3, 3), strides=3)(y_a)

    y_a = Flatten(name='embedding')(y_a)

    # The name is given to the constructor: assigning `m.name` afterwards
    # fails on Keras versions where `name` is a read-only property.
    m = Model(inputs=x_a, outputs=y_a, name='audio_model')

    return m, x_a, y_a
示例#17
0
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from kapre.time_frequency import Spectrogram
from kapre.utils import Normalization2D
from kapre.augmentation import AdditiveNoise
import keras
from keras import optimizers
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Sequential classifier: power-spectrogram front end with noise augmentation
# and 'freq' normalization, followed by two conv / batch-norm / pool / dense
# / dropout stages. `input_shape` must be defined before this point.
classifier = Sequential()

classifier.add(Spectrogram(n_dft=512, n_hop=256, padding='same',
                           input_shape=input_shape,
                           power_spectrogram=2.0,
                           return_decibel_spectrogram=False,
                           trainable_kernel=False,
                           image_data_format='default'))

classifier.add(AdditiveNoise(power=0.2))
classifier.add(Normalization2D(str_axis='freq'))

# Layer 1
classifier.add(Conv2D(24, (1, 1), activation='relu'))
# The normalization and dropout layers are added to the model here; building
# them without classifier.add() left them out of the network entirely.
classifier.add(keras.layers.BatchNormalization(axis=-1, momentum=0.99,
                                               epsilon=0.001))
classifier.add(MaxPooling2D(pool_size=(2, 2)))
# NOTE(review): a Dense layer on a 4D feature map acts on the last axis
# only - confirm this placement is intended.
classifier.add(Dense(units=128, activation='relu'))
classifier.add(keras.layers.Dropout(0.5, noise_shape=None, seed=None))

# Layer 2
classifier.add(Conv2D(48, (1, 1), activation='relu'))
classifier.add(keras.layers.BatchNormalization(axis=-1, momentum=0.99,
                                               epsilon=0.001))
classifier.add(MaxPooling2D(pool_size=(2, 2)))
classifier.add(Dense(units=128, activation='relu'))
classifier.add(keras.layers.Dropout(0.5, noise_shape=None, seed=None))
def model_1(win_length, filters, kernel_size_1, learning_rate, batch):
    """Build and compile a two-output Keras model over a context of frames.

    The input is a fixed-size batch of ``kContext * 2 + 1`` frames, each of
    shape ``(win_length, 1)``. The model returns two outputs: the layer named
    ``'spec'`` (a decibel power spectrogram of the output signal) and the
    layer named ``'waveform'`` (the output signal itself). It is compiled
    with MSE on ``'spec'`` (weight 0.0001) and ``MAE_preEmphasis`` on
    ``'waveform'`` (weight 1.0), using Adam.

    Relies on the module-level names ``kSR`` and ``kContext`` and on the
    project layers ``Conv1D_local``, ``Conv1D_localTensor``, ``Conv1D_tied``,
    ``VelvetNoise``, ``SAAF`` and ``se_block_lstm``.

    Args:
        win_length: Number of samples per frame.
        filters: Number of filters of the front-end convolution; also sizes
            the recurrent and dense layers.
        kernel_size_1: Kernel size of the front-end convolution (the
            smoothing convolution uses twice this size).
        learning_rate: Learning rate given to the Adam optimizer.
        batch: Fixed batch size, used in the input's ``batch_shape`` and
            passed to ``Conv1D_localTensor`` and ``VelvetNoise``.

    Returns:
        The compiled ``Model``.
    """

    # kPs sizes the two dense heads below and is the first VelvetNoise argument.
    kPs = int((win_length * 2000 / kSR))
    # NOTE(review): kN is not used anywhere in this function.
    kN = int(win_length)

    ini1 = tf.initializers.random_uniform(minval=-1, maxval=1)
    # NOTE(review): ini2 is not used anywhere in this function.
    ini2 = tf.initializers.random_uniform(minval=0, maxval=1)

    # Both `shape` and `batch_shape` are given; the batch size is fixed.
    x = Input(shape=(kContext * 2 + 1, win_length, 1),
              name='input',
              batch_shape=(batch, kContext * 2 + 1, win_length, 1))

    # --- Layer definitions (nothing is connected until further below) ---

    # Front-end convolution; the same object is handed to Conv1D_tied below.
    conv = Conv1D(filters,
                  kernel_size_1,
                  strides=1,
                  padding='same',
                  kernel_initializer='lecun_uniform',
                  input_shape=(win_length, 1))

    activation_abs = Activation(K.abs)
    activation_sp = Activation('softplus')
    max_pooling = MaxPooling1D(pool_size=win_length // 64)

    conv_smoothing = Conv1D_local(filters,
                                  kernel_size_1 * 2,
                                  strides=1,
                                  padding='same',
                                  kernel_initializer='lecun_uniform')

    dense_sgn = Dense(kPs,
                      activation='tanh',
                      kernel_initializer=ini1,
                      name='dense_l_sgn')

    dense_idx = Dense(kPs, activation='sigmoid', name='dense_l_idx')

    bi_rnn = Bidirectional(LSTM(filters * 2,
                                activation='tanh',
                                stateful=False,
                                return_sequences=True,
                                dropout=0.1,
                                recurrent_dropout=0.1),
                           merge_mode='concat',
                           name='birnn_in')
    bi_rnn1 = Bidirectional(LSTM(filters,
                                 activation='tanh',
                                 stateful=False,
                                 return_sequences=True,
                                 dropout=0.1,
                                 recurrent_dropout=0.1),
                            merge_mode='concat',
                            name='birnn_1')
    bi_rnn2 = Bidirectional(LSTM(filters // 2,
                                 activation='linear',
                                 stateful=False,
                                 return_sequences=True,
                                 dropout=0.1,
                                 recurrent_dropout=0.1),
                            merge_mode='concat',
                            name='birnn_2')

    bi_rnn3 = Bidirectional(LSTM(filters // 2,
                                 activation='linear',
                                 stateful=False,
                                 return_sequences=True,
                                 dropout=0.1,
                                 recurrent_dropout=0.1),
                            merge_mode='concat',
                            name='birnn_3')

    convTensors = Conv1D_localTensor(filters,
                                     win_length,
                                     batch,
                                     strides=1,
                                     padding='same',
                                     name='convTensors')

    # Receives the front-end `conv` layer - presumably to share its kernel;
    # confirm against the Conv1D_tied implementation.
    deconv = Conv1D_tied(1, kernel_size_1, conv, padding='same', name='deconv')

    velvet = VelvetNoise(kPs,
                         batch,
                         input_dim=filters,
                         input_length=win_length,
                         name='velvet')

    # --- Graph wiring ---

    # The front end is applied to every context frame via TimeDistributed.
    X = TimeDistributed(conv, name='conv')(x)
    X_abs = TimeDistributed(activation_abs, name='conv_activation')(X)
    M = TimeDistributed(conv_smoothing, name='conv_smoothing')(X_abs)
    M = TimeDistributed(activation_sp, name='conv_smoothing_activation')(M)
    # P keeps the raw convolution output for the output branch further down.
    P = X
    Z = TimeDistributed(max_pooling, name='max_pooling')(M)
    # Split the pooled frames along axis 1 and concatenate them again.
    Z = Lambda(lambda inputs: tf.unstack(
        inputs, num=kContext * 2 + 1, axis=1, name='unstack2'))(Z)
    Z = Concatenate(name='concatenate')(Z)

    # Recurrent stack; Z feeds two branches, Z1 and Z2.
    Z = bi_rnn(Z)
    Z1 = bi_rnn1(Z)
    Z1 = bi_rnn2(Z1)
    Z1 = SAAF(break_points=25,
              break_range=0.2,
              magnitude=100,
              order=2,
              tied_feamap=True,
              kernel_initializer='random_normal',
              name='saaf_1')(Z1)

    Z2 = bi_rnn3(Z)
    Z2 = SAAF(break_points=25,
              break_range=0.2,
              magnitude=100,
              order=2,
              tied_feamap=True,
              kernel_initializer='random_normal',
              name='saaf_2')(Z2)

    # Z1 is permuted before the two dense heads and their outputs are
    # permuted again afterwards.
    Z1 = Lambda((toPermuteDimensions), name='perm_1')(Z1)

    sgn = dense_sgn(Z1)
    idx = dense_idx(Z1)

    sgn = Lambda((toPermuteDimensions), name='perm_2')(sgn)
    idx = Lambda((toPermuteDimensions), name='perm_3')(idx)

    # Split the per-frame convolution outputs; only index kContext (the
    # middle of the kContext * 2 + 1 frames) is used below.
    P = Lambda(lambda inputs: tf.unstack(
        inputs, num=kContext * 2 + 1, axis=1, name='unstack'))(P)
    V = Concatenate(name='concatenate2', axis=-1)([sgn, idx])
    V = velvet(V)

    Y = Concatenate(name='concatenate3')([P[kContext], V])
    Y = convTensors(Y)
    Y = SAAF(break_points=25,
             break_range=0.2,
             magnitude=100,
             order=2,
             tied_feamap=True,
             kernel_initializer='random_normal',
             name='saaf_out_conv')(Y)

    # Upsample Z2 by the same factor the max pooling used, then multiply.
    M_ = UpSampling1D(size=win_length // 64, name='up_sampling_naive')(Z2)
    Y = Multiply(name='phase_unpool_multiplication')([Y, M_])

    # Dense stack applied to Y; its result is added back to Y below.
    Y_ = Dense(filters, activation='tanh', name='dense_in')(Y)
    Y_ = Dense(filters // 2, activation='tanh', name='dense_h1')(Y_)
    Y_ = Dense(filters // 2, activation='tanh', name='dense_h2')(Y_)
    Y_ = Dense(filters, activation='linear', name='dense_out')(Y_)
    Y_ = SAAF(break_points=25,
              break_range=0.2,
              magnitude=100,
              order=2,
              tied_feamap=True,
              kernel_initializer='random_normal',
              name='saaf_out')(Y_)

    Y = se_block_lstm(Y, filters, weight_decay=0., amplifying_ratio=16, idx=1)
    Y_ = se_block_lstm(Y_,
                       filters,
                       weight_decay=0.,
                       amplifying_ratio=16,
                       idx=2)

    Y = Add(name='addition')([Y, Y_])
    Y = deconv(Y)

    # The layer name 'waveform' is the key used in the loss dictionaries.
    Y = Lambda((Window), name='waveform')(Y)

    # Spectrogram of the output signal, used as the second loss target; the
    # layer name 'spec' is the key used in the loss dictionaries.
    loss_output = Spectrogram(n_dft=win_length,
                              n_hop=win_length,
                              input_shape=(1, win_length),
                              return_decibel_spectrogram=True,
                              power_spectrogram=2.0,
                              trainable_kernel=False,
                              name='spec')

    spec = Lambda((toPermuteDimensions), name='perm_spec')(Y)
    spec = loss_output(spec)

    model = Model(inputs=[x], outputs=[spec, Y])

    model.compile(loss={
        'spec': 'mse',
        'waveform': MAE_preEmphasis
    },
                  loss_weights={
                      'spec': 0.0001,
                      'waveform': 1.0
                  },
                  optimizer=Adam(lr=learning_rate))

    return model
def model_2(win_length, filters, kernel_size_1, learning_rate):
    """Build and compile the multi-frame (context) audio model.

    The network takes a centre frame plus 4 frames on either side
    (9 frames in total), runs one shared Conv1D over every frame, derives a
    smoothed, max-pooled envelope from the absolute conv output, passes the
    concatenated per-frame envelopes through three bidirectional LSTMs and a
    SAAF activation, up-samples the result and multiplies it with the conv
    output of the centre frame.  A Dense/SAAF/``se_block`` branch is added
    back onto that product before the ``Conv1D_tied`` layer (which is handed
    the front-end conv layer) and a ``Window`` lambda produce the waveform.

    Args:
        win_length: number of samples per frame; also used as ``n_dft`` and
            ``n_hop`` of the loss spectrogram and, divided by 64, as the
            pooling / up-sampling factor.
        filters: number of conv filters; also sets the LSTM and Dense widths.
        kernel_size_1: kernel size of the front-end conv (the smoothing conv
            uses twice this size).
        learning_rate: passed to ``Adam(lr=...)``.

    Returns:
        A compiled ``Model`` with outputs ``[spec, waveform]``: ``'spec'`` is
        trained with MSE (weight 0.0001) and ``'waveform'`` with
        ``MAE_preEmphasis`` (weight 1.0).
    """
    context = 4  # past and subsequent frames around the centre frame
    n_frames = 2 * context + 1
    pool_factor = win_length // 64

    def bi_lstm(units, activation, name):
        # All three recurrent layers differ only in width, activation, name.
        return Bidirectional(LSTM(units,
                                  activation=activation,
                                  stateful=False,
                                  return_sequences=True,
                                  dropout=0.1,
                                  recurrent_dropout=0.1),
                             merge_mode='concat',
                             name=name)

    # Both SAAF activations share the same hyper-parameters.
    saaf_kwargs = dict(break_points=25,
                       break_range=0.2,
                       magnitude=100,
                       order=2,
                       tied_feamap=True,
                       kernel_initializer='random_normal')

    frames_in = Input(shape=(n_frames, win_length, 1), name='input')

    # Layer objects are created in the same order as before so that Keras'
    # auto-generated names for the unnamed layers do not change.
    frontend_conv = Conv1D(filters,
                           kernel_size_1,
                           strides=1,
                           padding='same',
                           kernel_initializer='lecun_uniform',
                           input_shape=(win_length, 1))
    abs_act = Activation(K.abs)
    softplus_act = Activation('softplus')
    frame_pool = MaxPooling1D(pool_size=pool_factor)
    smoothing_conv = Conv1D_local(filters,
                                  kernel_size_1 * 2,
                                  strides=1,
                                  padding='same',
                                  kernel_initializer='lecun_uniform')
    rnn_in = bi_lstm(filters * 2, 'tanh', 'birnn_in')
    rnn_1 = bi_lstm(filters, 'tanh', 'birnn_1')
    rnn_2 = bi_lstm(filters // 2, 'linear', 'birnn_2')
    backend_deconv = Conv1D_tied(1,
                                 kernel_size_1,
                                 frontend_conv,
                                 padding='same',
                                 name='deconv')

    # Per-frame front end: conv -> |.| -> smoothing conv -> softplus -> pool.
    conv_frames = TimeDistributed(frontend_conv, name='conv')(frames_in)
    envelope = TimeDistributed(abs_act, name='conv_activation')(conv_frames)
    envelope = TimeDistributed(smoothing_conv, name='conv_smoothing')(envelope)
    envelope = TimeDistributed(softplus_act,
                               name='conv_smoothing_activation')(envelope)
    latent = TimeDistributed(frame_pool, name='max_pooling')(envelope)

    # Split the frame axis into a list of tensors and merge them again with
    # Concatenate (default axis) so the RNNs see every context frame at once.
    latent = Lambda(lambda inputs: tf.unstack(
        inputs, num=n_frames, axis=1, name='unstack2'))(latent)
    latent = Concatenate(name='concatenate')(latent)

    for rnn in (rnn_in, rnn_1, rnn_2):
        latent = rnn(latent)
    latent = SAAF(name='saaf_1', **saaf_kwargs)(latent)

    # Bring the latent back to frame resolution and gate the conv output of
    # the centre frame (index ``context``) with it.
    mask = UpSampling1D(size=pool_factor, name='up_sampling_naive')(latent)
    per_frame = Lambda(lambda inputs: tf.unstack(
        inputs, num=n_frames, axis=1, name='unstack'))(conv_frames)
    Y = Multiply(name='phase_unpool_multiplication')(
        [per_frame[context], mask])

    # Dense branch that is added back onto Y.
    dense_specs = (
        (filters, 'tanh', 'dense_in'),
        (filters // 2, 'tanh', 'dense_h1'),
        (filters // 2, 'tanh', 'dense_h2'),
        (filters, 'linear', 'dense_out'),
    )
    branch = Y
    for units, activation, layer_name in dense_specs:
        branch = Dense(units, activation=activation, name=layer_name)(branch)
    branch = SAAF(name='saaf_out', **saaf_kwargs)(branch)
    branch = se_block(branch,
                      filters,
                      weight_decay=0.,
                      amplifying_ratio=16,
                      idx=1)

    Y = Add(name='addition')([Y, branch])
    Y = backend_deconv(Y)
    Y = Lambda((Window), name='waveform')(Y)

    # Fixed (non-trainable) spectrogram of the output, used only as a second
    # loss target.
    loss_output = Spectrogram(n_dft=win_length,
                              n_hop=win_length,
                              input_shape=(1, win_length),
                              return_decibel_spectrogram=True,
                              power_spectrogram=2.0,
                              trainable_kernel=False,
                              name='spec')
    spec = Lambda((toPermuteDimensions), name='perm_spec')(Y)
    spec = loss_output(spec)

    model = Model(inputs=[frames_in], outputs=[spec, Y])

    losses = {'spec': 'mse', 'waveform': MAE_preEmphasis}
    weights = {'spec': 0.0001, 'waveform': 1.0}
    model.compile(loss=losses,
                  loss_weights=weights,
                  optimizer=Adam(lr=learning_rate))

    return model
示例#20
0
def add_mel_to_VGGish(content_weights_file_path_og, input_length, sr_hr,
                      n_mels, hoplength, nfft, fmin, fmax, power_melgram,
                      pooling='avg'):
    """Load a pre-trained VGGish-style conv stack and put a spectrogram
    front end in front of it.

    The function first builds the conv stack on a ``(96, 64, 1)`` input,
    loads its weights from ``content_weights_file_path_og``, and then
    re-applies every non-input layer of that model on top of a
    ``Spectrogram`` + ``Normalization2D`` front end that takes a
    ``(1, input_length)`` input.

    NOTE(review): despite the function name, the front end is a plain
    ``Spectrogram`` layer, not a mel-spectrogram, and ``sr_hr``, ``n_mels``,
    ``fmin``, ``fmax`` and ``power_melgram`` are accepted but never used.
    Confirm whether a ``Melspectrogram`` layer was intended.

    Args:
        content_weights_file_path_og: weights file passed to
            ``Model.load_weights`` for the conv stack.
        input_length: length of the second axis of the new model's input.
        sr_hr, n_mels, fmin, fmax, power_melgram: unused (see note above).
        hoplength: ``n_hop`` of the spectrogram layer.
        nfft: ``n_dft`` of the spectrogram layer.
        pooling: ``'avg'`` (default, the previously hard-coded behaviour)
            for global average pooling, ``'max'`` for global max pooling;
            any other value leaves the last pooled feature map as output,
            matching ``create_VGGish``.

    Returns:
        A ``Model`` mapping the ``(1, input_length)`` input to the output of
        the re-used conv stack.
    """
    NUM_FRAMES = 96  # Frames in input mel-spectrogram patch.
    NUM_BANDS = 64  # Frequency bands in input mel-spectrogram patch.

    X = Input(shape=(NUM_FRAMES, NUM_BANDS, 1), name='nob')
    x = X

    # Block 1
    x = Conv2D(64, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv1')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(x)

    # Block 2
    x = Conv2D(128, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(x)

    # Block 3
    x = Conv2D(256, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv3/conv3_1')(x)
    x = Conv2D(256, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv3/conv3_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(x)

    # Block 4
    x = Conv2D(512, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv4/conv4_1')(x)
    x = Conv2D(512, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv4/conv4_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(x)

    # The global pooling layers hold no weights, so the choice made here
    # does not affect what load_weights() expects to find in the file.
    if pooling == 'avg':
        x = GlobalAveragePooling2D()(x)
    elif pooling == 'max':
        x = GlobalMaxPooling2D()(x)
    model = Model(inputs=X, outputs=x)

    model.load_weights(content_weights_file_path_og)

    # New front end: raw signal -> decibel spectrogram -> normalisation.
    X = Input(shape=(1, input_length), name='input_1')
    x = X
    x = Spectrogram(n_dft=nfft,
                    n_hop=hoplength,
                    padding='same',
                    return_decibel_spectrogram=True,
                    trainable_kernel=False,
                    name='stft')(x)

    x = Normalization2D(str_axis='freq')(x)

    # Re-use the loaded layers (everything after the input layer at index 0)
    # by calling them, in order, on the new front end's output.
    for layer in model.layers[1:]:
        x = layer(x)
    return Model(inputs=X, outputs=x)
示例#21
0
def create_VGGish(input_length,
                  sr_hr,
                  n_mels,
                  hoplength,
                  nfft,
                  fmin,
                  fmax,
                  power_melgram,
                  pooling='avg'):
    """Wire up a VGGish-style conv stack behind a spectrogram front end.

    The ``(input_length, 1)`` input is reshaped to ``(1, input_length)``,
    turned into a decibel spectrogram, normalised with
    ``Normalization2D(str_axis='freq')`` and passed through four
    conv/max-pool blocks (64, 128, 256 and 512 filters; the last two blocks
    have two convolutions each).

    Args:
        input_length: number of samples in the input signal.
        hoplength: ``n_hop`` of the spectrogram layer.
        nfft: ``n_dft`` of the spectrogram layer.
        pooling: ``'avg'`` or ``'max'`` to finish with the corresponding
            global pooling layer; any other value applies no global pooling.
        sr_hr, n_mels, fmin, fmax, power_melgram: accepted but not used by
            this function.

    Returns:
        Tuple ``(input_tensor, output_tensor)``; no ``Model`` is built here.
    """
    # (filters, names of the conv layers in the block, name of its pool layer)
    conv_blocks = (
        (64, ('conv1',), 'pool1'),
        (128, ('conv2',), 'pool2'),
        (256, ('conv3/conv3_1', 'conv3/conv3_2'), 'pool3'),
        (512, ('conv4/conv4_1', 'conv4/conv4_2'), 'pool4'),
    )

    signal_in = Input(shape=(input_length, 1), name='input_1')

    features = Reshape((1, input_length))(signal_in)
    features = Spectrogram(n_dft=nfft,
                           n_hop=hoplength,
                           padding='same',
                           return_decibel_spectrogram=True,
                           trainable_kernel=False,
                           name='stft')(features)
    features = Normalization2D(str_axis='freq')(features)

    for n_filters, conv_names, pool_name in conv_blocks:
        for conv_name in conv_names:
            features = Conv2D(n_filters, (3, 3),
                              strides=(1, 1),
                              activation='relu',
                              padding='same',
                              name=conv_name)(features)
        features = MaxPooling2D((2, 2),
                                strides=(2, 2),
                                padding='same',
                                name=pool_name)(features)

    if pooling == 'avg':
        features = GlobalAveragePooling2D()(features)
    elif pooling == 'max':
        features = GlobalMaxPooling2D()(features)

    return signal_in, features
示例#22
0
# Script fragment: render a few test samples as red/blue spectrogram images.

# Despite its name this is a length in samples: five times `sampling_rate`
# (`sampling_rate` is defined outside this fragment — presumably in Hz, which
# would make this 5 s of audio; confirm).
seconds = sampling_rate * 5
folder = '/data/p253591/youtube_classification/data2/'

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only use it on data from a trusted source.
with open(f'{folder}/test_data.p3', 'rb') as f:
    X_test, y_test = pickle.load(f)

# Affine rescaling applied to the decibel spectrogram: x / 40 + 1.
# Both coefficients are stored as backend variables.
slope = K.variable(value=1 / 40)
intercept = K.variable(value=1)

# Two-layer model: fixed (non-trainable) power spectrogram in decibels,
# followed by the rescaling above.
spectogram_model = Sequential()
spectogram_model.add(
    Spectrogram(n_dft=512,
                n_hop=256,
                input_shape=(1, seconds),
                return_decibel_spectrogram=True,
                power_spectrogram=2.0,
                trainable_kernel=False,
                name='static_stft'))
spectogram_model.add(Lambda(lambda x: slope * x + intercept))

# Pick (up to) four random test indices and save one image per sample.
for sample in numpy.random.permutation(len(X_test))[:4]:
    # Slice instead of index so the sample keeps its leading batch axis.
    y_out = spectogram_model.predict(X_test[sample:sample + 1])

    # Shift/scale the output by (x + 1) / 2, then blend colours: the red
    # channel gets `im`, the blue channel gets `1 - im`, green stays 0.
    # Assumes y_out[0] has a trailing axis of size 1 so the broadcast against
    # the 1x1x3 colour lists yields an RGB image — TODO confirm.
    im = (y_out[0] + 1) / 2.0
    im = im * [[[1, 0, 0]]] + (1 - im) * [[[0, 0, 1]]]
    imsave(f'spectrogram-{sample}-label-{y_test[sample]}.png', im)

model = load_model(f'{folder}/model-2020-01-21-epoch-21.hd5',
                   custom_objects={
                       'Spectrogram': Spectrogram,
示例#23
0
def construct_cnn_L3_orig_audio_model():
    """
    Build the audio subnetwork modelled on "Look, Listen and Learn".

    Relja Arandjelovic and Andrew Zisserman (2017). Look, Listen and Learn.
    CoRR, abs/1705.08168.

    The input is a ``(1, 48000)`` float32 tensor.  It is converted to a
    magnitude spectrogram (n_dft=512, n_hop=242, 'valid' padding),
    log-compressed as ``log(max(x, 1e-12)) / 5`` and passed through four
    blocks of two 3x3 conv + batch-norm + ReLU layers, each followed by max
    pooling; the result is flattened.  All convolutions use 'he_normal'
    initialisation and L2 weight decay of 1e-5.

    Returns
    -------
    model:  L3 CNN model, named 'audio_model'
            (Type: keras.models.Model)
    inputs: Model input tensor
    outputs: Model output tensor (flattened embedding)
    """
    weight_decay = 1e-5
    n_dft = 512
    n_hop = 242
    asr = 48000
    audio_window_dur = 1

    def conv_bn_relu(tensor, n_filters, **conv_kwargs):
        # One 3x3 'same' convolution followed by batch-norm and ReLU.
        tensor = Conv2D(n_filters, (3, 3),
                        padding='same',
                        kernel_initializer='he_normal',
                        kernel_regularizer=regularizers.l2(weight_decay),
                        **conv_kwargs)(tensor)
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    # INPUT
    x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32')

    # SPECTROGRAM PREPROCESSING (original author's size note: 257 x 199 x 1)
    y_a = Spectrogram(n_dft=n_dft,
                      n_hop=n_hop,
                      power_spectrogram=1.0,
                      return_decibel_spectrogram=False,
                      padding='valid')(x_a)

    # Apply normalization from L3 paper
    y_a = Lambda(lambda x: tf.log(tf.maximum(x, 1e-12)) / 5.0)(y_a)

    # CONV BLOCKS 1-3: two conv units, then 2x2 max pooling with stride 2.
    for n_filters in (64, 128, 256):
        y_a = conv_bn_relu(y_a, n_filters)
        y_a = conv_bn_relu(y_a, n_filters)
        y_a = MaxPooling2D(pool_size=(2, 2), strides=2)(y_a)

    # CONV BLOCK 4: the second convolution is the named embedding layer and
    # the pooling window is (32, 24) with no explicit stride.
    y_a = conv_bn_relu(y_a, 512)
    y_a = conv_bn_relu(y_a, 512, name='audio_embedding_layer')
    y_a = MaxPooling2D(pool_size=(32, 24))(y_a)

    y_a = Flatten()(y_a)

    m = Model(inputs=x_a, outputs=y_a)
    m.name = 'audio_model'

    return m, x_a, y_a
def CNN_model(input_shape, dropout=0.5, print_summary=False):
    """Build and compile a small two-class CNN that operates on an STFT
    computed inside the model.

    Layout: fixed power spectrogram (n_dft=128, n_hop=16, not in decibels)
    -> per-frequency normalisation -> three conv / batch-norm / ReLU /
    2x2 max-pool blocks (24, 48 and 96 filters with 12x12, 8x8 and 4x4
    kernels) -> dropout -> flatten -> Dense(2) -> softmax.

    Args:
        input_shape: ``input_shape`` handed to the ``Spectrogram`` layer.
        dropout: rate of the single ``Dropout`` layer after the last block.
        print_summary: when true, print the Keras model summary.

    Returns:
        The ``Sequential`` model, compiled with categorical cross-entropy,
        the 'adam' optimizer and an accuracy metric.
    """
    # basis of the CNN_STFT is a Sequential network
    model = Sequential()

    # spectrogram creation using STFT
    model.add(
        Spectrogram(n_dft=128,
                    n_hop=16,
                    input_shape=input_shape,
                    return_decibel_spectrogram=False,
                    power_spectrogram=2.0,
                    trainable_kernel=False,
                    name='static_stft'))
    model.add(Normalization2D(str_axis='freq'))

    # NOTE(review): the pooling layers are pinned to 'channels_last' while
    # BatchNormalization normalises axis=1; confirm that axis 1 is the
    # intended feature axis for this data layout.

    # Conv Block 1
    # `padding` is the Keras 2 spelling of the old `border_mode` argument,
    # which is only accepted through a deprecated compatibility shim.
    model.add(
        Conv2D(filters=24,
               kernel_size=(12, 12),
               strides=(1, 1),
               name='conv1',
               padding='same'))
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    model.add(
        MaxPooling2D(pool_size=(2, 2),
                     strides=(2, 2),
                     padding='valid',
                     data_format='channels_last'))

    # Conv Block 2
    model.add(
        Conv2D(filters=48,
               kernel_size=(8, 8),
               name='conv2',
               padding='same'))
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    model.add(
        MaxPooling2D(pool_size=(2, 2),
                     strides=(2, 2),
                     padding='valid',
                     data_format='channels_last'))

    # Conv Block 3
    model.add(
        Conv2D(filters=96,
               kernel_size=(4, 4),
               name='conv3',
               padding='same'))
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    model.add(
        MaxPooling2D(pool_size=(2, 2),
                     strides=(2, 2),
                     padding='valid',
                     data_format='channels_last'))
    model.add(Dropout(dropout))

    # classifier head
    model.add(Flatten())
    model.add(Dense(2))  # two classes only
    model.add(Activation('softmax'))

    if print_summary:
        # summary() prints by itself and returns None, so wrapping it in
        # print() would emit an extra "None" line.
        model.summary()

    # compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model
示例#25
0
文件: models.py 项目: wangyu/openl3
def _construct_linear_audio_network():
    """
    Build the audio network that works on a linear-frequency spectrogram
    (n_dft=512; the original author notes 257 frequency bins).

    The ``(1, 48000)`` float32 input is turned into a decibel magnitude
    spectrogram (n_hop=242, 'valid' padding) and batch-normalised.  Three
    blocks of two 3x3 conv + batch-norm + ReLU units, each followed by 2x2
    max pooling, lead into a final block whose second convolution,
    'audio_embedding_layer', is the model output: no batch-norm, activation
    or pooling is applied after it.  No weights are loaded here.

    Returns
    -------
    model : keras.models.Model
        Model object.
    """
    weight_decay = 1e-5
    sample_rate = 48000
    window_seconds = 1

    def conv3x3(n_filters, **extra):
        # Every convolution in the network shares these settings.
        return Conv2D(n_filters, (3, 3),
                      padding='same',
                      kernel_initializer='he_normal',
                      kernel_regularizer=regularizers.l2(weight_decay),
                      **extra)

    # INPUT
    x_a = Input(shape=(1, sample_rate * window_seconds), dtype='float32')

    # SPECTROGRAM PREPROCESSING
    y_a = Spectrogram(n_dft=512,
                      n_hop=242,
                      power_spectrogram=1.0,
                      return_decibel_spectrogram=True,
                      padding='valid')(x_a)
    y_a = BatchNormalization()(y_a)

    # CONV BLOCKS 1-3
    for n_filters in (64, 128, 256):
        for _ in range(2):
            y_a = conv3x3(n_filters)(y_a)
            y_a = BatchNormalization()(y_a)
            y_a = Activation('relu')(y_a)
        y_a = MaxPooling2D(pool_size=(2, 2), strides=2)(y_a)

    # CONV BLOCK 4: ends on the raw embedding convolution.
    y_a = conv3x3(512)(y_a)
    y_a = BatchNormalization()(y_a)
    y_a = Activation('relu')(y_a)
    y_a = conv3x3(512, name='audio_embedding_layer')(y_a)

    return Model(inputs=x_a, outputs=y_a)
示例#26
0
# Pooling schedule used with the STFT front end.
_STFT_POOLINGS = ((2, 4), (4, 4), (4, 5), (2, 4), (4, 4))

# Pooling schedules for the mel front end as (minimum n_mels, schedule),
# largest threshold first; the first matching row wins.  The first elements
# of every schedule multiply to its threshold (e.g. 2*4*4*2*4 = 256),
# presumably so that the mel axis is pooled down to a single bin — confirm
# against get_convBNeluMPdrop.
_MELGRAM_POOLINGS = (
    (256, ((2, 4), (4, 4), (4, 5), (2, 4), (4, 4))),
    (128, ((2, 4), (4, 4), (2, 5), (2, 4), (4, 4))),
    (96, ((2, 4), (3, 4), (2, 5), (2, 4), (4, 4))),
    (72, ((2, 4), (3, 4), (2, 5), (2, 4), (3, 4))),
    (64, ((2, 4), (2, 4), (2, 5), (2, 4), (4, 4))),
    (48, ((2, 4), (2, 4), (2, 5), (2, 4), (3, 4))),
    (32, ((2, 4), (2, 4), (2, 5), (2, 4), (2, 4))),
    (24, ((2, 4), (2, 4), (2, 5), (3, 4), (1, 4))),
    (18, ((2, 4), (1, 4), (3, 5), (1, 4), (3, 4))),
    (16, ((2, 4), (2, 4), (2, 5), (2, 4), (1, 4))),
    (12, ((2, 4), (1, 4), (2, 5), (3, 4), (1, 4))),
    (8, ((2, 4), (1, 4), (2, 5), (2, 4), (1, 4))),
    (6, ((2, 4), (1, 4), (3, 5), (1, 4), (1, 4))),
    (4, ((2, 4), (1, 4), (2, 5), (1, 4), (1, 4))),
    (2, ((2, 4), (1, 4), (1, 5), (1, 4), (1, 4))),
)

# Used for anything below 2 mel bands (the original comment says n_mels == 1).
_MELGRAM_POOLINGS_FALLBACK = ((1, 4), (1, 4), (1, 5), (1, 4), (1, 4))


def _melgram_poolings(n_mels):
    """Return the pooling schedule for ``n_mels`` mel bands as a new list."""
    for min_mels, schedule in _MELGRAM_POOLINGS:
        if n_mels >= min_mels:
            return list(schedule)
    return list(_MELGRAM_POOLINGS_FALLBACK)


def raw_vgg(args,
            input_length=12000 * 29,
            tf='melgram',
            normalize=None,
            decibel=False,
            last_layer=True,
            sr=None):
    """Build a Sequential conv net on raw audio with a time-frequency front
    end computed inside the model.

    Original author's note: when length = 12000*29 and 512/256 dft/hop, the
    melgram size is (n_mels, 1360).

    Args:
        args: options object; ``conv_until`` and ``trainable_kernel`` are
            always read, and ``fmin``, ``fmax``, ``n_mels`` and
            ``trainable_fb`` are read when ``tf == 'melgram'``.
        input_length: number of samples; the front end takes
            ``(1, input_length)`` input.
        tf: front end to use, ``'stft'`` or ``'melgram'``.  (The name shadows
            any module-level ``tf`` inside this function.)
        normalize: one of 'batch', 'data_sample', 'time', 'freq', 'channel'
            to add a ``Normalization2D`` layer after the front end; the
            other accepted values (None, False, 'no', 0, 0.0) add none.
        decibel: whether the front end returns decibel values; must be bool.
        last_layer: when true, append ``Dense(50, activation='sigmoid')``.
        sr: sampling rate; defaults to the module-level ``SR``.

    Returns:
        The (uncompiled) ``Sequential`` model.

    Raises:
        AssertionError: if ``tf``, ``normalize`` or ``decibel`` is invalid.
        RuntimeError: if ``tf`` is invalid and assertions are disabled.
    """
    assert tf in ('stft', 'melgram')
    assert normalize in (None, False, 'no', 0, 0.0, 'batch', 'data_sample',
                         'time', 'freq', 'channel')
    assert isinstance(decibel, bool)

    if sr is None:
        sr = SR  # assumes 12000
    conv_until = args.conv_until  # for intermediate layer outputting.
    trainable_kernel = args.trainable_kernel
    model = Sequential()
    if tf == 'stft':
        model.add(
            Spectrogram(n_dft=512,
                        n_hop=256,
                        power_spectrogram=2.0,
                        trainable_kernel=trainable_kernel,
                        return_decibel_spectrogram=decibel,
                        input_shape=(1, input_length)))
        poolings = list(_STFT_POOLINGS)
    elif tf == 'melgram':
        # decode args
        fmin = args.fmin
        fmax = args.fmax
        if fmax == 0.0:
            # fmax of 0.0 is replaced by half the sampling rate.
            fmax = sr / 2
        n_mels = args.n_mels
        trainable_fb = args.trainable_fb
        model.add(
            Melspectrogram(n_dft=512,
                           n_hop=256,
                           power_melgram=2.0,
                           input_shape=(1, input_length),
                           trainable_kernel=trainable_kernel,
                           trainable_fb=trainable_fb,
                           return_decibel_melgram=decibel,
                           sr=sr,
                           n_mels=n_mels,
                           fmin=fmin,
                           fmax=fmax,
                           name='melgram'))
        poolings = _melgram_poolings(n_mels)
    else:
        # Only reachable when the assertions above are stripped (python -O).
        raise RuntimeError('choose between stft or melgram, not %s' % str(tf))
    if normalize in ('batch', 'data_sample', 'time', 'freq', 'channel'):
        model.add(Normalization2D(normalize))
    # Positional arguments for get_convBNeluMPdrop, kept in a separate name
    # so the `args` options object is not shadowed.  Their meaning is
    # defined by that helper, which lives outside this block.
    conv_args = [
        5, [32, 32, 32, 32, 32], 1.0, [(3, 3), (3, 3), (3, 3), (3, 3), (3, 3)],
        poolings, 0.0, model.output_shape[1:]
    ]
    model.add(
        get_convBNeluMPdrop(*conv_args,
                            num_nin_layers=1,
                            conv_until=conv_until))
    if conv_until != 4:
        model.add(GlobalAveragePooling2D())
    else:
        model.add(Flatten())
    if last_layer:
        model.add(Dense(50, activation='sigmoid'))
    return model