Example #1
def raw_vgg(args,
            input_length=12000 * 29,
            tf='melgram',
            normalize=None,
            decibel=False,
            last_layer=True,
            sr=None):
    '''When input_length = 12000 * 29 with a 512/256 dft/hop,
    the melgram size is (n_mels, 1360).
    '''
    assert tf in ('stft', 'melgram')
    assert normalize in (None, False, 'no', 0, 0.0, 'batch', 'data_sample',
                         'time', 'freq', 'channel')
    assert isinstance(decibel, bool)

    if sr is None:
        sr = SR  # assumes 12000

    conv_until = args.conv_until
    trainable_kernel = args.trainable_kernel
    model = Sequential()
    # decode args
    fmin = args.fmin
    fmax = args.fmax
    if fmax == 0.0:
        fmax = sr / 2
    n_mels = args.n_mels
    trainable_fb = args.trainable_fb
    model.add(
        Melspectrogram(n_dft=512,
                       n_hop=256,
                       power_melgram=2.0,
                       input_shape=(1, input_length),
                       trainable_kernel=trainable_kernel,
                       trainable_fb=trainable_fb,
                       return_decibel_melgram=decibel,
                       sr=sr,
                       n_mels=n_mels,
                       fmin=fmin,
                       fmax=fmax,
                       name='melgram'))

    poolings = [(2, 4), (3, 4), (2, 5), (2, 4), (4, 4)]

    if normalize in ('batch', 'data_sample', 'time', 'freq', 'channel'):
        model.add(Normalization2D(normalize))
    model.add(
        get_convBNeluMPdrop(5, [32, 32, 32, 32, 32], [(3, 3), (3, 3), (3, 3),
                                                      (3, 3), (3, 3)],
                            poolings,
                            model.output_shape[1:],
                            conv_until=conv_until))
    if conv_until != 4:
        model.add(GlobalAveragePooling2D())
    else:
        model.add(Flatten())

    if last_layer:
        model.add(Dense(1, activation='linear'))
    return model
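A minimal usage sketch for raw_vgg (hypothetical values; the args namespace
below carries exactly the attributes the function reads, and SR plus
get_convBNeluMPdrop are assumed to be defined in the surrounding module):

from argparse import Namespace

args = Namespace(conv_until=4,           # index of the last conv block to keep
                 trainable_kernel=False,
                 trainable_fb=False,
                 fmin=0.0,
                 fmax=0.0,               # 0.0 falls back to sr / 2
                 n_mels=96)
model = raw_vgg(args, input_length=12000 * 29, tf='melgram', decibel=True)
model.summary()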
Example #2
    def create_model(self, input_shape, nb_classes, n_filters, dropout,
                     **kwargs):
        model = Sequential()

        dropout = list(map(float, dropout.split(",")))
        if len(dropout) == 1:
            dropout = dropout * 4
        if len(dropout) != 4:
            raise Exception("Unexpected length of dropouts:{0}".format(
                len(dropout)))
        layers = [
            Normalization2D(str_axis='batch', input_shape=input_shape),
            Conv2D(filters=n_filters, kernel_size=(9, 9), activation='relu'),
            MaxPool2D((2, 2), strides=(2, 2)),
            Dropout(dropout[0]),
            Conv2D(filters=n_filters, kernel_size=(6, 6), activation='relu'),
            Dropout(dropout[1]),
            Conv2D(filters=n_filters, kernel_size=(3, 3), activation='relu'),
            MaxPool2D((1, 2), strides=(1, 2)),
            Dropout(dropout[1]),
            Conv2D(filters=n_filters, kernel_size=(3, 3), activation='relu'),
            Dropout(dropout[1]),
            Flatten(),
            Dense(1000, activation='relu'),
            Dropout(dropout[2]),
            Dense(1000, activation='relu'),
            Dropout(dropout[3]),
            Dense(nb_classes, activation='sigmoid')
        ]
        add_regularization(layers, kwargs)
        for l in layers:
            model.add(l)
        return model
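The dropout argument is a comma-separated string: a single value is repeated
for all four slots, otherwise exactly four values are required. A hedged call
sketch (builder stands for an instance of the enclosing class, which the
snippet does not show):

# one value, broadcast to all dropout slots
model = builder.create_model(input_shape=(1, 16000), nb_classes=10,
                             n_filters=32, dropout="0.25")
# four values, one per slot
model = builder.create_model(input_shape=(1, 16000), nb_classes=10,
                             n_filters=32, dropout="0.1,0.2,0.3,0.4")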
Example #3
def Conv2D(N_CLASSES=10, SR=16000, DT=1.0):
    i = layers.Input(shape=(1, int(SR*DT)), name='input')
    x = Melspectrogram(n_dft=512, n_hop=160,
                       padding='same', sr=SR, n_mels=128,
                       fmin=0.0, fmax=SR/2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False,
                       name='melbands')(i)
    x = Normalization2D(str_axis='batch', name='batch_norm')(x)
    x = layers.Conv2D(8, kernel_size=(7,7), activation='tanh', padding='same', name='conv2d_tanh')(x)
    x = layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_1')(x)
    x = layers.Conv2D(16, kernel_size=(5,5), activation='relu', padding='same', name='conv2d_relu_1')(x)
    x = layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_2')(x)
    x = layers.Conv2D(16, kernel_size=(3,3), activation='relu', padding='same', name='conv2d_relu_2')(x)
    x = layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_3')(x)
    x = layers.Conv2D(32, kernel_size=(3,3), activation='relu', padding='same', name='conv2d_relu_3')(x)
    x = layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_4')(x)
    x = layers.Conv2D(32, kernel_size=(3,3), activation='relu', padding='same', name='conv2d_relu_4')(x)
    x = layers.Flatten(name='flatten')(x)
    x = layers.Dropout(rate=0.2, name='dropout')(x)
    x = layers.Dense(64, activation='relu', activity_regularizer=l2(0.001), name='dense')(x)
    o = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(x)

    model = Model(inputs=i, outputs=o, name='2d_convolution')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model
Example #4
def LSTM(N_CLASSES=10, SR=16000, DT=1.0):
    i = layers.Input(shape=(1, int(SR*DT)), name='input')
    x = Melspectrogram(n_dft=512, n_hop=160,
                       padding='same', sr=SR, n_mels=128,
                       fmin=0.0, fmax=SR/2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False,
                       name='melbands')(i)
    x = Normalization2D(str_axis='batch', name='batch_norm')(x)
    x = layers.Permute((2,1,3), name='permute')(x)
    x = TimeDistributed(layers.Reshape((-1,)), name='reshape')(x)
    s = TimeDistributed(layers.Dense(64, activation='tanh'),
                        name='td_dense_tanh')(x)
    x = layers.Bidirectional(layers.LSTM(32, return_sequences=True),
                             name='bidirectional_lstm')(s)
    x = layers.concatenate([s, x], axis=2, name='skip_connection')
    x = layers.Dense(64, activation='relu', name='dense_1_relu')(x)
    x = layers.MaxPooling1D(name='max_pool_1d')(x)
    x = layers.Dense(32, activation='relu', name='dense_2_relu')(x)
    x = layers.Flatten(name='flatten')(x)
    x = layers.Dropout(rate=0.2, name='dropout')(x)
    x = layers.Dense(32, activation='relu',
                         activity_regularizer=l2(0.001),
                         name='dense_3_relu')(x)
    o = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(x)

    model = Model(inputs=i, outputs=o, name='long_short_term_memory')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model
Example #5
def Conv1D(N_CLASSES=3, SR=5000, DT=0.25):
    i = layers.Input(shape=(1, int(SR*DT)), name='input')
    x = Melspectrogram(n_dft=512, n_hop=160,
                       padding='same', sr=SR, n_mels=128,
                       fmin=0.0, fmax=SR/2, power_melgram=2.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False,
                       name='melbands')(i)
    x = Normalization2D(str_axis='batch', name='batch_norm')(x)
    x = layers.Permute((2,1,3), name='permute')(x)
    x = TimeDistributed(layers.Conv1D(8, kernel_size=(4), activation='tanh'), name='td_conv_1d_tanh')(x)
    x = layers.MaxPooling2D(pool_size=(2,2), name='max_pool_2d_1')(x)
    x = TimeDistributed(layers.Conv1D(16, kernel_size=(4), activation='relu'), name='td_conv_1d_relu_1')(x)
    x = layers.MaxPooling2D(pool_size=(2,2), name='max_pool_2d_2')(x)
    x = TimeDistributed(layers.Conv1D(32, kernel_size=(4), activation='relu'), name='td_conv_1d_relu_2')(x)
    x = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(x)
    x = layers.Dropout(rate=0.1, name='dropout')(x)
    x = layers.Dense(64, activation='relu', activity_regularizer=l2(0.001), name='dense')(x)
    o = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(x)

    model = Model(inputs=i, outputs=o, name='1d_convolution')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model
Example #6
def AttRNNSpeechModel(nCategories, samplingrate=16000,
                      inputLength=16000, rnn_func=L.LSTM):
    # simple LSTM
    sr = samplingrate
    iLen = inputLength

    inputs = L.Input((inputLength,), name='input')

    x = L.Reshape((1, -1))(inputs)

    m = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, iLen),
                       padding='same', sr=sr, n_mels=80,
                       fmin=40.0, fmax=sr / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')
    m.trainable = False

    x = m(x)

    x = Normalization2D(int_axis=0, name='mel_stft_norm')(x)

    # note that Melspectrogram puts the sequence in shape (batch_size, melDim, timeSteps, 1)
    # we would rather have it the other way around for LSTMs

    x = L.Permute((2, 1, 3))(x)

    x = L.Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)
    x = L.Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)

    # x = Reshape((125, 80)) (x)
    # keras.backend.squeeze(x, axis)
    x = L.Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(x)

    x = L.Bidirectional(rnn_func(64, return_sequences=True)
                        )(x)  # [b_s, seq_len, vec_dim]
    x = L.Bidirectional(rnn_func(64, return_sequences=True)
                        )(x)  # [b_s, seq_len, vec_dim]

    xFirst = L.Lambda(lambda q: q[:, -1])(x)  # [b_s, vec_dim]
    query = L.Dense(128)(xFirst)

    # dot product attention
    attScores = L.Dot(axes=[1, 2])([query, x])
    attScores = L.Softmax(name='attSoftmax')(attScores)  # [b_s, seq_len]

    # rescale sequence
    attVector = L.Dot(axes=[1, 1])([attScores, x])  # [b_s, vec_dim]

    x = L.Dense(64, activation='relu')(attVector)
    x = L.Dense(32)(x)

    output = L.Dense(nCategories, activation='softmax', name='output')(x)

    model = Model(inputs=[inputs], outputs=[output])

    return model
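AttRNNSpeechModel returns an uncompiled model. A hedged training sketch (the
35-category speech-commands setup and the x_train/y_train arrays are
assumptions, not part of the snippet):

model = AttRNNSpeechModel(nCategories=35)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])
# x_train: (num_samples, 16000) raw waveforms, y_train: integer labels
model.fit(x_train, y_train, batch_size=64, epochs=10)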
Example #7
    def __spec_model(self, input_shape, decibel_gram):
        model = Sequential()
        model.add(Spectrogram(
            return_decibel_spectrogram=decibel_gram,
            input_shape=input_shape
        ))
        model.add(Normalization2D(str_axis='freq'))
        return model
Example #8
def build_model_vggish(classes,
                       dropout_final=0.2,
                       shape=(None, 320000),
                       sr=16000,
                       rnn_type='gru',
                       rnn_units=256,
                       focal_alpha=0.95,
                       rnn_layers=1,
                       rnn_dropout=0.2,
                       activation='elu',
                       random_noise=0.2,
                       weights='soundnet'):

    inputs = keras.Input(shape=shape[1:])
    x = keras.layers.Reshape(target_shape=(1, -1))(inputs)
    x = Melspectrogram(n_dft=512,
                       n_hop=256,
                       padding='same',
                       sr=sr,
                       n_mels=64,
                       fmin=125,
                       fmax=7500,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       name='trainable_stft')(x)
    if random_noise:
        x = AdditiveNoise(power=random_noise, random_gain=True)(x)
    x = Normalization2D(str_axis='freq')(x)
    x = Lambda(lambda x: K.permute_dimensions(x=x, pattern=(0, 2, 1, 3)),
               name="transpose")(x)

    vggish = VGGish(include_top=False,
                    load_weights=weights,
                    input_shape=x.get_shape().as_list()[1:],
                    pooling=None)
    if weights is not None:  # only freeze when using pretrained layers
        for layer in vggish.layers:
            layer.trainable = False
    x = vggish(x)
    x = keras.layers.AveragePooling2D(pool_size=(1, 4))(x)
    x = keras.layers.Reshape(target_shape=(-1, 512))(x)

    outputs = rnn_classifier_branch(x,
                                    name='rnn',
                                    dropout=rnn_dropout,
                                    dropout_final=dropout_final,
                                    rnn_units=rnn_units,
                                    rnn_type=rnn_type,
                                    n_classes=len(classes),
                                    rnn_layers=rnn_layers)

    model = keras.Model(inputs=inputs, outputs=outputs, name='crnn')

    model.summary()

    return model, vggish
Example #9
def stft_model(audio_len, normalize=True, **kwargs):
    """Build an STFT preprocessing model.
    
    Pass normalize=False to disable the normalization layer.
    Pass arguments to https://github.com/keunwoochoi/kapre/blob/master/kapre/time_frequency.py#L11."""
    return Sequential([
        Spectrogram(input_shape=(1, audio_len), **kwargs),
    ] + ([
        Normalization2D(str_axis='freq'),
    ] if normalize else []))
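Since stft_model contains no trainable head, it can be used directly as a
feature extractor via predict. A minimal sketch, assuming one-second clips at
16 kHz and kapre's default channels_last output:

import numpy as np

model = stft_model(audio_len=16000, n_dft=512, n_hop=256)
batch = np.random.randn(8, 1, 16000).astype('float32')  # (batch, 1, samples)
specs = model.predict(batch)  # roughly (batch, n_freqs, n_frames, 1)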
Example #10
def ConvSpeechModel(nCategories, samplingrate=16000, inputLength=16000):
    """
    Base fully convolutional model for speech recognition
    """

    inputs = Input((inputLength, ))

    x = Reshape((1, -1))(inputs)

    x = Melspectrogram(n_dft=1024,
                       n_hop=128,
                       input_shape=(1, inputLength),
                       padding='same',
                       sr=samplingrate,
                       n_mels=80,
                       fmin=40.0,
                       fmax=samplingrate / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')(x)

    x = Normalization2D(int_axis=0)(x)
    # note that Melspectrogram puts the sequence in shape (batch_size, melDim, timeSteps, 1);
    # we would rather have it the other way around for LSTMs

    x = Permute((2, 1, 3))(x)
    # x = Reshape((94, 80))(x)  # now we have (batch_size, sequence, vec_dim)

    c1 = Conv2D(20, (5, 1), activation='relu', padding='same')(x)
    c1 = BatchNormalization()(c1)
    p1 = MaxPooling2D((2, 1))(c1)
    p1 = Dropout(0.03)(p1)

    c2 = Conv2D(40, (3, 3), activation='relu', padding='same')(p1)
    c2 = BatchNormalization()(c2)
    p2 = MaxPooling2D((2, 2))(c2)
    p2 = Dropout(0.01)(p2)

    c3 = Conv2D(80, (3, 3), activation='relu', padding='same')(p2)
    c3 = BatchNormalization()(c3)
    p3 = MaxPooling2D((2, 2))(c3)

    p3 = Flatten()(p3)
    p3 = Dense(64, activation='relu')(p3)
    p3 = Dense(32, activation='relu')(p3)

    output = Dense(nCategories, activation='softmax')(p3)
    #output = Dense(nCategories, activation = 'softmax')(p1)

    model = Model(inputs=[inputs], outputs=[output], name='ConvSpeechModel')

    return model
Example #11
def attention_speech_model(num_category,
                           sampling_rate=16000,
                           input_length=16000):

    inputs = layers.Input((input_length, ), name='input')
    x = layers.Reshape((1, -1))(inputs)

    m = Melspectrogram(input_shape=(1, input_length),
                       n_dft=1024,
                       n_hop=128,
                       padding='same',
                       sr=sampling_rate,
                       n_mels=80,
                       fmin=40.0,
                       fmax=sampling_rate / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')
    m.trainable = False
    x = m(x)

    x = Normalization2D(int_axis=0, name='norm')(x)
    x = layers.Permute((2, 1, 3))(x)

    # no inline activation: the LeakyReLU below acts as the activation
    # (relu followed by LeakyReLU would make the LeakyReLU a no-op)
    x = layers.Conv2D(10, (5, 1), padding='same')(x)
    x = layers.LeakyReLU()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Lambda(lambda t: K.squeeze(t, -1), name='squeeze_last_dim')(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)

    x_first = layers.Lambda(lambda t: t[:, t.shape[1] // 2])(x)
    query = layers.Dense(128)(x_first)

    attention_scores = layers.Dot(axes=[1, 2])([query, x])
    attention_scores = layers.Softmax(
        name='attention_softmax')(attention_scores)
    attention_vector = layers.Dot(axes=[1, 1])([attention_scores, x])

    x = layers.Dense(64)(attention_vector)
    x = layers.LeakyReLU()(x)
    x = layers.Dropout(0.5)(x)

    x = layers.Dense(32)(x)
    x = layers.Dropout(0.5)(x)

    out = layers.Dense(num_category, activation='softmax', name="output")(x)
    model = Model(inputs=inputs, outputs=out)
    return model
Example #12
def Listen(input_length, parametres_melspectgtom, parametres_CNN,
           parametres_BRNN):
    '''
    parametres_melspectgtom: parameters of the Melspectrogram layer; if
        length = 0, the audio inputs are already MFCCs
    parametres_BRNN: a list containing the cell parameters for each
        bidirectional layer, so number_layers is len(parametres_BRNN)
    parametres_CNN: parameters used to build the CNN network after the inputs
    input_length: the length of the input audio
    '''
    number_layers = len(parametres_BRNN)
    encoder_inputs = L.Input(shape=(input_length, ))

    if not parametres_melspectgtom["mfccs"]:
        # MELSPECTROGRAM layer (encoder_inputs was already created above)
        encoder = L.Reshape((1, -1))(encoder_inputs)
        m = Build_MelSpectrogram(parametres_melspectgtom, input_length)
        encoder = m(encoder)
        encoder = Normalization2D(name='mel_stft_norm',
                                  str_axis='freq')(encoder)
        # note that Melspectrogram puts the sequence in shape (batch_size, melDim, timeSteps, 1)
        # we would rather have it the other way around for LSTMs (batch_size,timeSteps,melDim,1)
        encoder = L.Permute((2, 1, 3))(encoder)
        encoder = BuildCNN(parametres_CNN, encoder)
        encoder = L.Lambda(lambda q: K.squeeze(q, -1),
                           name='squeeze_last_dim')(encoder)

    else:
        encoder_inputs = L.Input(shape=(517, 13, 1))
        encoder = BuildCNN(parametres_CNN, encoder_inputs)
        encoder = L.Lambda(lambda q: K.squeeze(q, -1),
                           name='squeeze_last_dim')(encoder)

        #dans le cas ou nous avons des mfcc
    inputs = encoder

    encoder_state_fbw = None
    for parametre in parametres_BRNN:
        print(parametre)
        bltsm_layer = Build_Bidiractionnel_layer(parametre)
        encoder_outputs, forward_h, forward_c, backward_h, backward_c = bltsm_layer(
            inputs, initial_state=encoder_state_fbw)
        state_h = L.Concatenate()([forward_h, backward_h])
        state_c = L.Concatenate()([forward_c, backward_c])
        # Keras Bidirectional expects states ordered [fwd_h, fwd_c, bwd_h, bwd_c]
        encoder_state_fbw = [forward_h, forward_c, backward_h, backward_c]
        inputs = L.Dropout(0.1)(encoder_outputs)
        print("end")

    #encoder_state = tuple(encoder_state_fbw * number_layers )
    print("shape of encoder_outupts ", encoder_outputs.shape)
    print("shape of encode_states  ", state_h.shape, state_c.shape)

    return encoder_inputs, encoder_outputs, encoder_state_fbw
Example #13
def RNNSpeechModel(nCategories, samplingrate=16000, inputLength=16000):
    # simple LSTM
    sr = samplingrate
    iLen = inputLength

    inputs = Input((iLen, ))

    x = Reshape((1, -1))(inputs)

    x = Melspectrogram(n_dft=1024,
                       n_hop=128,
                       input_shape=(1, iLen),
                       padding='same',
                       sr=sr,
                       n_mels=80,
                       fmin=40.0,
                       fmax=sr / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')(x)

    x = Normalization2D(int_axis=0)(x)

    # note that Melspectrogram puts the sequence in shape (batch_size, melDim, timeSteps, 1);
    # we would rather have it the other way around for LSTMs

    x = Permute((2, 1, 3))(x)

    x = Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)

    #x = Reshape((125, 80)) (x)
    x = Lambda(lambda q: K.squeeze(q, -1),
               name='squeeze_last_dim')(x)  #keras.backend.squeeze(x, axis)

    #x = Bidirectional(CuDNNLSTM(64, return_sequences = True)) (x) # [b_s, seq_len, vec_dim]
    #x = Bidirectional(CuDNNLSTM(64)) (x)

    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    x = Bidirectional(LSTM(64))(x)

    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    output = Dense(nCategories, activation='softmax')(x)

    model = Model(inputs=[inputs], outputs=[output])

    return model
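Several of these examples permute the Melspectrogram output before the
recurrent layers. A small probe model makes the reason visible (a sketch,
assuming kapre 0.1.x with channels_last image data format):

from keras.models import Sequential
from keras.layers import Permute
from kapre.time_frequency import Melspectrogram

probe = Sequential()
probe.add(Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, 16000),
                         sr=16000, n_mels=80, name='mel'))
print(probe.output_shape)   # (None, 80, n_frames, 1): (mels, time, channel)
probe.add(Permute((2, 1, 3)))
print(probe.output_shape)   # (None, n_frames, 80, 1): time-major, RNN-friendly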
Example #14
    def __mel_spec_model(self, input_shape, n_mels, power_melgram, decibel_gram):
        model = Sequential()
        model.add(Melspectrogram(
            sr=self._sr,
            n_mels=n_mels,
            power_melgram=power_melgram,
            return_decibel_melgram=decibel_gram,
            input_shape=input_shape,
            trainable_fb=False
        ))
        model.add(Normalization2D(str_axis='freq'))
        return model
Example #15
def Conv1D(input_length, num_classes):
    i = layers.Input(shape=(1, input_length), name='input')
    x = Melspectrogram(n_dft=512,
                       n_hop=160,
                       padding='same',
                       sr=16000,
                       n_mels=128,
                       fmin=0.0,
                       fmax=16000 / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='melbands')(i)
    x = Normalization2D(str_axis='batch', name='batch_norm')(x)
    x = layers.Permute((2, 1, 3), name='permute')(x)
    x = layers.TimeDistributed(layers.Conv1D(8,
                                             kernel_size=(4),
                                             activation='tanh'),
                               name='td_conv_1d_tanh')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_1')(x)
    x = layers.TimeDistributed(layers.Conv1D(16,
                                             kernel_size=(4),
                                             activation='relu'),
                               name='td_conv_1d_relu_1')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_2')(x)
    x = layers.TimeDistributed(layers.Conv1D(32,
                                             kernel_size=(4),
                                             activation='relu'),
                               name='td_conv_1d_relu_2')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_3')(x)
    x = layers.TimeDistributed(layers.Conv1D(64,
                                             kernel_size=(4),
                                             activation='relu'),
                               name='td_conv_1d_relu_3')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_4')(x)
    x = layers.TimeDistributed(layers.Conv1D(128,
                                             kernel_size=(4),
                                             activation='relu'),
                               name='td_conv_1d_relu_4')(x)
    x = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(x)
    x = layers.Dropout(rate=0.1, name='dropout')(x)
    x = layers.Dense(64,
                     activation='relu',
                     activity_regularizer=l2(0.001),
                     name='dense')(x)

    o = layers.Dense(num_classes, activation='softmax', name='softmax')(x)

    model = Model(inputs=i, outputs=o, name='1d_convolution')

    return model
Example #16
def depth_separable_cnn(input_shape=(1, 16000),
                        sr=16000,
                        loss=keras.losses.categorical_crossentropy,
                        optimizer=keras.optimizers.Adam()):
    model = Sequential()
    # A mel-spectrogram layer
    model.add(
        Melspectrogram(n_dft=512,
                       n_hop=512,
                       input_shape=input_shape,
                       padding='same',
                       sr=sr,
                       n_mels=128,
                       fmin=0.0,
                       fmax=sr / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='trainable_stft'))
    # Maybe some additive white noise.
    model.add(AdditiveNoise(power=0.1))
    # Normalise per frequency
    model.add(Normalization2D(
        str_axis='freq'))  # or 'channel', 'time', 'batch', 'data_sample'
    # After this, it's just a usual keras workflow.
    model.add(Conv2D(64, kernel_size=(20, 8), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), data_format='channels_first'))
    model.add(Dropout(0.25))
    ## Depthwise-separable conv block - start
    model.add(
        SeparableConv2D(64,
                        kernel_size=(5, 5),
                        activation='relu',
                        data_format='channels_first'))
    model.add(BatchNormalization())
    model.add(
        Conv2D(64, kernel_size=(1, 1), activation='relu',
               data_format='channels_first'))
    model.add(BatchNormalization())
    model.add(SeparableConv2D(64, kernel_size=(5, 5), activation='relu'))
    model.add(BatchNormalization())
    model.add(Conv2D(64, kernel_size=(1, 1), activation='relu'))
    model.add(BatchNormalization())
    ## Depthwise-separable conv block - end
    model.add(AveragePooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(12, activation='softmax'))
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model
Example #17
def train():
    pool_size = (2, 2)
    # 350 samples
    input_shape = (channelCount, sampleCount)
    sr = 44100
    model = Sequential()
    model.add(Melspectrogram(n_dft=512, n_hop=256, input_shape=input_shape,
                             padding='same', sr=sr, n_mels=128,
                             fmin=0.0, fmax=sr/2, power_melgram=1.0,
                             return_decibel_melgram=False, trainable_fb=False,
                             trainable_kernel=False,
                             name='trainable_stft'))
    model.add(AdditiveNoise(power=0.2))
    model.add(Normalization2D(str_axis='freq'))  # or 'channel', 'time', 'batch', 'data_sample'

    model.add(Convolution2D(32, (3, 3)))
    model.add(BatchNormalization(axis=1))
    model.add(ELU(alpha=1.0))
    model.add(MaxPooling2D(pool_size=pool_size))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(get_class_names())))
    model.add(Activation('softmax'))

    model.compile('adam', 'categorical_crossentropy')

    x, y = loadData(trainDataPath)

    checkpoint_filepath = 'weights.hdf5'
    print("Looking for previous weights...")
    if os.path.isfile(checkpoint_filepath):
        print('Checkpoint file detected. Loading weights.')
        model.load_weights(checkpoint_filepath)
    else:
        print('No checkpoint file detected. Starting from scratch.')

    checkpointer = ModelCheckpoint(filepath=checkpoint_filepath, verbose=1,
                                   save_best_only=True)
    test_x, test_y = loadData(testDataPath)

    model.fit(x, y, batch_size=128, epochs=100, verbose=1,
              validation_data=(test_x, test_y), callbacks=[checkpointer])

    model.save(modelName)
Example #18
def conv1d(input_shape, sr):
    i = layers.Input(shape=input_shape, name='input')
    x = Melspectrogram(n_dft=N_DFT,
                       n_hop=HOP_LENGTH,
                       padding='same',
                       sr=sr,
                       n_mels=N_MELS,
                       fmin=0.0,
                       fmax=sr / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='melbands')(i)
    x = Normalization2D(str_axis='batch', name='batch_norm')(x)
    x = layers.Permute((2, 1, 3), name='permute')(x)
    x = TimeDistributed(layers.Conv1D(8, kernel_size=(4), activation='tanh'),
                        name='td_conv_1d_tanh')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_1')(x)
    x = TimeDistributed(layers.Conv1D(16, kernel_size=(4), activation='relu'),
                        name='td_conv_1d_relu_1')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_2')(x)
    x = TimeDistributed(layers.Conv1D(32, kernel_size=(4), activation='relu'),
                        name='td_conv_1d_relu_2')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_3')(x)
    x = TimeDistributed(layers.Conv1D(64, kernel_size=(4), activation='relu'),
                        name='td_conv_1d_relu_3')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_4')(x)
    x = TimeDistributed(layers.Conv1D(128, kernel_size=(4), activation='relu'),
                        name='td_conv_1d_relu_4')(x)
    x = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_5')(x)
    x = TimeDistributed(layers.Conv1D(256, kernel_size=(4), activation='relu'),
                        name='td_conv_1d_relu_5')(x)
    x = TimeDistributed(layers.Conv1D(512, kernel_size=(4), activation='relu'),
                        name='td_conv_1d_relu_6')(x)
    x = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(x)
    x = layers.Dropout(rate=0.2, name='dropout')(x)
    x = layers.Dense(64,
                     activation='relu',
                     activity_regularizer=l2(0.001),
                     name='dense')(x)
    o = layers.Dense(NUM_CLASSES, activation='softmax', name='softmax')(x)

    model = Model(inputs=i, outputs=o, name='1d_convolution')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model
Example #19
def convolution_speech_model(num_category,
                             sampling_rate=16000,
                             input_length=16000):

    inputs = layers.Input((input_length, ))

    x = layers.Reshape((1, -1))(inputs)

    x = Melspectrogram(n_dft=1024,
                       n_hop=128,
                       input_shape=(1, input_length),
                       padding='same',
                       sr=sampling_rate,
                       n_mels=80,
                       fmin=40.0,
                       fmax=sampling_rate / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')(x)

    x = Normalization2D(int_axis=0)(x)

    x = layers.Permute((2, 1, 3))(x)

    c1 = layers.Conv2D(20, (5, 1), activation='relu', padding='same')(x)
    c1 = layers.BatchNormalization()(c1)
    p1 = layers.MaxPooling2D((2, 1))(c1)
    p1 = layers.Dropout(0.03)(p1)

    c2 = layers.Conv2D(40, (3, 3), activation='relu', padding='same')(p1)
    c2 = layers.BatchNormalization()(c2)
    p2 = layers.MaxPooling2D((2, 2))(c2)
    p2 = layers.Dropout(0.01)(p2)

    c3 = layers.Conv2D(80, (3, 3), activation='relu', padding='same')(p2)
    c3 = layers.BatchNormalization()(c3)
    p3 = layers.MaxPooling2D((2, 2))(c3)

    p3 = layers.Flatten()(p3)
    p3 = layers.Dense(64, activation='relu')(p3)
    p3 = layers.Dense(32, activation='relu')(p3)

    output = layers.Dense(num_category, activation='softmax')(p3)

    model = Model(inputs=[inputs], outputs=[output], name='ConvSpeechModel')

    return model
Example #20
def Att_RNN_Speech(x_train, y_train, classes, sampling_rate=16000, input_length=16000, batch_size=32, epochs=3):

    inputs = Input((input_length,))

    x = Reshape((1, -1))(inputs)

    m = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, input_length),
                       padding='same', sr=sampling_rate, n_mels=80,
                       fmin=40.0, fmax=sampling_rate / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False)
    m.trainable = False

    x = m(x)

    x = Normalization2D(int_axis=0)(x)

    x = Permute((2, 1, 3))(x)

    x = Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)

    x = Lambda(lambda q: squeeze(q, -1))(x)

    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)

    xFirst = Lambda(lambda q: q[:, -1])(x)
    query = Dense(128)(xFirst)

    attScores = Dot(axes=[1, 2])([query, x])
    attScores = Softmax()(attScores)

    attVector = Dot(axes=[1, 1])([attScores, x])

    x = Dense(64, activation='relu')(attVector)
    x = Dense(32)(x)

    output = Dense(classes, activation='softmax')(x)

    model = Model(inputs=[inputs], outputs=[output])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])
    model.summary()
    # fit on raw waveforms (x_train) with integer labels (y_train)
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
              use_multiprocessing=False, workers=4, verbose=2)
    model.save('Att_RNN_Speech.model')
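Because the saved file contains kapre's custom layers, reloading it requires
custom_objects; a sketch of the reload step:

from keras.models import load_model
from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D

model = load_model('Att_RNN_Speech.model',
                   custom_objects={'Melspectrogram': Melspectrogram,
                                   'Normalization2D': Normalization2D})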
Example #21
    def build_CNN_model(self):
        ### define CNN architecture
        print('Build model...')
        self.model = Sequential()
        self.model.add(Spectrogram(n_dft=128, n_hop=16,
                                   input_shape=self.x_augmented_rolled.shape[1:],
                                   return_decibel_spectrogram=False,
                                   power_spectrogram=2.0,
                                   trainable_kernel=False, name='static_stft'))
        self.model.add(Normalization2D(str_axis='freq'))

        # Conv Block 1
        self.model.add(Conv2D(filters=24, kernel_size=(12, 12),
                              strides=(1, 1), name='conv1', padding='same'))
        self.model.add(BatchNormalization(axis=1))
        self.model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                    padding='valid',
                                    data_format='channels_last'))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(self.dropout))

        # Conv Block 2
        self.model.add(Conv2D(filters=48, kernel_size=(8, 8),
                              name='conv2', padding='same'))
        self.model.add(BatchNormalization(axis=1))
        self.model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                    padding='valid',
                                    data_format='channels_last'))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(self.dropout))

        # Conv Block 3
        self.model.add(Conv2D(filters=96, kernel_size=(4, 4),
                              name='conv3', padding='same'))
        self.model.add(BatchNormalization(axis=1))
        self.model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                    padding='valid',
                                    data_format='channels_last'))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(self.dropout))

        # classifier
        self.model.add(Flatten())
        self.model.add(Dense(self.n_classes))
        self.model.add(Activation('softmax'))

        print(self.model.summary())
        self.saved_model_name = self.MODELNAME
Example #22
def LSTM(input_length, num_classes):
    i = layers.Input(shape=(1, input_length), name='input')
    x = Melspectrogram(n_dft=512,
                       n_hop=160,
                       padding='same',
                       sr=16000,
                       n_mels=128,
                       fmin=0.0,
                       fmax=16000 / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='melbands')(i)
    x = Normalization2D(str_axis='batch', name='batch_norm')(x)
    x = layers.Permute((2, 1, 3), name='permute')(x)
    x = layers.TimeDistributed(layers.Reshape((-1, )), name='reshape')(x)
    s = layers.TimeDistributed(layers.Dense(64, activation='tanh'),
                               name='td_dense_tanh')(x)
    x = layers.Bidirectional(layers.LSTM(32, return_sequences=True),
                             name='bidirectional_lstm')(s)
    x = layers.concatenate([s, x], axis=2, name='skip_connection')
    x = layers.Dense(64, activation='relu', name='dense_1_relu')(x)
    x = layers.MaxPooling1D(name='max_pool_1d')(x)
    x = layers.Dense(32, activation='relu', name='dense_2_relu')(x)
    x = layers.Flatten(name='flatten')(x)
    x = layers.Dropout(rate=0.2, name='dropout')(x)
    x = layers.Dense(32,
                     activation='relu',
                     activity_regularizer=l2(0.001),
                     name='dense_3_relu')(x)

    o = layers.Dense(num_classes, activation='softmax', name='softmax')(x)

    model = Model(inputs=i, outputs=o, name='long_short_term_memory')

    return model
Example #23
def MLP_model(input_shape, dropout=0.5, print_summary=False):
    # the MLP is built on a Sequential network
    model = Sequential()

    # spectrogram creation using STFT
    model.add(
        Spectrogram(n_dft=128,
                    n_hop=16,
                    input_shape=input_shape,
                    return_decibel_spectrogram=False,
                    power_spectrogram=2.0,
                    trainable_kernel=False,
                    name='static_stft'))
    model.add(Normalization2D(str_axis='freq'))
    model.add(Flatten())
    # neurons_per_layer and n_hidden_layers are module-level settings
    model.add(Dense(neurons_per_layer, activation='relu'))
    model.add(Dropout(0.2))

    # custom number of hidden layers
    for each in range(n_hidden_layers - 1):
        model.add(Dense(neurons_per_layer, activation='relu'))
        model.add(Dropout(0.2))

    model.add(Dense(2))  # two classes only
    model.add(Activation('softmax'))

    if print_summary:
        print(model.summary())

    # compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model
Example #24
def CNN_model(input_shape, dropout=0.5, print_summary=False):
    # basis of the CNN_STFT is a Sequential network
    model = Sequential()

    # spectrogram creation using STFT
    model.add(
        Spectrogram(n_dft=128,
                    n_hop=16,
                    input_shape=input_shape,
                    return_decibel_spectrogram=False,
                    power_spectrogram=2.0,
                    trainable_kernel=False,
                    name='static_stft'))
    model.add(Normalization2D(str_axis='freq'))

    # Conv Block 1
    model.add(
        Conv2D(filters=24,
               kernel_size=(12, 12),
               strides=(1, 1),
               name='conv1',
               padding='same'))
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    model.add(
        MaxPooling2D(pool_size=(2, 2),
                     strides=(2, 2),
                     padding='valid',
                     data_format='channels_last'))

    # Conv Block 2
    model.add(
        Conv2D(filters=48,
               kernel_size=(8, 8),
               name='conv2',
               padding='same'))
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    model.add(
        MaxPooling2D(pool_size=(2, 2),
                     strides=(2, 2),
                     padding='valid',
                     data_format='channels_last'))

    # Conv Block 3
    model.add(
        Conv2D(filters=96,
               kernel_size=(4, 4),
               name='conv3',
               padding='same'))
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    model.add(
        MaxPooling2D(pool_size=(2, 2),
                     strides=(2, 2),
                     padding='valid',
                     data_format='channels_last'))
    model.add(Dropout(dropout))

    # classifier
    model.add(Flatten())
    model.add(Dense(2))  # two classes only
    model.add(Activation('softmax'))

    if print_summary:
        print(model.summary())

    # compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model
Example #25
def attRNN():
    sr = 8000
    inputs = Input((8000, 1), name='input')

    x = Reshape((1, -1))(inputs)

    m = Melspectrogram(n_dft=1024,
                       n_hop=128,
                       input_shape=(1, 8000),
                       padding='same',
                       sr=sr,
                       n_mels=80,
                       fmin=40.0,
                       fmax=sr / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')
    m.trainable = False

    x = m(x)

    x = Normalization2D(int_axis=0, name='mel_stft_norm')(x)

    # note that Melspectrogram puts the sequence in shape (batch_size, melDim, timeSteps, 1)
    # we would rather have it the other way around for LSTMs

    x = Permute((2, 1, 3))(x)

    x = Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)

    # x = Reshape((125, 80)) (x)
    # keras.backend.squeeze(x, axis)
    x = Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(x)

    x = Bidirectional(LSTM(64, return_sequences=True))(
        x)  # [b_s, seq_len, vec_dim]

    x = Bidirectional(LSTM(64, return_sequences=True))(
        x)  # [b_s, seq_len, vec_dim]

    xFirst = Lambda(lambda q: q[:, -1])(x)  # [b_s, vec_dim]
    query = Dense(128)(xFirst)

    # dot product attention
    attScores = Dot(axes=[1, 2])([query, x])
    attScores = Softmax(name='attSoftmax')(attScores)  # [b_s, seq_len]

    # rescale sequence
    attVector = Dot(axes=[1, 1])([attScores, x])  # [b_s, vec_dim]

    x = Dense(64, activation='relu')(attVector)
    x = Dense(32)(x)

    output = Dense(9, activation='softmax', name='output')(x)

    model = Model(inputs=[inputs], outputs=[output])

    model.compile(optimizer='adam',
                  loss=['sparse_categorical_crossentropy'],
                  metrics=['sparse_categorical_accuracy'])
    model.summary()

    return model
Example #26
def ConvSpeechModel(nCategories,
                    samplingrate=16000,
                    inputLength=16000,
                    more_blocks=False,
                    bigger_blocks=False,
                    blocks_layers=[20, 40, 80, 160, 320]):
    """
    Base fully convolutional model for speech recognition
    """

    inputs = L.Input((inputLength, ))

    x = L.Reshape((1, -1))(inputs)

    x = Melspectrogram(n_dft=1024,
                       n_hop=128,
                       input_shape=(1, inputLength),
                       padding='same',
                       sr=samplingrate,
                       n_mels=80,
                       fmin=40.0,
                       fmax=samplingrate / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')(x)

    x = Normalization2D(int_axis=0)(x)
    # note that Melspectrogram puts the sequence in shape (batch_size, melDim, timeSteps, 1)
    # we would rather have it the other way around for LSTMs

    x = L.Permute((2, 1, 3))(x)
    # x = Reshape((94,80)) (x) #this is strange - but now we have (batch_size,
    # sequence, vec_dim)

    c1 = L.Conv2D(blocks_layers[0], (5, 1), activation='relu',
                  padding='same')(x)
    c1 = L.BatchNormalization()(c1)
    p1 = L.MaxPooling2D((2, 1))(c1)
    p1 = L.Dropout(0.2)(p1)

    c2 = L.Conv2D(blocks_layers[1], (3, 3), activation='relu',
                  padding='same')(p1)
    c2 = L.BatchNormalization()(c2)
    if bigger_blocks:
        c2 = L.Conv2D(blocks_layers[1], (3, 3),
                      activation='relu',
                      padding='same')(c2)
        c2 = L.BatchNormalization()(c2)
    p2 = L.MaxPooling2D((2, 2))(c2)
    p2 = L.Dropout(0.3)(p2)

    c3 = L.Conv2D(blocks_layers[2], (3, 3), activation='relu',
                  padding='same')(p2)
    c3 = L.BatchNormalization()(c3)
    if bigger_blocks:
        c3 = L.Conv2D(blocks_layers[2], (3, 3),
                      activation='relu',
                      padding='same')(c3)
        c3 = L.BatchNormalization()(c3)
    p3 = L.MaxPooling2D((2, 2))(c3)
    #     p3 = L.Dropout(0.3)(p3)

    if more_blocks:
        p3 = L.Dropout(0.3)(p3)
        c3 = L.Conv2D(blocks_layers[3], (3, 3),
                      activation='relu',
                      padding='same')(p3)
        c3 = L.BatchNormalization()(c3)
        if bigger_blocks:
            c3 = L.Conv2D(blocks_layers[3], (3, 3),
                          activation='relu',
                          padding='same')(c3)
            c3 = L.BatchNormalization()(c3)
        p3 = L.MaxPooling2D((2, 2))(c3)

    p3 = L.Flatten()(p3)
    p3 = L.Dense(64, activation='relu')(p3)
    p3 = L.Dropout(0.4)(p3)

    output = L.Dense(nCategories, activation='softmax')(p3)

    model = Model(inputs=[inputs], outputs=[output], name='ConvSpeechModel')

    return model
Example #27
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, BatchNormalization, Dropout
from kapre.time_frequency import Spectrogram
from kapre.utils import Normalization2D
from kapre.augmentation import AdditiveNoise
import keras
from keras import optimizers
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


classifier = Sequential()

classifier.add(Spectrogram(n_dft=512, n_hop=256, padding='same',
                           input_shape=input_shape, power_spectrogram=2.0,
                           return_decibel_spectrogram=False,
                           trainable_kernel=False, image_data_format='default'))

classifier.add(AdditiveNoise(power=0.2))
classifier.add(Normalization2D(str_axis='freq'))
# Layer 1
classifier.add(Conv2D(24, (1, 1), activation='relu'))
classifier.add(BatchNormalization(axis=-1))  # layers must be added to the model to take effect
classifier.add(MaxPooling2D(pool_size=(2, 2)))
classifier.add(Dense(units=128, activation='relu'))
classifier.add(Dropout(0.5))
# Layer 2
classifier.add(Conv2D(48, (1, 1), activation='relu'))
classifier.add(BatchNormalization(axis=-1))
classifier.add(MaxPooling2D(pool_size=(2, 2)))
classifier.add(Dense(units=128, activation='relu'))
classifier.add(Dropout(0.5))
# Layer 3
classifier.add(Conv2D(96, (1, 1), activation='relu'))
classifier.add(BatchNormalization(axis=-1))
Example #28
# iLen and sr are assumed to be defined earlier in the notebook
melspecModel = Sequential()
melspecModel.add(
    Melspectrogram(n_dft=1024,
                   n_hop=128,
                   input_shape=(1, iLen),
                   padding='same',
                   sr=sr,
                   n_mels=80,
                   fmin=40.0,
                   fmax=sr / 2,
                   power_melgram=1.0,
                   return_decibel_melgram=True,
                   trainable_fb=False,
                   trainable_kernel=False,
                   name='mel_stft'))

melspecModel.add(Normalization2D(int_axis=0))

melspecModel.summary()


#melspec = melspecModel.predict( audios.reshape((-1,1,iLen)) )
#melspec.shape

# Models: create Keras models to see if the generators are working properly

from keras.models import Model, load_model
Example #29
def add_mel_to_VGGish(content_weights_file_path_og, input_length, sr_hr,
                      n_mels, hoplength, nfft, fmin, fmax, power_melgram):

    NUM_FRAMES = 96  # Frames in input mel-spectrogram patch.
    NUM_BANDS = 64  # Frequency bands in input mel-spectrogram patch.
    EMBEDDING_SIZE = 128  # Size of embedding layer.
    pooling = 'avg'
    X = Input(shape=(NUM_FRAMES, NUM_BANDS, 1), name='nob')
    x = X
    x = Conv2D(64, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv1')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(x)

    # Block 2
    x = Conv2D(128, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(x)

    # Block 3
    x = Conv2D(256, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv3/conv3_1')(x)
    x = Conv2D(256, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv3/conv3_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(x)

    # Block 4
    x = Conv2D(512, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv4/conv4_1')(x)
    x = Conv2D(512, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv4/conv4_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(x)

    if pooling == 'avg':
        x = GlobalAveragePooling2D()(x)
    elif pooling == 'max':
        x = GlobalMaxPooling2D()(x)
    model = Model(inputs=X, outputs=x)

    model.load_weights(content_weights_file_path_og)

    X = Input(shape=(1, input_length), name='input_1')
    x = X
    x = Spectrogram(n_dft=nfft,
                    n_hop=hoplength,
                    padding='same',
                    return_decibel_spectrogram=True,
                    trainable_kernel=False,
                    name='stft')(x)

    x = Normalization2D(str_axis='freq')(x)

    no_input_layers = model.layers[1:]

    for layer in no_input_layers:
        x = layer(x)
    return Model(inputs=X, outputs=x)
Example #30
def create_VGGish(input_length,
                  sr_hr,
                  n_mels,
                  hoplength,
                  nfft,
                  fmin,
                  fmax,
                  power_melgram,
                  pooling='avg'):

    X = Input(shape=(input_length, 1), name='input_1')

    x = X

    x = Reshape((1, input_length))(x)

    x = Spectrogram(n_dft=nfft,
                    n_hop=hoplength,
                    padding='same',
                    return_decibel_spectrogram=True,
                    trainable_kernel=False,
                    name='stft')(x)

    x = Normalization2D(str_axis='freq')(x)

    x = Conv2D(64, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv1')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(x)

    x = Conv2D(128, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(x)

    x = Conv2D(256, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv3/conv3_1')(x)
    x = Conv2D(256, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv3/conv3_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(x)

    x = Conv2D(512, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv4/conv4_1')(x)
    x = Conv2D(512, (3, 3),
               strides=(1, 1),
               activation='relu',
               padding='same',
               name='conv4/conv4_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(x)

    if pooling == 'avg':
        x = GlobalAveragePooling2D()(x)
    elif pooling == 'max':
        x = GlobalMaxPooling2D()(x)

    return X, x
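create_VGGish returns the input and output tensors rather than a model, so the
caller wires them together. A minimal sketch with assumed parameter values:

X, x = create_VGGish(input_length=16000, sr_hr=16000, n_mels=64,
                     hoplength=160, nfft=512, fmin=125, fmax=7500,
                     power_melgram=1.0, pooling='avg')
model = Model(inputs=X, outputs=x)
model.summary()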