# Example 1
def createMelSpectrogram(input_path, fileName, output_path, saveOrShow=0):
    """Compute a log-Mel spectrogram for one audio file and save or show it.

    Parameters
    ----------
    input_path : str
        Directory containing the audio file.
    fileName : str
        Audio file name (extension included).
    output_path : str
        Directory where the PNG is written when saving.
    saveOrShow : int, optional
        0 (default) saves the spectrogram as a PNG image; any other
        value displays it with librosa's specshow instead.
    """
    # load sound signal: first 10 s, resampled to 16 kHz mono
    signal, sr = librosa.load(os.path.join(input_path, fileName), duration=10, sr=16000)

    # create Mel Spectrogram via a (frozen) kapre layer applied eagerly;
    # input is reshaped to (batch=1, channel=1, samples)
    S = Melspectrogram(n_dft=1024,
                       n_hop=320,
                       input_shape=(1, signal.shape[0]),
                       padding='same', sr=sr, n_mels=224, fmin=1400, fmax=sr/2,
                       power_melgram=2.0, return_decibel_melgram=True,
                       trainable_fb=False, trainable_kernel=False)(signal.reshape(1, 1, -1)).numpy()

    # drop batch/channel axes -> (n_mels, time)
    S = S.reshape(S.shape[1], S.shape[2])

    print(S.shape)

    if saveOrShow == 0:
        # BUGFIX: use os.path.splitext instead of split(".")[0] so file
        # names containing extra dots (e.g. 'rec.take1.wav') keep their
        # full stem instead of being truncated to 'rec'.
        baseName = os.path.splitext(fileName)[0]
        matplotlib.image.imsave(os.path.join(output_path, baseName + ".png"), S, cmap='inferno')
    else:
        display.specshow(S, sr=sr)
        plt.show()
# Example 2
def AttRNNSpeechModel(nCategories, samplingrate=16000,
                      inputLength=16000, rnn_func=L.LSTM):
    """Attention-RNN speech classifier: frozen Mel frontend + BiRNN + attention.

    Parameters
    ----------
    nCategories : int
        Number of output classes.
    samplingrate : int
        Audio sampling rate fed to the Mel layer (default 16 kHz).
    inputLength : int
        Number of waveform samples per example.
    rnn_func : callable
        Recurrent layer constructor (default ``L.LSTM``); it is wrapped in
        ``L.Bidirectional`` and must accept ``return_sequences``.

    Returns
    -------
    keras Model mapping (batch, inputLength) waveforms to a class softmax.
    """
    # simple LSTM
    sr = samplingrate
    iLen = inputLength

    inputs = L.Input((inputLength,), name='input')

    # (batch, samples) -> (batch, 1, samples): mono channel axis for kapre
    x = L.Reshape((1, -1))(inputs)

    # frozen log-Mel frontend: 80 bands, 1024-point DFT, hop 128
    m = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, iLen),
                       padding='same', sr=sr, n_mels=80,
                       fmin=40.0, fmax=sr / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')
    m.trainable = False  # keep the spectrogram weights fixed during training

    x = m(x)

    x = Normalization2D(int_axis=0, name='mel_stft_norm')(x)

    # note that Melspectrogram puts the sequence in shape (batch_size, melDim, timeSteps, 1)
    # we would rather have it the other way around for LSTMs

    x = L.Permute((2, 1, 3))(x)

    # light conv stack mixing along time only (5x1 kernels); second conv
    # collapses to a single channel before the squeeze below
    x = L.Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)
    x = L.Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)

    # x = Reshape((125, 80)) (x)
    # keras.backend.squeeze(x, axis)
    x = L.Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(x)

    x = L.Bidirectional(rnn_func(64, return_sequences=True)
                        )(x)  # [b_s, seq_len, vec_dim]
    x = L.Bidirectional(rnn_func(64, return_sequences=True)
                        )(x)  # [b_s, seq_len, vec_dim]

    # the attention query is a projection of the last timestep's state
    xFirst = L.Lambda(lambda q: q[:, -1])(x)  # [b_s, vec_dim]
    query = L.Dense(128)(xFirst)

    # dot product attention
    attScores = L.Dot(axes=[1, 2])([query, x])
    attScores = L.Softmax(name='attSoftmax')(attScores)  # [b_s, seq_len]

    # rescale sequence
    attVector = L.Dot(axes=[1, 1])([attScores, x])  # [b_s, vec_dim]

    x = L.Dense(64, activation='relu')(attVector)
    x = L.Dense(32)(x)

    output = L.Dense(nCategories, activation='softmax', name='output')(x)

    model = Model(inputs=[inputs], outputs=[output])

    return model
# Example 3
def build_model_vggish(classes,
                       dropout_final=0.2,
                       shape=(None, 320000),
                       sr=16000,
                       rnn_type='gru',
                       rnn_units=256,
                       focal_alpha=0.95,
                       rnn_layers=1,
                       rnn_dropout=0.2,
                       activation='elu',
                       random_noise=0.2,
                       weights='soundnet'):
    """Build a CRNN: kapre Mel frontend -> (frozen) VGGish -> RNN classifier.

    Parameters
    ----------
    classes : sequence
        Class labels; only ``len(classes)`` is used (output size).
    dropout_final, rnn_type, rnn_units, rnn_layers, rnn_dropout
        Forwarded to ``rnn_classifier_branch``.
    shape : tuple
        Input shape including a leading dummy axis; ``shape[1:]`` is used.
    sr : int
        Audio sampling rate for the Mel layer.
    random_noise : float
        If truthy, power of the AdditiveNoise augmentation layer.
    weights : str or None
        Pretrained VGGish weights id; when not None the VGGish layers
        are frozen.
    focal_alpha, activation
        NOTE(review): accepted but never read in this function — confirm
        whether they belong to an external loss/branch setup.

    Returns
    -------
    (model, vggish) — the assembled keras Model and the embedded VGGish
    sub-model.
    """

    inputs = keras.Input(shape=shape[1:])
    # (samples,) -> (1, samples): mono channel axis for kapre
    x = keras.layers.Reshape(target_shape=(1, -1))(inputs)
    x = Melspectrogram(n_dft=512,
                       n_hop=256,
                       padding='same',
                       sr=sr,
                       n_mels=64,
                       fmin=125,
                       fmax=7500,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       name='trainable_stft')(x)
    if random_noise:
        # spectrogram-domain augmentation
        x = AdditiveNoise(power=random_noise, random_gain=True)(x)
    x = Normalization2D(str_axis='freq')(x)
    # (mel, time, ch) -> (time, mel, ch) for the VGGish input layout
    x = Lambda(lambda x: K.permute_dimensions(x=x, pattern=(0, 2, 1, 3)),
               name="transpose")(x)

    vggish = VGGish(include_top=False,
                    load_weights=weights,
                    input_shape=x.get_shape().as_list()[1:],
                    pooling=None)
    if weights is not None:  # only freeze when using pretrained layers
        for layer in vggish.layers:
            layer.trainable = False
    x = vggish(x)
    x = keras.layers.AveragePooling2D(pool_size=(1, 4))(x)
    # flatten spatial dims into a (time, 512) sequence for the RNN branch
    x = keras.layers.Reshape(target_shape=(-1, 512))(x)

    outputs = rnn_classifier_branch(x,
                                    name='rnn',
                                    dropout=rnn_dropout,
                                    dropout_final=dropout_final,
                                    rnn_units=rnn_units,
                                    rnn_type=rnn_type,
                                    n_classes=len(classes),
                                    rnn_layers=rnn_layers)

    model = keras.Model(inputs=inputs, outputs=outputs, name='crnn')

    model.summary()

    return model, vggish
def attention_speech_model(num_category,
                           sampling_rate=16000,
                           input_length=16000):
    """Attention-BiLSTM speech classifier over a frozen Mel frontend.

    Parameters
    ----------
    num_category : int
        Number of output classes.
    sampling_rate : int
        Audio sampling rate fed to the Mel layer.
    input_length : int
        Number of waveform samples per example.

    Returns
    -------
    keras Model mapping (batch, input_length) waveforms to a softmax.
    """

    inputs = layers.Input((input_length, ), name='input')
    # (batch, samples) -> (batch, 1, samples): mono channel axis for kapre
    x = layers.Reshape((1, -1))(inputs)

    # frozen log-Mel frontend: 80 bands, 1024-point DFT, hop 128
    m = Melspectrogram(input_shape=(1, input_length),
                       n_dft=1024,
                       n_hop=128,
                       padding='same',
                       sr=sampling_rate,
                       n_mels=80,
                       fmin=40.0,
                       fmax=sampling_rate / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_tft')
    m.trainable = False  # keep the spectrogram weights fixed
    x = m(x)

    x = Normalization2D(int_axis=0, name='norm')(x)
    # (mel, time, ch) -> (time, mel, ch) so recurrence runs over time
    x = layers.Permute((2, 1, 3))(x)

    # NOTE(review): this conv already applies relu and is immediately
    # followed by LeakyReLU — a double activation; confirm intent.
    x = layers.Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = layers.LeakyReLU()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Lambda(lambda t: K.squeeze(t, -1), name='squeeze_last_dim')(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)

    # attention query is built from the MIDDLE timestep of the sequence
    x_first = layers.Lambda(lambda t: t[:, t.shape[1] // 2])(x)
    query = layers.Dense(128)(x_first)

    # dot-product attention over all timesteps, then weighted sum
    attention_scores = layers.Dot([1, 2])([query, x])
    attention_scores = layers.Softmax(
        name='attention_softmax')(attention_scores)
    attention_vector = layers.Dot(axes=[1, 1])([attention_scores, x])

    x = layers.Dense(64)(attention_vector)
    x = layers.LeakyReLU()(x)
    x = layers.Dropout(0.5)(x)

    x = layers.Dense(32)(x)
    x = layers.Dropout(0.5)(x)

    out = layers.Dense(num_category, activation='softmax', name="output")(x)
    model = Model(inputs=inputs, outputs=out)
    return model
# Example 5
def Conv1D(N_CLASSES=3, SR=5000, DT=0.25):
    """Build and compile a time-distributed 1-D convolution classifier.

    The raw waveform (shape (1, SR*DT)) is converted to a log-Mel
    spectrogram in-graph, each time frame is filtered with 1-D
    convolutions, then globally pooled into an N_CLASSES softmax.
    """
    audio_in = layers.Input(shape=(1, int(SR * DT)), name='input')
    net = Melspectrogram(n_dft=512, n_hop=160,
                         padding='same', sr=SR, n_mels=128,
                         fmin=0.0, fmax=SR/2, power_melgram=2.0,
                         return_decibel_melgram=True, trainable_fb=False,
                         trainable_kernel=False,
                         name='melbands')(audio_in)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)
    # (mel, time, ch) -> (time, mel, ch) so TimeDistributed runs per frame
    net = layers.Permute((2, 1, 3), name='permute')(net)
    net = TimeDistributed(layers.Conv1D(8, kernel_size=4, activation='tanh'),
                          name='td_conv_1d_tanh')(net)
    net = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_1')(net)
    net = TimeDistributed(layers.Conv1D(16, kernel_size=4, activation='relu'),
                          name='td_conv_1d_relu_1')(net)
    net = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_2')(net)
    net = TimeDistributed(layers.Conv1D(32, kernel_size=4, activation='relu'),
                          name='td_conv_1d_relu_2')(net)
    net = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(net)
    net = layers.Dropout(rate=0.1, name='dropout')(net)
    net = layers.Dense(64, activation='relu',
                       activity_regularizer=l2(0.001), name='dense')(net)
    probs = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(net)

    model = Model(inputs=audio_in, outputs=probs, name='1d_convolution')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model
# Example 6
    def deep_net_i(self, feature_count):
        """Build a compiled CNN classifier over an in-graph Mel spectrogram.

        Parameters
        ----------
        feature_count : int
            Number of waveform samples per (mono) input example.

        Returns
        -------
        A compiled Sequential model ending in a 32-way softmax.

        Notes
        -----
        ``SR`` is a module-level sampling-rate constant defined elsewhere.
        """
        # Create Model
        model = Sequential()
        model.add(
            Melspectrogram(sr=SR,
                           n_mels=128,
                           power_melgram=1.0,
                           input_shape=(1, feature_count),
                           trainable_fb=False,
                           fmin=800,
                           fmax=8000))
        # NOTE(review): Convolution2D(32, 9, 9, ...) is the legacy Keras 1
        # positional signature (filters, rows, cols); under Keras 2 this
        # would be Conv2D(32, (9, 9), ...) — confirm the installed version.
        model.add(Convolution2D(32, 9, 9, name='conv1', activation='relu'))
        model.add(MaxPooling2D((25, 17)))
        model.add(Flatten())
        model.add(Dense(32, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(32, kernel_initializer='normal', activation='softmax'))

        # Compile model
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        return model
def LSTM(N_CLASSES=10, SR=16000, DT=1.0):
    """Bidirectional-LSTM classifier over an in-graph log-Mel spectrogram.

    The waveform (shape (1, SR*DT)) is turned into mel frames, each frame
    is embedded with a time-distributed dense layer, passed through a
    BiLSTM with a skip connection, and reduced to an N_CLASSES softmax.
    """
    audio_in = layers.Input(shape=(1, int(SR * DT)), name='input')
    net = Melspectrogram(n_dft=512, n_hop=160,
                         padding='same', sr=SR, n_mels=128,
                         fmin=0.0, fmax=SR/2, power_melgram=1.0,
                         return_decibel_melgram=True, trainable_fb=False,
                         trainable_kernel=False,
                         name='melbands')(audio_in)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)
    # (mel, time, ch) -> (time, mel, ch): one sequence step per frame
    net = layers.Permute((2, 1, 3), name='permute')(net)
    net = TimeDistributed(layers.Reshape((-1,)), name='reshape')(net)
    frame_emb = TimeDistributed(layers.Dense(64, activation='tanh'),
                                name='td_dense_tanh')(net)
    recurrent = layers.Bidirectional(layers.LSTM(32, return_sequences=True),
                                     name='bidirectional_lstm')(frame_emb)
    # concatenate dense embedding with the BiLSTM output (skip connection)
    net = layers.concatenate([frame_emb, recurrent], axis=2,
                             name='skip_connection')
    net = layers.Dense(64, activation='relu', name='dense_1_relu')(net)
    net = layers.MaxPooling1D(name='max_pool_1d')(net)
    net = layers.Dense(32, activation='relu', name='dense_2_relu')(net)
    net = layers.Flatten(name='flatten')(net)
    net = layers.Dropout(rate=0.2, name='dropout')(net)
    net = layers.Dense(32, activation='relu',
                       activity_regularizer=l2(0.001),
                       name='dense_3_relu')(net)
    probs = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(net)

    model = Model(inputs=audio_in, outputs=probs,
                  name='long_short_term_memory')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model
# Example 8
def raw_vgg(args,
            input_length=12000 * 29,
            tf='melgram',
            normalize=None,
            decibel=False,
            last_layer=True,
            sr=None):
    ''' when length = 12000*29 and 512/256 dft/hop, 
    melgram size: (n_mels, 1360)

    Builds a VGG-style conv stack on top of an in-graph Melspectrogram.

    Parameters
    ----------
    args : namespace
        Supplies conv_until, trainable_kernel, trainable_fb, fmin, fmax
        (0.0 means use Nyquist) and n_mels.
    input_length : int
        Number of waveform samples per (mono) example.
    tf : str
        'stft' or 'melgram'.  NOTE(review): only a Melspectrogram layer
        is ever added regardless of this value — confirm whether an
        stft branch was intended.
    normalize : str or None
        Axis name for Normalization2D; other accepted values skip it.
    decibel : bool
        Whether the melgram is returned in decibels.
    last_layer : bool
        If True, append a single linear output unit.
    sr : int or None
        Sampling rate; defaults to the module-level SR.
    '''
    assert tf in ('stft', 'melgram')
    assert normalize in (None, False, 'no', 0, 0.0, 'batch', 'data_sample',
                         'time', 'freq', 'channel')
    assert isinstance(decibel, bool)

    if sr is None:
        sr = SR  # assumes 12000

    conv_until = args.conv_until
    trainable_kernel = args.trainable_kernel
    model = Sequential()
    # decode args
    fmin = args.fmin
    fmax = args.fmax
    if fmax == 0.0:
        fmax = sr / 2  # 0.0 is a sentinel for "up to Nyquist"
    n_mels = args.n_mels
    trainable_fb = args.trainable_fb
    model.add(
        Melspectrogram(n_dft=512,
                       n_hop=256,
                       power_melgram=2.0,
                       input_shape=(1, input_length),
                       trainable_kernel=trainable_kernel,
                       trainable_fb=trainable_fb,
                       return_decibel_melgram=decibel,
                       sr=sr,
                       n_mels=n_mels,
                       fmin=fmin,
                       fmax=fmax,
                       name='melgram'))

    poolings = [(2, 4), (3, 4), (2, 5), (2, 4), (4, 4)]

    if normalize in ('batch', 'data_sample', 'time', 'freq', 'channel'):
        model.add(Normalization2D(normalize))
    # five 32-filter 3x3 conv stages built by a helper, truncated at
    # conv_until (helper semantics defined elsewhere in the project)
    model.add(
        get_convBNeluMPdrop(5, [32, 32, 32, 32, 32], [(3, 3), (3, 3), (3, 3),
                                                      (3, 3), (3, 3)],
                            poolings,
                            model.output_shape[1:],
                            conv_until=conv_until))
    if conv_until != 4:
        model.add(GlobalAveragePooling2D())
    else:
        model.add(Flatten())

    if last_layer:
        model.add(Dense(1, activation='linear'))
    return model
def Conv2D(N_CLASSES=10, SR=16000, DT=1.0):
    """2-D CNN classifier over an in-graph log-Mel spectrogram.

    A four-stage conv/pool pyramid widens the filter count while
    shrinking the mel-time grid, followed by one more conv, dropout,
    and an N_CLASSES softmax head.
    """
    audio_in = layers.Input(shape=(1, int(SR * DT)), name='input')
    net = Melspectrogram(n_dft=512, n_hop=160,
                         padding='same', sr=SR, n_mels=128,
                         fmin=0.0, fmax=SR/2, power_melgram=1.0,
                         return_decibel_melgram=True, trainable_fb=False,
                         trainable_kernel=False,
                         name='melbands')(audio_in)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)

    # (filters, kernel, activation, conv name, pool name) per stage
    conv_stages = [
        (8,  (7, 7), 'tanh', 'conv2d_tanh',   'max_pool_2d_1'),
        (16, (5, 5), 'relu', 'conv2d_relu_1', 'max_pool_2d_2'),
        (16, (3, 3), 'relu', 'conv2d_relu_2', 'max_pool_2d_3'),
        (32, (3, 3), 'relu', 'conv2d_relu_3', 'max_pool_2d_4'),
    ]
    for n_filters, kernel, act, conv_name, pool_name in conv_stages:
        net = layers.Conv2D(n_filters, kernel_size=kernel, activation=act,
                            padding='same', name=conv_name)(net)
        net = layers.MaxPooling2D(pool_size=(2, 2), padding='same',
                                  name=pool_name)(net)

    net = layers.Conv2D(32, kernel_size=(3, 3), activation='relu',
                        padding='same', name='conv2d_relu_4')(net)
    net = layers.Flatten(name='flatten')(net)
    net = layers.Dropout(rate=0.2, name='dropout')(net)
    net = layers.Dense(64, activation='relu',
                       activity_regularizer=l2(0.001), name='dense')(net)
    probs = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(net)

    model = Model(inputs=audio_in, outputs=probs, name='2d_convolution')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model
# Example 10
def Att_RNN_Speech(x_train, y_train, classes, sampling_rate=16000, input_length=16000, batch_size=32, epochs=3):
  """Build, train and save an attention-RNN speech classifier.

  Parameters
  ----------
  x_train : array-like
      Training waveforms, shape (n, input_length).
  y_train : array-like
      Integer class labels (sparse), one per training example.
  classes : int
      Number of output classes.
  sampling_rate, input_length : int
      Audio sampling rate and samples per example for the Mel frontend.
  batch_size, epochs : int
      Training hyper-parameters passed to ``fit``.

  Returns
  -------
  The trained keras Model (also saved to 'Att_RNN_Speech.model').
  """
  inputs = Input((input_length,))

  # (batch, samples) -> (batch, 1, samples): mono channel axis for kapre
  x = Reshape((1, -1))(inputs)

  # frozen log-Mel frontend: 80 bands, 1024-point DFT, hop 128
  m = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, input_length),
                      padding='same', sr=sampling_rate, n_mels=80,
                      fmin=40.0, fmax=sampling_rate / 2, power_melgram=1.0,
                      return_decibel_melgram=True, trainable_fb=False,
                      trainable_kernel=False)
  m.trainable = False

  x = m(x)

  x = Normalization2D(int_axis=0)(x)

  # (mel, time, ch) -> (time, mel, ch) so recurrence runs over time
  x = Permute((2, 1, 3))(x)

  x = Conv2D(10, (5, 1), activation='relu', padding='same')(x)
  x = BatchNormalization()(x)
  x = Conv2D(1, (5, 1), activation='relu', padding='same')(x)
  x = BatchNormalization()(x)

  x = Lambda(lambda q: squeeze(q, -1))(x)

  x = Bidirectional(LSTM(64, return_sequences=True))(x)
  x = Bidirectional(LSTM(64, return_sequences=True))(x)

  # dot-product attention: query from the last timestep, weighted sum over all
  xFirst = Lambda(lambda q: q[:, -1])(x)
  query = Dense(128)(xFirst)

  attScores = Dot(axes=[1, 2])([query, x])
  attScores = Softmax()(attScores)

  attVector = Dot(axes=[1, 1])([attScores, x])

  x = Dense(64, activation='relu')(attVector)
  x = Dense(32)(x)

  output = Dense(classes, activation='softmax')(x)

  model = Model(inputs=[inputs], outputs=[output])
  model.compile(optimizer='adam', loss=['sparse_categorical_crossentropy'], metrics=['sparse_categorical_accuracy'])
  model.summary()
  # BUGFIX: y_train was previously passed as validation_data instead of as
  # the training labels, so fit() trained without any targets.  The labels
  # belong in the second positional argument.
  model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, use_multiprocessing=False, workers=4, verbose=2)
  model.save('Att_RNN_Speech.model')
  # return the trained model so callers can use it without reloading from disk
  return model
# Example 11
    def __init__(self, n_hop):
        """Build a ResNet-style CNN classifier over an in-graph mel spectrogram.

        Parameters
        ----------
        n_hop : int
            STFT hop size for the Melspectrogram frontend.

        Notes
        -----
        ``L`` and ``N_CLASS`` are module-level constants, and
        ``self.resblock`` builds one residual block (defined elsewhere).
        The compiled model is stored on ``self.model``; nothing is
        returned.
        """
        # NOTE(review): sr=L reuses the input-length constant as the
        # sampling rate — confirm this is intentional.
        i = Input(shape=(1, L))
        mel = Melspectrogram(sr=L,
                             n_mels=128,
                             n_dft=2048,
                             n_hop=n_hop,
                             power_melgram=2.0,
                             return_decibel_melgram=True,
                             trainable_fb=False,
                             trainable_kernel=False,
                             input_shape=(1, L))(i)
        norm_i = BatchNormalization()(mel)

        # stage 1: 16 filters, one residual block, pool, light dropout
        conv1 = Conv2D(filters=16, kernel_size=(3, 3), padding='same')(norm_i)
        bn1 = BatchNormalization()(conv1)
        relu1 = Activation(activation='relu')(bn1)
        res1 = self.resblock(z=relu1, n_in=16, n_out=16)
        pool1 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(res1)
        drp1 = Dropout(rate=0.1)(pool1)

        # stage 2: 32 filters, two residual blocks
        conv2 = Conv2D(filters=32, kernel_size=(3, 3), padding='same')(drp1)
        bn2 = BatchNormalization()(conv2)
        relu2 = Activation(activation='relu')(bn2)
        res2 = self.resblock(z=relu2, n_in=32, n_out=32)
        res3 = self.resblock(z=res2, n_in=32, n_out=32)
        pool2 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(res3)
        drp2 = Dropout(rate=0.2)(pool2)

        # stage 3: 64 filters, two residual blocks
        conv3 = Conv2D(filters=64, kernel_size=(3, 3), padding='same')(drp2)
        bn3 = BatchNormalization()(conv3)
        relu3 = Activation(activation='relu')(bn3)
        res4 = self.resblock(z=relu3, n_in=64, n_out=64)
        res5 = self.resblock(z=res4, n_in=64, n_out=64)
        pool3 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(res5)
        drp3 = Dropout(rate=0.2)(pool3)

        # stage 4: 128 filters, two residual blocks (no pooling here)
        conv4 = Conv2D(filters=128, kernel_size=(3, 3), padding='same')(drp3)
        bn4 = BatchNormalization()(conv4)
        relu4 = Activation(activation='relu')(bn4)
        res6 = self.resblock(z=relu4, n_in=128, n_out=128)
        res7 = self.resblock(z=res6, n_in=128, n_out=128)
        drp4 = Dropout(rate=0.2)(res7)

        # stage 5: plain conv then global average pooling
        conv5 = Conv2D(filters=256, kernel_size=(3, 3), padding='same')(drp4)
        bn5 = BatchNormalization()(conv5)
        relu5 = Activation(activation='relu')(bn5)
        pool5 = GlobalAveragePooling2D()(relu5)

        dense1 = Dense(units=256, activation=relu)(pool5)
        drp5 = Dropout(rate=0.2)(dense1)
        out = Dense(units=N_CLASS, activation=softmax)(drp5)

        model = Model(inputs=[i], outputs=out)
        opt = optimizers.Adam()
        # NOTE(review): binary_crossentropy with a softmax output and a
        # categorical_accuracy metric — confirm multi-label vs multi-class
        # intent.
        model.compile(optimizer=opt,
                      loss=losses.binary_crossentropy,
                      metrics=[categorical_accuracy])

        self.model = model
# Example 12
def model_mfcc_layer(x_train, num_labels):
    """Fully-convolutional classifier with an in-graph Mel spectrogram.

    Parameters
    ----------
    x_train : sequence
        Training inputs; only ``x_train[0].shape`` is used to size the
        Input layer.
    num_labels : int
        Number of output channels/classes.

    Returns
    -------
    keras Model whose output is a 1x1 conv over globally-pooled features
    (softmax when num_labels > 1, relu otherwise).

    Notes
    -----
    ``filter_size`` and ``activation`` are module-level names defined
    elsewhere in the file.
    """
    sr = 22050
    model_input = x = Input(shape=x_train[0].shape)
    x = Melspectrogram(n_dft=512,
                       n_hop=sr // 128 + 1,
                       padding='same',
                       sr=sr,
                       n_mels=128,
                       fmin=0.0,
                       fmax=sr / 2,
                       power_melgram=2.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='trainable_stft')(x)
    # x = Spectrogram(n_dft=512, n_hop=sr // 128 + 1,
    #       return_decibel_spectrogram=False, power_spectrogram=2.0,
    #       trainable_kernel=False, name='static_stft')(x)
    # x = Normalization2D(str_axis='freq')(x)
    # x = AdditiveNoise(power=0.3)(x)

    # conv/BN/activation/pool pyramid: 16 -> 32 -> 64 -> 128 filters
    x = Conv2D(filters=16, kernel_size=filter_size, padding='same')(x)
    x = BatchNormalization()(x)
    x = activation()(x)
    x = MaxPooling2D(pool_size=2)(x)

    x = Conv2D(filters=32, kernel_size=filter_size, padding='same')(x)
    x = BatchNormalization()(x)
    x = activation()(x)
    x = MaxPooling2D(pool_size=2)(x)

    x = Conv2D(filters=64, kernel_size=filter_size, padding='same')(x)
    x = BatchNormalization()(x)
    x = activation()(x)
    x = MaxPooling2D(pool_size=2)(x)

    x = Conv2D(filters=128, kernel_size=filter_size, padding='same')(x)
    x = BatchNormalization()(x)
    x = activation()(x)
    x = MaxPooling2D(pool_size=2)(x)

    x = Conv2D(filters=256, kernel_size=filter_size, padding='same')(x)
    x = BatchNormalization()(x)
    x = activation()(x)

    # pool over the full spatial extent -> (1, 1, 256)
    x = AveragePooling2D(pool_size=(int(x.get_shape()[1]),
                                    int(x.get_shape()[2])))(x)

    # 1x1 conv acts as the classification head
    x = Conv2D(filters=num_labels,
               kernel_size=1,
               padding='valid',
               activation='softmax' if num_labels > 1 else 'relu')(x)

    model = Model(inputs=[model_input], outputs=[x])

    model.summary()
    return model
def Build_MelSpectrogram(Parametres_layer, input_length):
    """Create a frozen kapre Melspectrogram layer from a parameter dict.

    ``Parametres_layer`` must provide 'n_dft', 'n_hop', 'padding', 'sr'
    and 'n_mels'; fmin is fixed at 40 Hz and fmax at Nyquist (sr / 2).
    The returned layer is marked non-trainable.
    """
    sample_rate = Parametres_layer["sr"]
    mel_layer = Melspectrogram(
        input_shape=(1, input_length),
        n_dft=Parametres_layer["n_dft"],
        n_hop=Parametres_layer["n_hop"],
        padding=Parametres_layer["padding"],
        sr=sample_rate,
        n_mels=Parametres_layer["n_mels"],
        fmin=40.0,
        fmax=sample_rate / 2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='mel_stft')
    # keep the spectrogram frontend fixed during training
    mel_layer.trainable = False
    return mel_layer
# Example 14
def model_conv3x3_ismir2016_choi(n_out,
                                 input_shape=INPUT_SHAPE,
                                 out_activation='softmax'):
    """ A simplified model of 
    Automatic Tagging Using Deep Convolutional Neural Networks,
    K Choi, G Fazekas, M Sandler, ISMIR, 2016, New York, USA

    Symbolic summary:
    > c2 - p2 - c2 - p2 - c2 - p2 - c2 - p2 - c2 - p3 - d1

    Modifications: 
        * n_mels (96 -> 64)
        * n_channels (many -> [10, 15, 15, 20, 20])
        * remove dropout
        * maxpooling (irregular to fit the size -> all (2, 2))
        * add GlobalAveragePooling2D
    """

    model = Sequential()
    # in-graph log-Mel frontend (64 bands)
    model.add(
        Melspectrogram(sr=SR,
                       n_mels=64,
                       power_melgram=2.0,
                       return_decibel_melgram=True,
                       input_shape=input_shape))
    model.add(BatchNormalization(axis=channel_axis))

    # five conv -> BN -> relu -> 2x2 maxpool stages
    model.add(Conv2D(10, (3, 3), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))
    model.add(MaxPooling2D((2, 2), padding='same'))

    model.add(Conv2D(15, (3, 3), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))
    model.add(MaxPooling2D((2, 2), padding='same'))

    model.add(Conv2D(15, (3, 3), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))
    model.add(MaxPooling2D((2, 2), padding='same'))

    model.add(Conv2D(20, (3, 3), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))
    model.add(MaxPooling2D((2, 2), padding='same'))

    model.add(Conv2D(20, (3, 3), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))
    model.add(MaxPooling2D((2, 2), padding='same'))

    model.add(GlobalAveragePooling2D())

    model.add(Dense(n_out, activation=out_activation))

    return model
# Example 15
def model_convrnn(n_out, input_shape=(1, None), out_activation='softmax'):
    """No reference, just ConvRNN.

    Symbolic summary:
    > c2 - c2 - c2 - c2 - r2 - r2 - d1

    Parameters
    ----------
        n_out: integer, number of output nodes
        input_shape: tuple, an input shape, which doesn't include batch-axis.
                     (1, None) means (mono channel, variable length).
        out_activation: activation function on the output

    """
    assert input_shape[0] == 1, 'Mono input please!'
    model = Sequential()
    n_mels = 64
    # in-graph log-Mel frontend
    model.add(
        Melspectrogram(sr=SR,
                       n_mels=n_mels,
                       power_melgram=2.0,
                       return_decibel_melgram=True,
                       input_shape=input_shape))
    # conv stack narrowing to a single channel before the RNN
    model.add(Conv2D(32, (3, 3), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))

    model.add(Conv2D(32, (3, 3), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))

    model.add(Conv2D(16, (3, 3), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))

    model.add(Conv2D(1, (1, 1), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))

    # BUGFIX: K.image_dim_ordering() is the legacy Keras 1 API returning
    # 'th'/'tf', so comparing it to 'channels_first' was always False and
    # the channels_first branch was unreachable.  Use K.image_data_format(),
    # consistent with model_lstm_leglaive_icassp2014 in this file.
    if K.image_data_format() == 'channels_first':  # (ch, freq, time)
        model.add(Permute((3, 2, 1)))  # (time, freq, ch)
    else:  # (freq, time, ch)
        model.add(Permute((2, 1, 3)))  # (time, ch, freq)

    # model.add(Reshape((-1, n_mels * n_ch))) # (time, ch * freq)
    # Reshape for LSTM: drop the trailing singleton axis
    model.add(
        Lambda(lambda x: K.squeeze(x, axis=3),
               output_shape=squeeze_output_shape))

    model.add(LSTM(25, return_sequences=True))
    model.add(LSTM(25, return_sequences=True))

    # per-timestep classification head
    model.add(TimeDistributed(Dense(n_out, activation=out_activation)))

    return model
# Example 16
def model_multi_kernel_shape(n_out,
                             input_shape=INPUT_SHAPE,
                             out_activation='softmax'):
    """

    Symbolic summary:
    > c2' - p2 - c2 - p2 - c2 - p2 - c2 - p3 - d1
    where c2' -> multiple kernel shapes

    Parameters
    ----------
        n_out: integer, number of output nodes
        input_shape: tuple, an input shape, which doesn't include batch-axis.
        out_activation: activation function on the output
    """
    audio_input = Input(shape=input_shape)

    # in-graph log-Mel frontend (64 bands)
    x = Melspectrogram(sr=SR,
                       n_mels=64,
                       power_melgram=2.0,
                       return_decibel_melgram=True)(audio_input)
    x = BatchNormalization(axis=channel_axis)(x)

    # three parallel convs with tall / square / wide kernels ...
    x1 = Conv2D(7, (20, 3), padding='same')(x)
    x2 = Conv2D(7, (3, 3), padding='same')(x)
    x3 = Conv2D(7, (3, 20), padding='same')(x)

    # ... concatenated along the channel axis (the c2' in the summary)
    x = Concatenate(axis=channel_axis)([x1, x2, x3])

    x = BatchNormalization(axis=channel_axis)(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((2, 2), padding='same')(x)

    x = Conv2D(21, (3, 3), padding='same')(x)
    x = BatchNormalization(axis=channel_axis)(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((2, 2), padding='same')(x)

    x = Conv2D(21, (3, 3), padding='same')(x)
    x = BatchNormalization(axis=channel_axis)(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((2, 2), padding='same')(x)

    x = Conv2D(21, (3, 3), padding='same')(x)
    x = BatchNormalization(axis=channel_axis)(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((4, 4), padding='same')(x)

    x = GlobalAveragePooling2D()(x)

    out = Dense(n_out, activation=out_activation)(x)

    model = Model(audio_input, out)

    return model
# Example 17
def ConvSpeechModel(nCategories, samplingrate=16000, inputLength=16000):
    """
    Base fully convolutional model for speech recognition
    """
    inputs = Input((inputLength, ))

    # (batch, samples) -> (batch, 1, samples): mono channel axis for kapre
    net = Reshape((1, -1))(inputs)

    # frozen log-Mel frontend: 80 bands, 1024-point DFT, hop 128
    net = Melspectrogram(n_dft=1024,
                         n_hop=128,
                         input_shape=(1, inputLength),
                         padding='same',
                         sr=samplingrate,
                         n_mels=80,
                         fmin=40.0,
                         fmax=samplingrate / 2,
                         power_melgram=1.0,
                         return_decibel_melgram=True,
                         trainable_fb=False,
                         trainable_kernel=False,
                         name='mel_stft')(net)

    net = Normalization2D(int_axis=0)(net)

    # Melspectrogram emits (batch, melDim, timeSteps, 1); swap axes so
    # time comes first
    net = Permute((2, 1, 3))(net)

    # stage 1: time-only 5x1 conv, halve the time axis
    net = Conv2D(20, (5, 1), activation='relu', padding='same')(net)
    net = BatchNormalization()(net)
    net = MaxPooling2D((2, 1))(net)
    net = Dropout(0.03)(net)

    # stage 2: 3x3 conv, pool both axes
    net = Conv2D(40, (3, 3), activation='relu', padding='same')(net)
    net = BatchNormalization()(net)
    net = MaxPooling2D((2, 2))(net)
    net = Dropout(0.01)(net)

    # stage 3: 3x3 conv, pool both axes
    net = Conv2D(80, (3, 3), activation='relu', padding='same')(net)
    net = BatchNormalization()(net)
    net = MaxPooling2D((2, 2))(net)

    # dense classification head
    net = Flatten()(net)
    net = Dense(64, activation='relu')(net)
    net = Dense(32, activation='relu')(net)

    output = Dense(nCategories, activation='softmax')(net)

    model = Model(inputs=[inputs], outputs=[output], name='ConvSpeechModel')

    return model
# Example 18
    def _test_correctness():
        """ Tests correctness

        Compares the kapre Melspectrogram output against both
        librosa.feature.melspectrogram and a precomputed reference
        array, in raw power scale and in (mean relative) decibel scale.
        """
        audio_data = np.load('tests/speech_test_file.npz')['audio_data']
        sr = 44100

        hop_length = 128
        n_fft = 1024
        n_mels = 80

        # compute with librosa
        S = librosa.feature.melspectrogram(audio_data,
                                           sr=sr,
                                           n_fft=n_fft,
                                           hop_length=hop_length,
                                           n_mels=n_mels)

        S_DB_librosa = librosa.power_to_db(S, ref=np.max)

        # load precomputed
        S_expected = np.load('tests/test_audio_mel_g0.npy')

        # compute with kapre
        mels_model = tensorflow.keras.models.Sequential()
        mels_model.add(
            Melspectrogram(
                sr=sr,
                n_mels=n_mels,
                n_dft=n_fft,
                n_hop=hop_length,
                # input layout depends on the backend image data format
                input_shape=(len(audio_data),
                             1) if image_data_format() == 'channels_last' else
                (1, len(audio_data)),
                power_melgram=2,
                return_decibel_melgram=False,
                trainable_kernel=False,
                name='melgram',
            ))

        S = mels_model.predict(
            audio_data.reshape(1, -1, 1) if image_data_format() ==
            'channels_last' else audio_data.reshape(1, 1, -1))
        # strip batch and channel axes -> (n_mels, time)
        if image_data_format() == 'channels_last':
            S = S[0, :, :, 0]
        else:
            S = S[0, 0]
        S_DB_kapre = librosa.power_to_db(S, ref=np.max)

        # normalise the dB difference by librosa's dynamic range
        DB_scale = np.max(S_DB_librosa) - np.min(S_DB_librosa)
        S_DB_dif = np.abs(S_DB_kapre - S_DB_librosa) / DB_scale

        # compare expected float32 values with computed ones
        assert np.allclose(S_expected, S, rtol=1e-2, atol=1e-8)
        assert np.mean(S_DB_dif) < 0.01
# Example 19
def model_lstm_leglaive_icassp2014(n_out,
                                   input_shape=(1, None),
                                   out_activation='softmax',
                                   bidirectional=True):
    """Singing voice detection with deep recurrent neural networks
    Simon Leglaive, Romain Hennequin, Roland Badeau, ICASSP 2015

    Symbolic summary:
    > bi_r1 - bi_r1 - bi_r1 -
    > r1 - r1 - r1 - d1

    Parameters
    ----------
        n_out: integer, number of output nodes
        input_shape: tuple, an input shape, which doesn't include batch-axis.
        out_activation: activation function on the output
        bidirectional: boolean, to specify whether rnn is bidirectional or not.

    """
    assert input_shape[0] == 1, 'Mono input please!'
    model = Sequential()
    model.add(
        Melspectrogram(sr=SR,
                       n_mels=40,
                       power_melgram=2.0,
                       return_decibel_melgram=True,
                       input_shape=input_shape))

    # Rearrange to (time, freq, ch) regardless of the backend layout.
    if K.image_data_format() == 'channels_first':
        perm = (3, 2, 1)  # ch, freq, time -> time, freq, ch
    else:
        perm = (2, 1, 3)  # freq, time, ch -> time, freq, ch
    model.add(Permute(perm))

    model.add(BatchNormalization(axis=channel_axis))

    # Drop the trailing channel axis so the LSTMs see (time, freq).
    model.add(
        Lambda(lambda t: K.squeeze(t, axis=3),
               output_shape=squeeze_output_shape))

    rnn_widths = (30, 20, 40)
    if bidirectional:
        for width in rnn_widths:
            model.add(Bidirectional(LSTM(width, return_sequences=True)))
    else:
        # Doubled widths so the parameter budget matches the bi-RNN variant.
        for width in rnn_widths:
            model.add(LSTM(width * 2, return_sequences=True))

    # Per-timestep classification head.
    model.add(TimeDistributed(Dense(n_out, activation=out_activation)))

    return model
Exemplo n.º 20
0
 def __mel_spec_model(self, input_shape, n_mels, power_melgram, decibel_gram):
     """Build a two-layer front end: mel spectrogram + per-frequency norm.

     Uses the instance sample rate (self._sr); the filterbank is frozen
     (trainable_fb=False). Returns an uncompiled Sequential model.
     """
     spec_layer = Melspectrogram(
         sr=self._sr,
         n_mels=n_mels,
         power_melgram=power_melgram,
         return_decibel_melgram=decibel_gram,
         input_shape=input_shape,
         trainable_fb=False,
     )
     front_end = Sequential()
     front_end.add(spec_layer)
     front_end.add(Normalization2D(str_axis='freq'))
     return front_end
Exemplo n.º 21
0
def RNNSpeechModel(nCategories, samplingrate=16000, inputLength=16000):
    """Simple LSTM keyword-spotting model.

    Mel front end, two small Conv2D layers, stacked bidirectional LSTMs,
    and a dense softmax head over nCategories classes. Returns the
    uncompiled functional Model.
    """
    inputs = Input((inputLength, ))

    net = Reshape((1, -1))(inputs)

    net = Melspectrogram(n_dft=1024,
                         n_hop=128,
                         input_shape=(1, inputLength),
                         padding='same',
                         sr=samplingrate,
                         n_mels=80,
                         fmin=40.0,
                         fmax=samplingrate / 2,
                         power_melgram=1.0,
                         return_decibel_melgram=True,
                         trainable_fb=False,
                         trainable_kernel=False,
                         name='mel_stft')(net)

    net = Normalization2D(int_axis=0)(net)

    # Melspectrogram emits (batch, melDim, timeSteps, 1); put time first
    # so the recurrent layers below see a time-major sequence.
    net = Permute((2, 1, 3))(net)

    net = Conv2D(10, (5, 1), activation='relu', padding='same')(net)
    net = BatchNormalization()(net)
    net = Conv2D(1, (5, 1), activation='relu', padding='same')(net)
    net = BatchNormalization()(net)

    # Drop the trailing singleton channel -> (batch, time, mel).
    net = Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(net)

    net = Bidirectional(LSTM(64, return_sequences=True))(net)
    net = Bidirectional(LSTM(64))(net)

    net = Dense(64, activation='relu')(net)
    net = Dense(32, activation='relu')(net)

    output = Dense(nCategories, activation='softmax')(net)

    return Model(inputs=[inputs], outputs=[output])
Exemplo n.º 22
0
def Conv1D(input_length, num_classes):
    """Time-distributed 1-D convolution classifier on log-mel spectrograms.

    Four TimeDistributed Conv1D + max-pool stages, a fifth Conv1D stage,
    global max pooling, and a num_classes-way softmax head. Returns the
    uncompiled functional model.
    """
    inp = layers.Input(shape=(1, input_length), name='input')
    net = Melspectrogram(n_dft=512,
                         n_hop=160,
                         padding='same',
                         sr=16000,
                         n_mels=128,
                         fmin=0.0,
                         fmax=16000 / 2,
                         power_melgram=1.0,
                         return_decibel_melgram=True,
                         trainable_fb=False,
                         trainable_kernel=False,
                         name='melbands')(inp)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)
    net = layers.Permute((2, 1, 3), name='permute')(net)

    # (filters, activation, layer name) for the pooled conv stages.
    pooled_stages = [(8, 'tanh', 'td_conv_1d_tanh'),
                     (16, 'relu', 'td_conv_1d_relu_1'),
                     (32, 'relu', 'td_conv_1d_relu_2'),
                     (64, 'relu', 'td_conv_1d_relu_3')]
    for stage, (n_filters, act, conv_name) in enumerate(pooled_stages, 1):
        net = layers.TimeDistributed(layers.Conv1D(n_filters,
                                                   kernel_size=(4),
                                                   activation=act),
                                     name=conv_name)(net)
        net = layers.MaxPooling2D(pool_size=(2, 2),
                                  name='max_pool_2d_%d' % stage)(net)

    # Final conv stage has no pooling before the global max pool.
    net = layers.TimeDistributed(layers.Conv1D(128,
                                               kernel_size=(4),
                                               activation='relu'),
                                 name='td_conv_1d_relu_4')(net)
    net = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(net)
    net = layers.Dropout(rate=0.1, name='dropout')(net)
    net = layers.Dense(64,
                       activation='relu',
                       activity_regularizer=l2(0.001),
                       name='dense')(net)

    out = layers.Dense(num_classes, activation='softmax', name='softmax')(net)

    return Model(inputs=inp, outputs=out, name='1d_convolution')
Exemplo n.º 23
0
def model_conv1d_icassp2014_sander(n_out,
                                   input_shape=INPUT_SHAPE,
                                   out_activation='softmax'):
    """A simplified model of
    End-to-end learning for music audio,
    Sander Dieleman and Benjamin Schrauwen, ICASSP, 2014

    Symbolic summary:
    > c1 - p1 - c1 - p1 - c1 - p1 - p3 - d1

    Modifications: 
        * Add BatchNormalization
        * n_mels (128 -> 64; an earlier note said 32, but the code below
          uses 64)
        * n_layers (2 -> 3)
        * add GlobalAveragePooling2D

    Parameters
    ----------
        n_out: integer, number of output nodes
        input_shape: tuple, an input shape, which doesn't include batch-axis.
        out_activation: activation function on the output

    """

    model = Sequential()
    # Log-power (dB) mel front end computed inside the model.
    model.add(
        Melspectrogram(sr=SR,
                       n_mels=64,
                       power_melgram=2.0,
                       return_decibel_melgram=True,
                       input_shape=input_shape))

    # First conv has a tall (32-mel) kernel; subsequent convs are 1-D along
    # time. NOTE(review): the shape comment below is inherited from the
    # original author — verify against the actual layer output.
    model.add(Conv2D(30, (32, 4), padding='valid'))  # (None, 16, 1, N)
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))
    model.add(MaxPooling2D((1, 4), padding='same'))

    model.add(Conv2D(30, (1, 4), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))
    model.add(MaxPooling2D((1, 4), padding='same'))

    model.add(Conv2D(30, (1, 4), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))
    model.add(MaxPooling2D((1, 4), padding='same'))

    # Average over the remaining spatial extent before the dense head.
    model.add(GlobalAveragePooling2D())

    model.add(Dense(n_out, activation=out_activation))

    return model
Exemplo n.º 24
0
def depth_separable_cnn(input_shape=(1, 16000),
                        sr=16000,
                        loss=keras.losses.categorical_crossentropy,
                        optimizer=None):
    """Build and compile a depthwise-separable CNN for 12-way audio
    classification, with an in-model mel-spectrogram front end.

    Parameters
    ----------
    input_shape : tuple
        Shape of the raw waveform input, (n_channels, n_samples).
    sr : int
        Sample rate assumed by the Melspectrogram layer.
    loss : callable
        Keras loss used at compile time.
    optimizer : keras optimizer or None
        Optimizer used at compile time; ``None`` (the default) creates a
        fresh Adam instance per call.

    Returns
    -------
    A compiled keras ``Sequential`` model ending in a 12-way softmax.
    """
    # Fix: the signature previously used ``optimizer=keras.optimizers.adam()``
    # as the default. Python evaluates defaults once at import time, so every
    # model built by this function shared a single *stateful* Adam instance.
    # Create one per call instead.
    if optimizer is None:
        optimizer = keras.optimizers.adam()

    model = Sequential()
    # A mel-spectrogram layer (filterbank and STFT kernels frozen).
    model.add(
        Melspectrogram(n_dft=512,
                       n_hop=512,
                       input_shape=input_shape,
                       padding='same',
                       sr=sr,
                       n_mels=128,
                       fmin=0.0,
                       fmax=sr / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='trainable_stft'))
    # Additive white noise augmentation on the spectrogram.
    model.add(AdditiveNoise(power=0.1))
    # Per-frequency normalisation ('channel', 'time', 'batch',
    # 'data_sample' are the other options).
    model.add(Normalization2D(str_axis='freq'))
    model.add(Conv2D(64, kernel_size=(20, 8), activation='relu'))
    # NOTE(review): ``dim_ordering="th"`` is the Keras 1 spelling of what
    # Keras 2 calls ``data_format='channels_first'`` — confirm the pinned
    # Keras version still accepts it.
    model.add(MaxPooling2D(pool_size=(2, 2), dim_ordering="th"))
    model.add(Dropout(0.25))
    ## Depth Seprable Pooling Layer - start
    model.add(
        SeparableConv2D(64,
                        kernel_size=(5, 5),
                        activation='relu',
                        dim_ordering="th"))
    model.add(BatchNormalization())
    model.add(
        Conv2D(64, kernel_size=(1, 1), activation='relu', dim_ordering="th"))
    model.add(BatchNormalization())
    model.add(SeparableConv2D(64, kernel_size=(5, 5), activation='relu'))
    model.add(BatchNormalization())
    model.add(Conv2D(64, kernel_size=(1, 1), activation='relu'))
    model.add(BatchNormalization())
    ## Depth Seprable pooling Layer - end
    model.add(AveragePooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(12, activation='softmax'))
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model
Exemplo n.º 25
0
def train():
        """Build, (re)load, train, and save the mel-spectrogram CNN.

        Relies on module-level globals: channelCount, sampleCount,
        trainDataPath, testDataPath, loadData, get_class_names, modelName.
        Resumes from 'weights.hdf5' if present and checkpoints the best
        validation weights during fitting.
        """
        pool_size = (2, 2) 
        # 350 samples
        input_shape = (channelCount, sampleCount) 
        sr = 44100
        model = Sequential()
        # Mel front end: raw power (no dB), frozen filterbank and kernels.
        model.add(Melspectrogram(n_dft=512, n_hop=256, input_shape=input_shape,
                                 padding='same', sr=sr, n_mels=128,
                                 fmin=0.0, fmax=sr/2, power_melgram=1.0,
                                 return_decibel_melgram=False, trainable_fb=False,
                                 trainable_kernel=False,
                                 name='trainable_stft'))
        # White-noise augmentation applied to the spectrogram.
        model.add(AdditiveNoise(power=0.2))
        model.add(Normalization2D(str_axis='freq')) # or 'channel', 'time', 'batch', 'data_sample'
        
        
        # NOTE(review): Convolution2D(32, 3, 3) and nb_epoch below are
        # Keras 1 spellings (Conv2D(32, (3, 3)) / epochs in Keras 2) —
        # confirm the installed version.
        model.add(Convolution2D(32, 3, 3))
        model.add(BatchNormalization(axis=1 ))
        model.add(ELU(alpha=1.0))  
        model.add(MaxPooling2D(pool_size=pool_size))
        model.add(Dropout(0.25))        
        
        model.add(Flatten())
        model.add(Dense(128))
        model.add(Activation('relu'))
        model.add(Dropout(0.5))
        model.add(Dense(len(get_class_names())))
        model.add(Activation("softmax"))        
        
        model.compile('adam', 'categorical_crossentropy') 
        
        x,y=loadData(trainDataPath)
        
        
        
        # Resume from a previous run if a checkpoint file exists.
        checkpoint_filepath = 'weights.hdf5'
        print("Looking for previous weights...")
        if ( os.path.isfile(checkpoint_filepath) ):
                print ('Checkpoint file detected. Loading weights.')
                model.load_weights(checkpoint_filepath)
        else:
                print ('No checkpoint file detected.  Starting from scratch.')

        # Keep only the best weights (by validation loss) while training.
        checkpointer = ModelCheckpoint(filepath=checkpoint_filepath, verbose=1, save_best_only=True)        
        test_x,test_y=loadData(testDataPath)
        
        model.fit(x, y,batch_size=128, nb_epoch=100,verbose=1,validation_data=(test_x, test_y), callbacks=[checkpointer])
        
        model.save(modelName)
Exemplo n.º 26
0
def conv1d(input_shape, sr):
    """Time-distributed 1-D convolution classifier over mel spectrograms.

    Builds and compiles (adam / categorical crossentropy) a functional
    model: mel front end, five TimeDistributed Conv1D + max-pool stages,
    two further Conv1D stages, global max pooling, and a softmax head over
    NUM_CLASSES outputs.
    """
    inp = layers.Input(shape=input_shape, name='input')
    net = Melspectrogram(n_dft=N_DFT,
                         n_hop=HOP_LENGTH,
                         padding='same',
                         sr=sr,
                         n_mels=N_MELS,
                         fmin=0.0,
                         fmax=sr / 2,
                         power_melgram=1.0,
                         return_decibel_melgram=True,
                         trainable_fb=False,
                         trainable_kernel=False,
                         name='melbands')(inp)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)
    net = layers.Permute((2, 1, 3), name='permute')(net)

    # (filters, activation, layer name) for the pooled conv stages.
    pooled_stages = [(8, 'tanh', 'td_conv_1d_tanh'),
                     (16, 'relu', 'td_conv_1d_relu_1'),
                     (32, 'relu', 'td_conv_1d_relu_2'),
                     (64, 'relu', 'td_conv_1d_relu_3'),
                     (128, 'relu', 'td_conv_1d_relu_4')]
    for stage, (n_filters, act, conv_name) in enumerate(pooled_stages, 1):
        net = TimeDistributed(
            layers.Conv1D(n_filters, kernel_size=(4), activation=act),
            name=conv_name)(net)
        net = layers.MaxPooling2D(pool_size=(2, 2),
                                  name='max_pool_2d_%d' % stage)(net)

    # Two final conv stages without intermediate pooling.
    for n_filters, conv_name in [(256, 'td_conv_1d_relu_5'),
                                 (512, 'td_conv_1d_relu_6')]:
        net = TimeDistributed(
            layers.Conv1D(n_filters, kernel_size=(4), activation='relu'),
            name=conv_name)(net)

    net = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(net)
    net = layers.Dropout(rate=0.2, name='dropout')(net)
    net = layers.Dense(64,
                       activation='relu',
                       activity_regularizer=l2(0.001),
                       name='dense')(net)
    out = layers.Dense(NUM_CLASSES, activation='softmax', name='softmax')(net)

    model = Model(inputs=inp, outputs=out, name='1d_convolution')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model
Exemplo n.º 27
0
def convolution_speech_model(num_category,
                             sampling_rate=16000,
                             input_length=16000):
    """CNN speech-command classifier.

    Mel front end, three Conv2D blocks (BN + max-pool, the first two with
    light dropout), and a small dense softmax head over num_category
    classes. Returns the uncompiled functional Model.
    """
    inputs = layers.Input((input_length, ))

    net = layers.Reshape((1, -1))(inputs)

    net = Melspectrogram(n_dft=1024,
                         n_hop=128,
                         input_shape=(1, input_length),
                         padding='same',
                         sr=sampling_rate,
                         n_mels=80,
                         fmin=40.0,
                         fmax=sampling_rate / 2,
                         power_melgram=1.0,
                         return_decibel_melgram=True,
                         trainable_fb=False,
                         trainable_kernel=False,
                         name='mel_stft')(net)

    net = Normalization2D(int_axis=0)(net)

    # (batch, mel, time, 1) -> (batch, time, mel, 1)
    net = layers.Permute((2, 1, 3))(net)

    # Block 1: time-only kernel and pooling, light dropout.
    net = layers.Conv2D(20, (5, 1), activation='relu', padding='same')(net)
    net = layers.BatchNormalization()(net)
    net = layers.MaxPooling2D((2, 1))(net)
    net = layers.Dropout(0.03)(net)

    # Block 2.
    net = layers.Conv2D(40, (3, 3), activation='relu', padding='same')(net)
    net = layers.BatchNormalization()(net)
    net = layers.MaxPooling2D((2, 2))(net)
    net = layers.Dropout(0.01)(net)

    # Block 3 (no dropout).
    net = layers.Conv2D(80, (3, 3), activation='relu', padding='same')(net)
    net = layers.BatchNormalization()(net)
    net = layers.MaxPooling2D((2, 2))(net)

    net = layers.Flatten()(net)
    net = layers.Dense(64, activation='relu')(net)
    net = layers.Dense(32, activation='relu')(net)

    output = layers.Dense(num_category, activation='softmax')(net)

    return Model(inputs=[inputs], outputs=[output], name='ConvSpeechModel')
Exemplo n.º 28
0
    def _test_stereo_same():
        """Tests for
            - stereo input
            - same padding
            - shapes of output channel, n_freq, n_frame
            - save and load a model with it

        """
        # NOTE(review): the save/load check mentioned in the docstring is
        # not present in this body — confirm whether it was dropped.
        n_ch = 2
        sr = 8000
        n_mels = 64
        fmin, fmax = 200, sr // 2
        n_dft, len_hop, nsp_src = 512, 256, 8000
        # Random stereo source, laid out per the backend's channel ordering.
        if image_data_format() == 'channels_last':
            src = np.random.uniform(-1.0, 1.0, (nsp_src, n_ch))
        else:
            src = np.random.uniform(-1.0, 1.0, (n_ch, nsp_src))

        model = tensorflow.keras.models.Sequential()
        model.add(
            Melspectrogram(
                sr=sr,
                n_mels=n_mels,
                fmin=fmin,
                fmax=fmax,
                n_dft=n_dft,
                n_hop=len_hop,
                padding='same',
                power_melgram=1.0,
                return_decibel_melgram=False,
                image_data_format='default',
                input_shape=(nsp_src, n_ch)
                if image_data_format() == 'channels_last' else (n_ch, nsp_src),
            ))

        # Add a leading batch axis of size 1 for predict().
        batch_melgram_kapre = model.predict(src[np.newaxis, ...])

        # With 'same' padding the frame count depends only on input length
        # and hop (via the _num_frame_same helper).
        if image_data_format() == 'channels_last':
            assert batch_melgram_kapre.shape[3] == n_ch
            assert batch_melgram_kapre.shape[1] == n_mels
            assert batch_melgram_kapre.shape[2] == _num_frame_same(
                nsp_src, len_hop)
        else:
            assert batch_melgram_kapre.shape[1] == n_ch
            assert batch_melgram_kapre.shape[2] == n_mels
            assert batch_melgram_kapre.shape[3] == _num_frame_same(
                nsp_src, len_hop)
Exemplo n.º 29
0
def build_kapre_model(input_shape, sr, lr, summary=False):
    """Regression CNN over an in-model mel spectrogram.

    Four SeparableConv2D blocks (BN + ReLU + max-pool + 0.5 dropout) on
    top of a kapre Melspectrogram front end, ending in a 3-unit linear
    output. Compiled with MSE loss and Adam(lr); optionally prints a
    summary. Returns the compiled Sequential model.
    """
    model = Sequential()
    model.add(
        Melspectrogram(n_dft=256,
                       n_hop=256,
                       input_shape=input_shape,
                       padding='same',
                       sr=sr,
                       n_mels=96,
                       fmin=0.0,
                       fmax=sr / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=False,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='mel'))

    # (filters, pool_size) for each separable-conv block.
    block_specs = ((128, (2, 4)), (384, (4, 5)), (768, (3, 8)), (2048, (4, 8)))
    for block_idx, (n_filters, pool_size) in enumerate(block_specs):
        conv_kwargs = {'padding': 'same'}
        if block_idx == 0:
            # The original also passed input_shape to the first conv layer;
            # kept for exact parity.
            conv_kwargs['input_shape'] = input_shape
        model.add(SeparableConv2D(n_filters, (3, 3), **conv_kwargs))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=pool_size))
        model.add(Dropout(0.5))

    model.add(Flatten())
    model.add(Dense(3, activation='linear'))
    model.compile(loss=losses.mean_squared_error,
                  optimizer=optimizers.Adam(lr=lr))

    if summary:
        model.summary()

    return model
def test_plot():
    """Visual smoke test: plot kapre's Melspectrogram and Spectrogram
    outputs and compare the latter against a librosa log-spectrogram.

    Uses the module-level helpers check_model / visualise_model and shows
    figures interactively; there are no assertions.
    """
    SR = 16000
    src = np.random.random((1, SR * 3))
    # NOTE(review): hard-coded absolute path — this only runs on the
    # original author's machine; parameterize or ship a fixture file.
    src_cute, _ = librosa.load(
        '/Users/admin/Dropbox/workspace/unet/data/audio/abjones_1_01.wav',
        sr=SR,
        mono=True)
    model = Sequential()
    # Mel spectrogram in dB, with trainable STFT kernels.
    model.add(
        Melspectrogram(sr=SR,
                       n_mels=128,
                       n_dft=512,
                       n_hop=256,
                       input_shape=src.shape,
                       return_decibel_melgram=True,
                       trainable_kernel=True,
                       name='melgram'))

    check_model(model)
    visualise_model(model)

    # Second model: plain (non-mel) power spectrogram, fixed kernels.
    SR = 16000
    src = np.random.random((1, SR * 3))
    model = Sequential()
    model.add(
        Spectrogram(n_dft=512,
                    n_hop=256,
                    input_shape=src.shape,
                    return_decibel_spectrogram=False,
                    power_spectrogram=2.0,
                    trainable_kernel=False,
                    name='static_stft'))

    check_model(model)
    plt.figure(figsize=(14, 4))
    plt.subplot(1, 2, 1)
    plt.title('log-Spectrogram by Kapre')
    visualise_model(model, logam=True)
    plt.subplot(1, 2, 2)
    # NOTE(review): positional n_fft/hop_length args to librosa.stft are
    # keyword-only in recent librosa versions — confirm the pinned version.
    display.specshow(librosa.amplitude_to_db(np.abs(
        librosa.stft(src_cute[:SR * 3], 512, 256))**2,
                                             ref=1.0),
                     y_axis='linear',
                     sr=SR)
    plt.title('log-Spectrogram by Librosa')
    plt.show()