Example #1
# Imports assumed by this example (kapre < 0.3 API):
from tensorflow.keras import backend as K
from tensorflow.keras import layers as L
from tensorflow.keras.models import Model
from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D


def AttRNNSpeechModel(nCategories, samplingrate=16000,
                      inputLength=16000, rnn_func=L.LSTM):
    # BiLSTM encoder with dot-product attention
    sr = samplingrate
    iLen = inputLength

    inputs = L.Input((inputLength,), name='input')

    x = L.Reshape((1, -1))(inputs)

    m = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, iLen),
                       padding='same', sr=sr, n_mels=80,
                       fmin=40.0, fmax=sr / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')
    m.trainable = False

    x = m(x)

    x = Normalization2D(int_axis=0, name='mel_stft_norm')(x)

    # note that Melspectrogram puts the sequence in shape (batch_size, melDim, timeSteps, 1)
    # we would rather have it the other way around for LSTMs

    x = L.Permute((2, 1, 3))(x)

    x = L.Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)
    x = L.Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)

    # x = Reshape((125, 80)) (x)
    # keras.backend.squeeze(x, axis)
    x = L.Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(x)

    x = L.Bidirectional(rnn_func(64, return_sequences=True))(x)  # [b_s, seq_len, vec_dim]
    x = L.Bidirectional(rnn_func(64, return_sequences=True))(x)  # [b_s, seq_len, vec_dim]

    xFirst = L.Lambda(lambda q: q[:, -1])(x)  # [b_s, vec_dim]; last timestep as the attention query
    query = L.Dense(128)(xFirst)

    # dot product attention
    attScores = L.Dot(axes=[1, 2])([query, x])
    attScores = L.Softmax(name='attSoftmax')(attScores)  # [b_s, seq_len]

    # attention-weighted sum over the sequence
    attVector = L.Dot(axes=[1, 1])([attScores, x])  # [b_s, vec_dim]

    x = L.Dense(64, activation='relu')(attVector)
    x = L.Dense(32)(x)

    output = L.Dense(nCategories, activation='softmax', name='output')(x)

    model = Model(inputs=[inputs], outputs=[output])

    return model
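For reference, a minimal usage sketch for the function above; the 36-class setup is a placeholder, not part of the original example:

# Hypothetical usage: 36 classes, 1 s of 16 kHz audio per sample.
model = AttRNNSpeechModel(nCategories=36)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])
model.summary()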
Example #2
# Assumed imports, as in Example #1, but using the full
# `tensorflow.keras.layers` module name instead of the `L` alias.
def attention_speech_model(num_category,
                           sampling_rate=16000,
                           input_length=16000):

    inputs = layers.Input((input_length, ), name='input')
    x = layers.Reshape((1, -1))(inputs)

    m = Melspectrogram(input_shape=(1, input_length),
                       n_dft=1024,
                       n_hop=128,
                       padding='same',
                       sr=sampling_rate,
                       n_mels=80,
                       fmin=40.0,
                       fmax=sampling_rate / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')
    m.trainable = False
    x = m(x)

    x = Normalization2D(int_axis=0, name='norm')(x)
    x = layers.Permute((2, 1, 3))(x)

    # no built-in activation here: a ReLU before LeakyReLU would make the
    # LeakyReLU a no-op, so the LeakyReLU layer serves as the activation
    x = layers.Conv2D(10, (5, 1), padding='same')(x)
    x = layers.LeakyReLU()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)

    x = layers.Lambda(lambda t: K.squeeze(t, -1), name='squeeze_last_dim')(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)

    # middle timestep as the attention query (Example #1 uses the last, q[:, -1])
    x_first = layers.Lambda(lambda t: t[:, t.shape[1] // 2])(x)
    query = layers.Dense(128)(x_first)

    attention_scores = layers.Dot(axes=[1, 2])([query, x])
    attention_scores = layers.Softmax(
        name='attention_softmax')(attention_scores)
    attention_vector = layers.Dot(axes=[1, 1])([attention_scores, x])

    x = layers.Dense(64)(attention_vector)
    x = layers.LeakyReLU()(x)
    x = layers.Dropout(0.5)(x)

    x = layers.Dense(32)(x)
    x = layers.Dropout(0.5)(x)

    out = layers.Dense(num_category, activation='softmax', name="output")(x)
    model = Model(inputs=inputs, outputs=out)
    return model
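The dot-product attention used in these examples reduces the BiLSTM sequence to a single vector. A standalone NumPy sketch of the same arithmetic (shapes follow the [b_s, seq_len, vec_dim] comments; all names here are illustrative):

import numpy as np

b_s, seq_len, vec_dim = 2, 125, 128         # batch, timesteps, 2 * 64 BiLSTM units
x = np.random.randn(b_s, seq_len, vec_dim)  # stand-in for the BiLSTM output
query = np.random.randn(b_s, vec_dim)       # stand-in for the Dense(128) query

scores = np.einsum('bd,btd->bt', query, x)                  # Dot(axes=[1, 2])
s = scores - scores.max(axis=1, keepdims=True)              # numerically stable
weights = np.exp(s) / np.exp(s).sum(axis=1, keepdims=True)  # Softmax over time
att_vector = np.einsum('bt,btd->bd', weights, x)            # Dot(axes=[1, 1])
print(att_vector.shape)                                     # (2, 128)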
Example #3
# Imports assumed by this example (kapre < 0.3 API):
from tensorflow.keras.backend import squeeze
from tensorflow.keras.layers import (Input, Reshape, Permute, Conv2D,
                                     BatchNormalization, Lambda, Bidirectional,
                                     LSTM, Dense, Dot, Softmax)
from tensorflow.keras.models import Model
from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D


def Att_RNN_Speech(x_train, y_train, classes, sampling_rate=16000,
                   input_length=16000, batch_size=32, epochs=3):

  inputs = Input((input_length,))

  x = Reshape((1, -1))(inputs)

  m = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, input_length),
                      padding='same', sr=sampling_rate, n_mels=80,
                      fmin=40.0, fmax=sampling_rate / 2, power_melgram=1.0,
                      return_decibel_melgram=True, trainable_fb=False,
                      trainable_kernel=False)
  m.trainable = False

  x = m(x)

  x = Normalization2D(int_axis=0)(x)

  x = Permute((2, 1, 3))(x)

  x = Conv2D(10, (5, 1), activation='relu', padding='same')(x)
  x = BatchNormalization()(x)
  x = Conv2D(1, (5, 1), activation='relu', padding='same')(x)
  x = BatchNormalization()(x)

  x = Lambda(lambda q: squeeze(q, -1))(x)

  x = Bidirectional(LSTM(64, return_sequences=True))(x)
  x = Bidirectional(LSTM(64, return_sequences=True))(x)

  xFirst = Lambda(lambda q: q[:, -1])(x)
  query = Dense(128)(xFirst)

  attScores = Dot(axes=[1, 2])([query, x])
  attScores = Softmax()(attScores)

  attVector = Dot(axes=[1, 1])([attScores, x])

  x = Dense(64, activation='relu')(attVector)
  x = Dense(32)(x)

  output = Dense(classes, activation='softmax')(x)

  model = Model(inputs=[inputs], outputs=[output])
  model.compile(optimizer='adam', loss=['sparse_categorical_crossentropy'], metrics=['sparse_categorical_accuracy'])
  model.summary()
  model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=2)
  model.save('Att_RNN_Speech.model')
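A hypothetical smoke test for the function above, with random data standing in for a real dataset (the 10-class setup is an assumption):

import numpy as np

x_train = np.random.randn(64, 16000).astype('float32')  # 64 clips of 1 s at 16 kHz
y_train = np.random.randint(0, 10, size=(64,))          # integer labels
Att_RNN_Speech(x_train, y_train, classes=10, epochs=1)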
Example #4
# Assumes `from kapre.time_frequency import Melspectrogram` (kapre < 0.3).
def Build_MelSpectrogram(Parametres_layer, input_length):

    mel_layer = Melspectrogram(n_dft=Parametres_layer["n_dft"],
                               n_hop=Parametres_layer["n_hop"],
                               input_shape=(1, input_length),
                               padding=Parametres_layer["padding"],
                               sr=Parametres_layer["sr"],
                               n_mels=Parametres_layer["n_mels"],
                               fmin=40.0,
                               fmax=Parametres_layer["sr"] / 2,
                               power_melgram=1.0,
                               return_decibel_melgram=True,
                               trainable_fb=False,
                               trainable_kernel=False,
                               name='mel_stft')
    mel_layer.trainable = False

    return mel_layer
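A hypothetical parameter dict matching the keys the helper reads (the values mirror the other examples):

mel_params = {
    'n_dft': 1024,
    'n_hop': 128,
    'padding': 'same',
    'sr': 16000,
    'n_mels': 80,
}
mel_layer = Build_MelSpectrogram(mel_params, input_length=16000)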
Example #5
# Assumed imports, as in Example #3 (bare tensorflow.keras layer names,
# `K` for the backend, and the kapre < 0.3 layers).
def attRNN():
    sr = 8000
    inputs = Input((8000, 1), name='input')

    x = Reshape((1, -1))(inputs)

    m = Melspectrogram(n_dft=1024,
                       n_hop=128,
                       input_shape=(1, 8000),
                       padding='same',
                       sr=sr,
                       n_mels=80,
                       fmin=40.0,
                       fmax=sr / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')
    m.trainable = False

    x = m(x)

    x = Normalization2D(int_axis=0, name='mel_stft_norm')(x)

    # note that Melspectrogram puts the sequence in shape (batch_size, melDim, timeSteps, 1)
    # we would rather have it the other way around for LSTMs

    x = Permute((2, 1, 3))(x)

    x = Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)

    # x = Reshape((125, 80)) (x)
    # keras.backend.squeeze(x, axis)
    x = Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(x)

    x = Bidirectional(LSTM(64, return_sequences=True))(x)  # [b_s, seq_len, vec_dim]
    x = Bidirectional(LSTM(64, return_sequences=True))(x)  # [b_s, seq_len, vec_dim]

    xFirst = Lambda(lambda q: q[:, -1])(x)  # [b_s, vec_dim]
    query = Dense(128)(xFirst)

    # dot product attention
    attScores = Dot(axes=[1, 2])([query, x])
    attScores = Softmax(name='attSoftmax')(attScores)  # [b_s, seq_len]

    # attention-weighted sum over the sequence
    attVector = Dot(axes=[1, 1])([attScores, x])  # [b_s, vec_dim]

    x = Dense(64, activation='relu')(attVector)
    x = Dense(32)(x)

    output = Dense(9, activation='softmax', name='output')(x)

    model = Model(inputs=[inputs], outputs=[output])

    model.compile(optimizer='adam',
                  loss=['sparse_categorical_crossentropy'],
                  metrics=['sparse_categorical_accuracy'])
    model.summary()

    return model
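A quick sanity check for the 8 kHz variant above (dummy batch only; not from the original):

import numpy as np

model = attRNN()
batch = np.zeros((4, 8000, 1), dtype='float32')  # matches the (8000, 1) input
preds = model.predict(batch)                     # shape (4, 9)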
Example #6
# Assumed imports, as in Example #1 (`L` for tensorflow.keras.layers,
# `K` for the backend, `Model`, and the kapre < 0.3 layers).
def RNN_model(N_CLASSES=2, SR=16000, DT=2.0):

    rnn_func = L.LSTM

    inputs = L.Input(shape=(1, int(SR * DT)), name='input')

    x = L.Reshape((1, -1))(inputs)

    m = Melspectrogram(n_dft=1024,
                       n_hop=128,
                       padding='same',
                       sr=SR,
                       n_mels=80,
                       fmin=40,
                       fmax=SR / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')

    m.trainable = False

    x = m(x)

    x = Normalization2D(int_axis=0, name='mel_stft_norm')(x)

    x = L.Permute((2, 1, 3))(x)

    x = L.Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)
    x = L.Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = L.BatchNormalization()(x)

    x = L.Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(x)

    x = L.Bidirectional(rnn_func(64, return_sequences=True))(x)

    x = L.Bidirectional(rnn_func(64, return_sequences=True))(x)

    xFirst = L.Lambda(lambda q: q[:, -1])(x)
    query = L.Dense(128)(xFirst)

    attScores = L.Dot(axes=[1, 2])([query, x])
    attScores = L.Softmax(name='attSoftmax')(attScores)

    attVector = L.Dot(axes=[1, 1])([attScores, x])

    x = L.Dense(64, activation='relu')(attVector)
    x = L.Dense(32)(x)

    output = L.Dense(N_CLASSES, activation='softmax', name='output')(x)

    model = Model(inputs=[inputs], outputs=[output])
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    model.summary()

    # model.fit(x_train,y_train,batch_size=32,epochs=5
    #             #callbacks=[earlystopper, checkpointer, lrate]
    #             )
    # from keras.models import load_model
    # model.save('test_model_RNN.h5')

    ##### OLD Version RNN Model  #####

    # i = L.Input(shape=(1,int(SR*DT)), name='input')
    # x = Melspectrogram(n_dft=512, n_hop=160, padding='same', sr=SR, n_mels=128, fmin=0.0,
    #                     fmax=SR/2, power_melgram=1.0, return_decibel_melgram=True,
    #                     trainable_fb=False, trainable_kernel=False,name='melbands')(i)
    # x = Normalization2D(str_axis='batch', name='batch_norm')(x)
    # x = L.Permute((2,1,3), name='permute')(x)
    # x = TimeDistributed(L.Reshape((-1,)), name='reshape')(x)
    # s = TimeDistributed(L.Dense(64, activation='tanh'), name='td_dense_tanh')(x)
    # x = L.Bidirectional(L.LSTM(32, return_sequences=True), name='bidirectional_lstm')(s)
    # x = L.concatenate([s, x], axis=2, name='skip_connection')
    # x = L.Dense(64, activation='relu', name='dense_1_relu')(x)
    # x = L.MaxPooling1D(name='max_pool_1d')(x)
    # x = L.Dense(32, activation='relu', name='dense_2_relu')(x)
    # x = L.Flatten(name='flatten')(x)
    # x = L.Dropout(rate=0.2, name='dropout')(x)
    # x = L.Dense(32, activation='relu', activity_regularizer=l2(0.001),name='dense_3_relu')(x)
    # o = L.Dense(N_CLASSES, activation='softmax', name='softmax')(x)

    # model = Model(inputs=i, outputs=o, name='long_short_term_memory')
    # model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model
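And a usage sketch for the last example; since it compiles with 'categorical_crossentropy', the labels must be one-hot encoded (a hypothetical call, not from the original):

import numpy as np

model = RNN_model(N_CLASSES=2, SR=16000, DT=2.0)  # expects (1, 32000) inputs
x = np.zeros((4, 1, 32000), dtype='float32')
y = np.eye(2)[np.random.randint(0, 2, size=4)]    # one-hot labels
model.fit(x, y, batch_size=2, epochs=1)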