def createMelSpectrogram(input_path, fileName, output_path, saveOrShow=0):
    """Compute a decibel mel spectrogram for one audio file, then save it as a
    PNG (saveOrShow == 0) or display it on screen (any other value)."""
    # Load at most 10 seconds of audio, resampled to 16 kHz mono.
    signal, sr = librosa.load(os.path.join(input_path, fileName),
                              duration=10, sr=16000)

    # Kapre's Melspectrogram expects (batch, channel, samples); call the layer
    # directly on the reshaped waveform and pull the result back as numpy.
    mel = Melspectrogram(n_dft=1024,
                         n_hop=320,
                         input_shape=(1, signal.shape[0]),
                         padding='same',
                         sr=sr,
                         n_mels=224,
                         fmin=1400,
                         fmax=sr / 2,
                         power_melgram=2.0,
                         return_decibel_melgram=True,
                         trainable_fb=False,
                         trainable_kernel=False)(signal.reshape(1, 1, -1)).numpy()

    # Drop the batch/channel axes: keep (n_mels, time).
    mel = mel.reshape(mel.shape[1], mel.shape[2])
    print(mel.shape)

    if saveOrShow == 0:
        matplotlib.image.imsave(
            os.path.join(output_path, fileName.split(".")[0] + ".png"),
            mel, cmap='inferno')
    else:
        display.specshow(mel, sr=sr)
        plt.show()
def AttRNNSpeechModel(nCategories, samplingrate=16000, inputLength=16000,
                      rnn_func=L.LSTM):
    """Attention-augmented RNN for keyword/speech classification.

    Raw waveform -> frozen mel spectrogram -> small conv front-end ->
    two bidirectional RNNs -> dot-product attention -> dense classifier.
    """
    sr = samplingrate
    iLen = inputLength

    inputs = L.Input((inputLength,), name='input')
    net = L.Reshape((1, -1))(inputs)

    # Frozen (non-trainable) mel front-end, output in decibels.
    mel_layer = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, iLen),
                               padding='same', sr=sr, n_mels=80,
                               fmin=40.0, fmax=sr / 2, power_melgram=1.0,
                               return_decibel_melgram=True, trainable_fb=False,
                               trainable_kernel=False, name='mel_stft')
    mel_layer.trainable = False
    net = mel_layer(net)
    net = Normalization2D(int_axis=0, name='mel_stft_norm')(net)

    # Melspectrogram emits (batch, melDim, timeSteps, 1); RNNs want time first.
    net = L.Permute((2, 1, 3))(net)

    net = L.Conv2D(10, (5, 1), activation='relu', padding='same')(net)
    net = L.BatchNormalization()(net)
    net = L.Conv2D(1, (5, 1), activation='relu', padding='same')(net)
    net = L.BatchNormalization()(net)

    # Squeeze the trailing channel axis -> (batch, seq_len, vec_dim).
    net = L.Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(net)

    net = L.Bidirectional(rnn_func(64, return_sequences=True))(net)
    net = L.Bidirectional(rnn_func(64, return_sequences=True))(net)

    # Query: projection of the final timestep.
    last_step = L.Lambda(lambda q: q[:, -1])(net)   # (batch, vec_dim)
    query = L.Dense(128)(last_step)

    # Dot-product attention over the sequence, then rescale it.
    scores = L.Dot(axes=[1, 2])([query, net])
    scores = L.Softmax(name='attSoftmax')(scores)   # (batch, seq_len)
    context = L.Dot(axes=[1, 1])([scores, net])     # (batch, vec_dim)

    out = L.Dense(64, activation='relu')(context)
    out = L.Dense(32)(out)
    output = L.Dense(nCategories, activation='softmax', name='output')(out)

    return Model(inputs=[inputs], outputs=[output])
def build_model_vggish(classes, dropout_final=0.2, shape=(None, 320000),
                       sr=16000, rnn_type='gru', rnn_units=256,
                       focal_alpha=0.95, rnn_layers=1, rnn_dropout=0.2,
                       activation='elu', random_noise=0.2, weights='soundnet'):
    """CRNN classifier: mel spectrogram -> (frozen) VGGish features -> RNN head.

    Returns a `(model, vggish)` tuple so the caller can unfreeze the backbone
    later if desired.
    """
    inputs = keras.Input(shape=shape[1:])
    h = keras.layers.Reshape(target_shape=(1, -1))(inputs)
    h = Melspectrogram(n_dft=512, n_hop=256, padding='same', sr=sr,
                       n_mels=64, fmin=125, fmax=7500, power_melgram=1.0,
                       return_decibel_melgram=True, name='trainable_stft')(h)

    # Optional train-time augmentation with random additive noise.
    if random_noise:
        h = AdditiveNoise(power=random_noise, random_gain=True)(h)
    h = Normalization2D(str_axis='freq')(h)

    # (batch, freq, time, ch) -> (batch, time, freq, ch) for the backbone.
    h = Lambda(lambda x: K.permute_dimensions(x=x, pattern=(0, 2, 1, 3)),
               name="transpose")(h)

    backbone = VGGish(include_top=False, load_weights=weights,
                      input_shape=h.get_shape().as_list()[1:], pooling=None)
    if weights is not None:
        # Freeze only when pretrained weights are loaded.
        for layer in backbone.layers:
            layer.trainable = False
    h = backbone(h)

    h = keras.layers.AveragePooling2D(pool_size=(1, 4))(h)
    h = keras.layers.Reshape(target_shape=(-1, 512))(h)

    outputs = rnn_classifier_branch(h, name='rnn', dropout=rnn_dropout,
                                    dropout_final=dropout_final,
                                    rnn_units=rnn_units, rnn_type=rnn_type,
                                    n_classes=len(classes),
                                    rnn_layers=rnn_layers)

    model = keras.Model(inputs=inputs, outputs=outputs, name='crnn')
    model.summary()
    return model, backbone
def attention_speech_model(num_category, sampling_rate=16000, input_length=16000):
    """Attention-based RNN speech classifier over a frozen mel spectrogram.

    Parameters
    ----------
    num_category: int, number of output classes.
    sampling_rate: int, audio sample rate in Hz.
    input_length: int, number of waveform samples per example.
    """
    inputs = layers.Input((input_length, ), name='input')
    x = layers.Reshape((1, -1))(inputs)

    # Frozen mel front-end (decibel output).
    m = Melspectrogram(input_shape=(1, input_length), n_dft=1024, n_hop=128,
                       padding='same', sr=sampling_rate, n_mels=80,
                       fmin=40.0, fmax=sampling_rate / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False, name='mel_tft')
    m.trainable = False
    x = m(x)
    x = Normalization2D(int_axis=0, name='norm')(x)

    # (batch, mel, time, 1) -> (batch, time, mel, 1)
    x = layers.Permute((2, 1, 3))(x)

    # BUG FIX: this conv previously used activation='relu' *and* was followed
    # by LeakyReLU. After ReLU every value is >= 0, so the leaky slope could
    # never fire and the LeakyReLU was dead code. Make the conv linear so the
    # LeakyReLU is the actual activation, as intended.
    x = layers.Conv2D(10, (5, 1), padding='same')(x)
    x = layers.LeakyReLU()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)

    # Squeeze channel axis -> (batch, seq_len, vec_dim).
    x = layers.Lambda(lambda t: K.squeeze(t, -1), name='squeeze_last_dim')(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)

    # Attention query taken from the middle timestep of the sequence.
    x_first = layers.Lambda(lambda t: t[:, t.shape[1] // 2])(x)
    query = layers.Dense(128)(x_first)

    attention_scores = layers.Dot([1, 2])([query, x])
    attention_scores = layers.Softmax(
        name='attention_softmax')(attention_scores)
    attention_vector = layers.Dot(axes=[1, 1])([attention_scores, x])

    x = layers.Dense(64)(attention_vector)
    x = layers.LeakyReLU()(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(32)(x)
    x = layers.Dropout(0.5)(x)
    out = layers.Dense(num_category, activation='softmax', name="output")(x)

    return Model(inputs=inputs, outputs=out)
def Conv1D(N_CLASSES=3, SR=5000, DT=0.25):
    """Time-distributed 1-D convolution classifier over a mel spectrogram.

    N_CLASSES: number of output classes; SR: sample rate; DT: clip length (s).
    Returns a compiled Keras model.
    """
    i = layers.Input(shape=(1, int(SR * DT)), name='input')
    net = Melspectrogram(n_dft=512, n_hop=160, padding='same', sr=SR,
                         n_mels=128, fmin=0.0, fmax=SR / 2, power_melgram=2.0,
                         return_decibel_melgram=True, trainable_fb=False,
                         trainable_kernel=False, name='melbands')(i)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)
    # (batch, mel, time, 1) -> (batch, time, mel, 1) so TimeDistributed
    # applies a Conv1D per time frame.
    net = layers.Permute((2, 1, 3), name='permute')(net)

    net = TimeDistributed(layers.Conv1D(8, kernel_size=(4), activation='tanh'),
                          name='td_conv_1d_tanh')(net)
    net = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_1')(net)
    net = TimeDistributed(layers.Conv1D(16, kernel_size=(4), activation='relu'),
                          name='td_conv_1d_relu_1')(net)
    net = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_2')(net)
    net = TimeDistributed(layers.Conv1D(32, kernel_size=(4), activation='relu'),
                          name='td_conv_1d_relu_2')(net)

    net = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(net)
    net = layers.Dropout(rate=0.1, name='dropout')(net)
    net = layers.Dense(64, activation='relu',
                       activity_regularizer=l2(0.001), name='dense')(net)
    o = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(net)

    model = Model(inputs=i, outputs=o, name='1d_convolution')
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
def deep_net_i(self, feature_count):
    """Build and compile a small conv + dense net on a mel-spectrogram input.

    feature_count: number of raw waveform samples per example.
    """
    model = Sequential()
    model.add(
        Melspectrogram(sr=SR, n_mels=128, power_melgram=1.0,
                       input_shape=(1, feature_count),
                       trainable_fb=False, fmin=800, fmax=8000))
    # NOTE(review): `Convolution2D(32, 9, 9)` is the Keras-1 positional API
    # (nb_filter, nb_row, nb_col). Under Keras 2 these positionals would be
    # read as kernel_size=9, strides=9 — confirm which Keras version this
    # project targets before migrating.
    model.add(Convolution2D(32, 9, 9, name='conv1', activation='relu'))
    model.add(MaxPooling2D((25, 17)))
    model.add(Flatten())

    # Two pairs of dense layers, each followed by dropout for regularization.
    for _ in range(2):
        model.add(Dense(32, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.2))

    model.add(Dense(32, kernel_initializer='normal', activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
def LSTM(N_CLASSES=10, SR=16000, DT=1.0):
    """Bidirectional-LSTM classifier with a skip connection over mel frames.

    N_CLASSES: number of output classes; SR: sample rate; DT: clip length (s).
    Returns a compiled Keras model.
    """
    i = layers.Input(shape=(1, int(SR * DT)), name='input')
    h = Melspectrogram(n_dft=512, n_hop=160, padding='same', sr=SR,
                       n_mels=128, fmin=0.0, fmax=SR / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False, name='melbands')(i)
    h = Normalization2D(str_axis='batch', name='batch_norm')(h)
    # Reorder to (batch, time, mel, 1) and flatten each frame to a vector.
    h = layers.Permute((2, 1, 3), name='permute')(h)
    h = TimeDistributed(layers.Reshape((-1,)), name='reshape')(h)

    # Per-frame embedding, kept around for the skip connection below.
    embedded = TimeDistributed(layers.Dense(64, activation='tanh'),
                               name='td_dense_tanh')(h)
    h = layers.Bidirectional(layers.LSTM(32, return_sequences=True),
                             name='bidirectional_lstm')(embedded)
    h = layers.concatenate([embedded, h], axis=2, name='skip_connection')

    h = layers.Dense(64, activation='relu', name='dense_1_relu')(h)
    h = layers.MaxPooling1D(name='max_pool_1d')(h)
    h = layers.Dense(32, activation='relu', name='dense_2_relu')(h)
    h = layers.Flatten(name='flatten')(h)
    h = layers.Dropout(rate=0.2, name='dropout')(h)
    h = layers.Dense(32, activation='relu',
                     activity_regularizer=l2(0.001), name='dense_3_relu')(h)
    o = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(h)

    model = Model(inputs=i, outputs=o, name='long_short_term_memory')
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
def raw_vgg(args, input_length=12000 * 29, tf='melgram', normalize=None,
            decibel=False, last_layer=True, sr=None):
    """VGG-style conv stack on a raw-waveform input via a Melspectrogram layer.

    When length = 12000*29 and 512/256 dft/hop, melgram size: (n_mels, 1360).

    args: namespace carrying fmin/fmax/n_mels/trainable_fb/trainable_kernel/
          conv_until; tf: time-frequency representation name;
    normalize: Normalization2D axis or a falsy value for none;
    decibel: return the melgram in dB; last_layer: append a linear Dense(1).
    """
    assert tf in ('stft', 'melgram')
    assert normalize in (None, False, 'no', 0, 0.0, 'batch', 'data_sample',
                         'time', 'freq', 'channel')
    assert isinstance(decibel, bool)

    sr = SR if sr is None else sr  # project default assumes 12000 Hz

    # Unpack the relevant hyper-parameters from args.
    conv_until = args.conv_until
    trainable_kernel = args.trainable_kernel
    trainable_fb = args.trainable_fb
    fmin = args.fmin
    fmax = args.fmax if args.fmax != 0.0 else sr / 2
    n_mels = args.n_mels

    model = Sequential()
    model.add(
        Melspectrogram(n_dft=512, n_hop=256, power_melgram=2.0,
                       input_shape=(1, input_length),
                       trainable_kernel=trainable_kernel,
                       trainable_fb=trainable_fb,
                       return_decibel_melgram=decibel, sr=sr, n_mels=n_mels,
                       fmin=fmin, fmax=fmax, name='melgram'))

    if normalize in ('batch', 'data_sample', 'time', 'freq', 'channel'):
        model.add(Normalization2D(normalize))

    poolings = [(2, 4), (3, 4), (2, 5), (2, 4), (4, 4)]
    model.add(
        get_convBNeluMPdrop(5, [32, 32, 32, 32, 32],
                            [(3, 3)] * 5, poolings,
                            model.output_shape[1:], conv_until=conv_until))

    # A truncated stack (conv_until != 4) is globally pooled; the full stack
    # is flattened.
    if conv_until != 4:
        model.add(GlobalAveragePooling2D())
    else:
        model.add(Flatten())

    if last_layer:
        model.add(Dense(1, activation='linear'))
    return model
def Conv2D(N_CLASSES=10, SR=16000, DT=1.0):
    """Plain 2-D convolutional classifier on a mel spectrogram.

    N_CLASSES: number of output classes; SR: sample rate; DT: clip length (s).
    Returns a compiled Keras model.
    """
    i = layers.Input(shape=(1, int(SR * DT)), name='input')
    h = Melspectrogram(n_dft=512, n_hop=160, padding='same', sr=SR,
                       n_mels=128, fmin=0.0, fmax=SR / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False, name='melbands')(i)
    h = Normalization2D(str_axis='batch', name='batch_norm')(h)

    h = layers.Conv2D(8, kernel_size=(7, 7), activation='tanh',
                      padding='same', name='conv2d_tanh')(h)
    h = layers.MaxPooling2D(pool_size=(2, 2), padding='same',
                            name='max_pool_2d_1')(h)

    # Four relu conv blocks with progressively more filters; the last conv
    # has no pooling after it (named pools stop at _4).
    conv_specs = [(16, 'conv2d_relu_1', (5, 5), 'max_pool_2d_2'),
                  (16, 'conv2d_relu_2', (3, 3), 'max_pool_2d_3'),
                  (32, 'conv2d_relu_3', (3, 3), 'max_pool_2d_4')]
    for filters, conv_name, ksize, pool_name in conv_specs:
        h = layers.Conv2D(filters, kernel_size=ksize, activation='relu',
                          padding='same', name=conv_name)(h)
        h = layers.MaxPooling2D(pool_size=(2, 2), padding='same',
                                name=pool_name)(h)
    h = layers.Conv2D(32, kernel_size=(3, 3), activation='relu',
                      padding='same', name='conv2d_relu_4')(h)

    h = layers.Flatten(name='flatten')(h)
    h = layers.Dropout(rate=0.2, name='dropout')(h)
    h = layers.Dense(64, activation='relu',
                     activity_regularizer=l2(0.001), name='dense')(h)
    o = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(h)

    model = Model(inputs=i, outputs=o, name='2d_convolution')
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
def Att_RNN_Speech(x_train, y_train, classes, sampling_rate=16000,
                   input_length=16000, batch_size=32, epochs=3):
    """Build, train, and save an attention-RNN speech classifier.

    Parameters
    ----------
    x_train, y_train: training waveforms and integer labels.
    classes: int, number of output classes.
    sampling_rate / input_length: audio sample rate and samples per clip.
    batch_size / epochs: training hyper-parameters.

    Saves the fitted model to 'Att_RNN_Speech.model'.
    """
    inputs = Input((input_length,))
    x = Reshape((1, -1))(inputs)

    # Frozen mel-spectrogram front-end (decibel output).
    m = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, input_length),
                       padding='same', sr=sampling_rate, n_mels=80, fmin=40.0,
                       fmax=sampling_rate / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False)
    m.trainable = False
    x = m(x)
    x = Normalization2D(int_axis=0)(x)

    # (batch, mel, time, 1) -> (batch, time, mel, 1)
    x = Permute((2, 1, 3))(x)
    x = Conv2D(10, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Conv2D(1, (5, 1), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Lambda(lambda q: squeeze(q, -1))(x)

    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)

    # Dot-product attention, query taken from the last timestep.
    xFirst = Lambda(lambda q: q[:, -1])(x)
    query = Dense(128)(xFirst)
    attScores = Dot(axes=[1, 2])([query, x])
    attScores = Softmax()(attScores)
    attVector = Dot(axes=[1, 1])([attScores, x])

    x = Dense(64, activation='relu')(attVector)
    x = Dense(32)(x)
    output = Dense(classes, activation='softmax')(x)

    model = Model(inputs=[inputs], outputs=[output])
    model.compile(optimizer='adam',
                  loss=['sparse_categorical_crossentropy'],
                  metrics=['sparse_categorical_accuracy'])
    model.summary()

    # BUG FIX: the original called
    #   model.fit(x_train, validation_data=y_train, ...)
    # which passed the labels as validation data and supplied no training
    # targets at all. The labels belong in the second positional argument.
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
              use_multiprocessing=False, workers=4, verbose=2)
    model.save('Att_RNN_Speech.model')
def __init__(self, n_hop):
    """Build a residual CNN over a mel spectrogram and store it on self.model.

    n_hop: STFT hop length for the Melspectrogram front-end.
    """
    i = Input(shape=(1, L))
    # NOTE(review): sr is set to the clip length L here — looks odd for a
    # sample rate; confirm this is intentional.
    x = Melspectrogram(sr=L, n_mels=128, n_dft=2048, n_hop=n_hop,
                       power_melgram=2.0, return_decibel_melgram=True,
                       trainable_fb=False, trainable_kernel=False,
                       input_shape=(1, L))(i)
    x = BatchNormalization()(x)

    # Stage 1: 16 filters, one residual block.
    x = Conv2D(filters=16, kernel_size=(3, 3), padding='same')(x)
    x = BatchNormalization()(x)
    x = Activation(activation='relu')(x)
    x = self.resblock(z=x, n_in=16, n_out=16)
    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
    x = Dropout(rate=0.1)(x)

    # Stage 2: 32 filters, two residual blocks.
    x = Conv2D(filters=32, kernel_size=(3, 3), padding='same')(x)
    x = BatchNormalization()(x)
    x = Activation(activation='relu')(x)
    x = self.resblock(z=x, n_in=32, n_out=32)
    x = self.resblock(z=x, n_in=32, n_out=32)
    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
    x = Dropout(rate=0.2)(x)

    # Stage 3: 64 filters, two residual blocks.
    x = Conv2D(filters=64, kernel_size=(3, 3), padding='same')(x)
    x = BatchNormalization()(x)
    x = Activation(activation='relu')(x)
    x = self.resblock(z=x, n_in=64, n_out=64)
    x = self.resblock(z=x, n_in=64, n_out=64)
    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
    x = Dropout(rate=0.2)(x)

    # Stage 4: 128 filters, two residual blocks (no spatial pooling here).
    x = Conv2D(filters=128, kernel_size=(3, 3), padding='same')(x)
    x = BatchNormalization()(x)
    x = Activation(activation='relu')(x)
    x = self.resblock(z=x, n_in=128, n_out=128)
    x = self.resblock(z=x, n_in=128, n_out=128)
    x = Dropout(rate=0.2)(x)

    # Head: one last conv, global pooling, dense classifier.
    x = Conv2D(filters=256, kernel_size=(3, 3), padding='same')(x)
    x = BatchNormalization()(x)
    x = Activation(activation='relu')(x)
    x = GlobalAveragePooling2D()(x)
    x = Dense(units=256, activation=relu)(x)
    x = Dropout(rate=0.2)(x)
    out = Dense(units=N_CLASS, activation=softmax)(x)

    model = Model(inputs=[i], outputs=out)
    opt = optimizers.Adam()
    # NOTE(review): binary_crossentropy with a softmax output — confirm this
    # is intended (multi-label vs. single-label).
    model.compile(optimizer=opt, loss=losses.binary_crossentropy,
                  metrics=[categorical_accuracy])
    self.model = model
def model_mfcc_layer(x_train, num_labels):
    """Fully-convolutional classifier with an in-graph mel-spectrogram layer.

    x_train: training array; only x_train[0].shape is used for the input shape.
    num_labels: number of output classes (softmax if > 1, relu otherwise).
    """
    sr = 22050
    model_input = h = Input(shape=x_train[0].shape)
    h = Melspectrogram(n_dft=512, n_hop=sr // 128 + 1, padding='same', sr=sr,
                       n_mels=128, fmin=0.0, fmax=sr / 2, power_melgram=2.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False, name='trainable_stft')(h)

    # Four conv/BN/activation/pool blocks with doubling filter counts.
    for n_filters in (16, 32, 64, 128):
        h = Conv2D(filters=n_filters, kernel_size=filter_size,
                   padding='same')(h)
        h = BatchNormalization()(h)
        h = activation()(h)
        h = MaxPooling2D(pool_size=2)(h)

    # Final conv block, then pool away the full remaining spatial extent.
    h = Conv2D(filters=256, kernel_size=filter_size, padding='same')(h)
    h = BatchNormalization()(h)
    h = activation()(h)
    h = AveragePooling2D(pool_size=(int(h.get_shape()[1]),
                                    int(h.get_shape()[2])))(h)

    # 1x1 conv acts as the classification head.
    h = Conv2D(filters=num_labels, kernel_size=1, padding='valid',
               activation='softmax' if num_labels > 1 else 'relu')(h)

    model = Model(inputs=[model_input], outputs=[h])
    model.summary()
    return model
def Build_MelSpectrogram(Parametres_layer, input_length):
    """Create a frozen (non-trainable) kapre Melspectrogram layer.

    Parametres_layer: dict with keys "n_dft", "n_hop", "padding", "sr",
                      "n_mels".
    input_length: number of waveform samples per example.
    """
    params = Parametres_layer
    layer = Melspectrogram(n_dft=params["n_dft"],
                           n_hop=params["n_hop"],
                           input_shape=(1, input_length),
                           padding=params["padding"],
                           sr=params["sr"],
                           n_mels=params["n_mels"],
                           fmin=40.0,
                           fmax=params["sr"] / 2,
                           power_melgram=1.0,
                           return_decibel_melgram=True,
                           trainable_fb=False,
                           trainable_kernel=False,
                           name='mel_stft')
    # Freeze the layer so the filterbank is never updated during training.
    layer.trainable = False
    return layer
def model_conv3x3_ismir2016_choi(n_out, input_shape=INPUT_SHAPE,
                                 out_activation='softmax'):
    """
    A simplified model of
    Automatic Tagging Using Deep Convolutional Neural Networks,
    K Choi, G Fazekas, M Sandler, ISMIR, 2016, New York, USA

    Symbolic summary:
    > c2 - p2 - c2 - p2 - c2 - p2 - c2 - p2 - c2 - p3 - d1

    Modifications:
        * n_mels (96 -> 32)
        * n_channels (many -> [16, 24, 32, 40, 48])
        * remove dropout
        * maxpooling (irregular to fit the size -> all (2, 2))
        * add GlobalAveragePooling2D
    """
    model = Sequential()
    model.add(
        Melspectrogram(sr=SR, n_mels=64, power_melgram=2.0,
                       return_decibel_melgram=True, input_shape=input_shape))
    model.add(BatchNormalization(axis=channel_axis))

    # Five identical conv/BN/relu/pool blocks, differing only in filters.
    for n_filters in (10, 15, 15, 20, 20):
        model.add(Conv2D(n_filters, (3, 3), padding='same'))
        model.add(BatchNormalization(axis=channel_axis))
        model.add(Activation('relu'))
        model.add(MaxPooling2D((2, 2), padding='same'))

    model.add(GlobalAveragePooling2D())
    model.add(Dense(n_out, activation=out_activation))
    return model
def model_convrnn(n_out, input_shape=(1, None), out_activation='softmax'):
    """No reference, just ConvRNN.

    Symbolic summary:
    > c2 - c2 - c2 - c2 - r2 - r2 - d1

    Parameters
    ----------
    n_out: integer, number of output nodes
    input_shape: tuple, an input shape, which doesn't include batch-axis.
        (1, None) means (mono channel, variable length).
    out_activation: activation function on the output
    """
    assert input_shape[0] == 1, 'Mono input please!'
    model = Sequential()
    n_mels = 64
    model.add(
        Melspectrogram(sr=SR, n_mels=n_mels, power_melgram=2.0,
                       return_decibel_melgram=True, input_shape=input_shape))

    model.add(Conv2D(32, (3, 3), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))
    model.add(Conv2D(32, (3, 3), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))
    model.add(Conv2D(16, (3, 3), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))
    model.add(Conv2D(1, (1, 1), padding='same'))
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))

    # BUG FIX: this previously called K.image_dim_ordering(), which returns
    # 'th'/'tf' and therefore could never equal 'channels_first' — the
    # channels-first branch was unreachable. Use K.image_data_format(), as
    # model_lstm_leglaive_icassp2014 in this file already does.
    if K.image_data_format() == 'channels_first':  # (ch, freq, time)
        model.add(Permute((3, 2, 1)))  # -> (time, freq, ch)
    else:  # (freq, time, ch)
        model.add(Permute((2, 1, 3)))  # -> (time, ch, freq)

    # Squeeze the singleton channel axis so the LSTMs see (time, features).
    model.add(
        Lambda(lambda x: K.squeeze(x, axis=3),
               output_shape=squeeze_output_shape))
    model.add(LSTM(25, return_sequences=True))
    model.add(LSTM(25, return_sequences=True))
    model.add(TimeDistributed(Dense(n_out, activation=out_activation)))
    return model
def model_multi_kernel_shape(n_out, input_shape=INPUT_SHAPE,
                             out_activation='softmax'):
    """
    Symbolic summary:
    > c2' - p2 - c2 - p2 - c2 - p2 - c2 - p3 - d1
    where c2' -> multiple kernel shapes

    Parameters
    ----------
    n_out: integer, number of output nodes
    input_shape: tuple, an input shape, which doesn't include batch-axis.
    out_activation: activation function on the output
    """
    audio_input = Input(shape=input_shape)

    h = Melspectrogram(sr=SR, n_mels=64, power_melgram=2.0,
                       return_decibel_melgram=True)(audio_input)
    h = BatchNormalization(axis=channel_axis)(h)

    # First stage: three parallel convs with tall, square, and wide kernels,
    # concatenated along the channel axis.
    branch_tall = Conv2D(7, (20, 3), padding='same')(h)
    branch_square = Conv2D(7, (3, 3), padding='same')(h)
    branch_wide = Conv2D(7, (3, 20), padding='same')(h)
    h = Concatenate(axis=channel_axis)([branch_tall, branch_square,
                                        branch_wide])
    h = BatchNormalization(axis=channel_axis)(h)
    h = Activation('relu')(h)
    h = MaxPooling2D((2, 2), padding='same')(h)

    # Three identical conv blocks; the last pools more aggressively.
    for pool in ((2, 2), (2, 2), (4, 4)):
        h = Conv2D(21, (3, 3), padding='same')(h)
        h = BatchNormalization(axis=channel_axis)(h)
        h = Activation('relu')(h)
        h = MaxPooling2D(pool, padding='same')(h)

    h = GlobalAveragePooling2D()(h)
    out = Dense(n_out, activation=out_activation)(h)

    return Model(audio_input, out)
def ConvSpeechModel(nCategories, samplingrate=16000, inputLength=16000):
    """
    Base fully convolutional model for speech recognition
    """
    inputs = Input((inputLength, ))
    h = Reshape((1, -1))(inputs)

    h = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, inputLength),
                       padding='same', sr=samplingrate, n_mels=80,
                       fmin=40.0, fmax=samplingrate / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False, name='mel_stft')(h)
    h = Normalization2D(int_axis=0)(h)

    # Melspectrogram emits (batch, melDim, timeSteps, 1); flip to time-major
    # so convs slide along time first.
    h = Permute((2, 1, 3))(h)

    h = Conv2D(20, (5, 1), activation='relu', padding='same')(h)
    h = BatchNormalization()(h)
    h = MaxPooling2D((2, 1))(h)
    h = Dropout(0.03)(h)

    h = Conv2D(40, (3, 3), activation='relu', padding='same')(h)
    h = BatchNormalization()(h)
    h = MaxPooling2D((2, 2))(h)
    h = Dropout(0.01)(h)

    h = Conv2D(80, (3, 3), activation='relu', padding='same')(h)
    h = BatchNormalization()(h)
    h = MaxPooling2D((2, 2))(h)

    h = Flatten()(h)
    h = Dense(64, activation='relu')(h)
    h = Dense(32, activation='relu')(h)
    output = Dense(nCategories, activation='softmax')(h)

    return Model(inputs=[inputs], outputs=[output], name='ConvSpeechModel')
def _test_correctness():
    """
    Tests correctness
    """
    audio_data = np.load('tests/speech_test_file.npz')['audio_data']
    sr = 44100
    hop_length = 128
    n_fft = 1024
    n_mels = 80

    # Reference computation with librosa.
    mel_librosa = librosa.feature.melspectrogram(
        audio_data, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    db_librosa = librosa.power_to_db(mel_librosa, ref=np.max)

    # Precomputed expected float32 values.
    S_expected = np.load('tests/test_audio_mel_g0.npy')

    # Same computation with kapre; input layout depends on the backend's
    # image data format.
    channels_last = image_data_format() == 'channels_last'
    model = tensorflow.keras.models.Sequential()
    model.add(
        Melspectrogram(
            sr=sr,
            n_mels=n_mels,
            n_dft=n_fft,
            n_hop=hop_length,
            input_shape=(len(audio_data), 1) if channels_last
            else (1, len(audio_data)),
            power_melgram=2,
            return_decibel_melgram=False,
            trainable_kernel=False,
            name='melgram',
        ))
    batch = (audio_data.reshape(1, -1, 1) if channels_last
             else audio_data.reshape(1, 1, -1))
    mel_kapre = model.predict(batch)
    # Strip batch/channel axes to get a plain (n_mels, time) matrix.
    mel_kapre = mel_kapre[0, :, :, 0] if channels_last else mel_kapre[0, 0]
    db_kapre = librosa.power_to_db(mel_kapre, ref=np.max)

    # Relative dB difference between the two implementations.
    db_range = np.max(db_librosa) - np.min(db_librosa)
    db_diff = np.abs(db_kapre - db_librosa) / db_range

    # compare expected float32 values with computed ones
    assert np.allclose(S_expected, mel_kapre, rtol=1e-2, atol=1e-8)
    assert np.mean(db_diff) < 0.01
def model_lstm_leglaive_icassp2014(n_out, input_shape=(1, None),
                                   out_activation='softmax',
                                   bidirectional=True):
    """Singing voice detection with deep recurrent neural networks
    Simon Leglaive, Romain Hennequin, Roland Badeau, ICASSP 2015

    Symbolic summary:
    > bi_r1 - bi_r1 - bi_r1 -
    > r1 - r1 - r1 - d1

    Parameters
    ----------
    n_out: integer, number of output nodes
    input_shape: tuple, an input shape, which doesn't include batch-axis.
    out_activation: activation function on the output
    bidirectional: boolean, to specify whether rnn is bidirectional or not.
    """
    assert input_shape[0] == 1, 'Mono input please!'
    model = Sequential()
    model.add(
        Melspectrogram(sr=SR, n_mels=40, power_melgram=2.0,
                       return_decibel_melgram=True, input_shape=input_shape))

    # Bring the layout to (time, freq, ch) regardless of the backend format.
    if K.image_data_format() == 'channels_first':
        model.add(Permute((3, 2, 1)))  # ch, freq, time -> time, freq, ch
    else:
        model.add(Permute((2, 1, 3)))  # freq, time, ch -> time, freq, ch
    model.add(BatchNormalization(axis=channel_axis))

    # Squeeze the channel axis so the LSTMs see (time, features).
    model.add(
        Lambda(lambda x: K.squeeze(x, axis=3),
               output_shape=squeeze_output_shape))

    # Three stacked recurrent layers; the unidirectional variant doubles the
    # unit count to keep capacity comparable.
    for units in (30, 20, 40):
        if bidirectional:
            model.add(Bidirectional(LSTM(units, return_sequences=True)))
        else:
            model.add(LSTM(units * 2, return_sequences=True))

    model.add(TimeDistributed(Dense(n_out, activation=out_activation)))
    return model
def __mel_spec_model(self, input_shape, n_mels, power_melgram, decibel_gram):
    """Return a small Sequential pipeline: Melspectrogram (using self._sr as
    sample rate) followed by per-frequency normalization."""
    pipeline = Sequential()
    pipeline.add(Melspectrogram(
        sr=self._sr,
        n_mels=n_mels,
        power_melgram=power_melgram,
        return_decibel_melgram=decibel_gram,
        input_shape=input_shape,
        trainable_fb=False,
    ))
    pipeline.add(Normalization2D(str_axis='freq'))
    return pipeline
def RNNSpeechModel(nCategories, samplingrate=16000, inputLength=16000):
    """Simple LSTM speech classifier: mel spectrogram -> conv front-end ->
    two bidirectional LSTMs -> dense head."""
    sr = samplingrate
    iLen = inputLength

    inputs = Input((iLen, ))
    net = Reshape((1, -1))(inputs)

    net = Melspectrogram(n_dft=1024, n_hop=128, input_shape=(1, iLen),
                         padding='same', sr=sr, n_mels=80, fmin=40.0,
                         fmax=sr / 2, power_melgram=1.0,
                         return_decibel_melgram=True, trainable_fb=False,
                         trainable_kernel=False, name='mel_stft')(net)
    net = Normalization2D(int_axis=0)(net)

    # Melspectrogram emits (batch, melDim, timeSteps, 1); LSTMs want the
    # time axis first.
    net = Permute((2, 1, 3))(net)

    net = Conv2D(10, (5, 1), activation='relu', padding='same')(net)
    net = BatchNormalization()(net)
    net = Conv2D(1, (5, 1), activation='relu', padding='same')(net)
    net = BatchNormalization()(net)

    # Drop the trailing channel axis -> (batch, seq_len, vec_dim).
    net = Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(net)

    net = Bidirectional(LSTM(64, return_sequences=True))(net)
    net = Bidirectional(LSTM(64))(net)

    net = Dense(64, activation='relu')(net)
    net = Dense(32, activation='relu')(net)
    output = Dense(nCategories, activation='softmax')(net)

    return Model(inputs=[inputs], outputs=[output])
def Conv1D(input_length, num_classes):
    """Time-distributed 1-D convolution classifier over a 16 kHz mel
    spectrogram. Returns an uncompiled Keras model."""
    i = layers.Input(shape=(1, input_length), name='input')
    h = Melspectrogram(n_dft=512, n_hop=160, padding='same', sr=16000,
                       n_mels=128, fmin=0.0, fmax=16000 / 2,
                       power_melgram=1.0, return_decibel_melgram=True,
                       trainable_fb=False, trainable_kernel=False,
                       name='melbands')(i)
    h = Normalization2D(str_axis='batch', name='batch_norm')(h)
    # (batch, mel, time, 1) -> (batch, time, mel, 1) so each frame gets its
    # own Conv1D via TimeDistributed.
    h = layers.Permute((2, 1, 3), name='permute')(h)

    conv_specs = [(8, 'tanh', 'td_conv_1d_tanh'),
                  (16, 'relu', 'td_conv_1d_relu_1'),
                  (32, 'relu', 'td_conv_1d_relu_2'),
                  (64, 'relu', 'td_conv_1d_relu_3')]
    for idx, (filters, act, conv_name) in enumerate(conv_specs, start=1):
        h = layers.TimeDistributed(
            layers.Conv1D(filters, kernel_size=(4), activation=act),
            name=conv_name)(h)
        h = layers.MaxPooling2D(pool_size=(2, 2),
                                name='max_pool_2d_%d' % idx)(h)
    h = layers.TimeDistributed(
        layers.Conv1D(128, kernel_size=(4), activation='relu'),
        name='td_conv_1d_relu_4')(h)

    h = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(h)
    h = layers.Dropout(rate=0.1, name='dropout')(h)
    h = layers.Dense(64, activation='relu',
                     activity_regularizer=l2(0.001), name='dense')(h)
    o = layers.Dense(num_classes, activation='softmax', name='softmax')(h)

    return Model(inputs=i, outputs=o, name='1d_convolution')
def model_conv1d_icassp2014_sander(n_out, input_shape=INPUT_SHAPE,
                                   out_activation='softmax'):
    """A simplified model of
    End-to-end learning for music audio,
    Sander Dieleman and Benjamin Schrauwen, ICASSP, 2014

    Symbolic summary:
    > c1 - p1 - c1 - p1 - c1 - p1 - p3 - d1

    Modifications:
        * Add BatchNormalization
        * n_mels (128 -> 32)
        * n_layers (2 -> 3)
        * add GlobalAveragePooling2D

    Parameters
    ----------
    n_out: integer, number of output nodes
    input_shape: tuple, an input shape, which doesn't include batch-axis.
    out_activation: activation function on the output
    """
    model = Sequential()
    model.add(
        Melspectrogram(sr=SR, n_mels=64, power_melgram=2.0,
                       return_decibel_melgram=True, input_shape=input_shape))

    # First conv collapses the frequency axis with a tall valid kernel.
    model.add(Conv2D(30, (32, 4), padding='valid'))  # (None, 16, 1, N)
    model.add(BatchNormalization(axis=channel_axis))
    model.add(Activation('relu'))
    model.add(MaxPooling2D((1, 4), padding='same'))

    # Two identical time-direction conv blocks.
    for _ in range(2):
        model.add(Conv2D(30, (1, 4), padding='same'))
        model.add(BatchNormalization(axis=channel_axis))
        model.add(Activation('relu'))
        model.add(MaxPooling2D((1, 4), padding='same'))

    model.add(GlobalAveragePooling2D())
    model.add(Dense(n_out, activation=out_activation))
    return model
def depth_separable_cnn(input_shape=(1, 16000), sr=16000,
                        loss=keras.losses.categorical_crossentropy,
                        optimizer=None):
    """Depthwise-separable CNN over a mel spectrogram, compiled for training.

    Parameters
    ----------
    input_shape: (channels, samples) waveform shape (mono, 1 s at 16 kHz).
    sr: sample rate in Hz.
    loss: Keras loss function.
    optimizer: Keras optimizer; defaults to a fresh Adam instance per call.

    BUG FIX: the default was previously `optimizer=keras.optimizers.adam()`,
    which instantiated ONE stateful optimizer at import time and shared it
    across every call (the mutable-default-argument pitfall). A fresh
    optimizer is now created inside the function when none is given.
    """
    if optimizer is None:
        optimizer = keras.optimizers.adam()

    model = Sequential()
    # In-graph mel-spectrogram front-end (decibel output).
    model.add(
        Melspectrogram(n_dft=512, n_hop=512, input_shape=input_shape,
                       padding='same', sr=sr, n_mels=128, fmin=0.0,
                       fmax=sr / 2, power_melgram=1.0,
                       return_decibel_melgram=True, trainable_fb=False,
                       trainable_kernel=False, name='trainable_stft'))
    # Additive white noise as train-time augmentation.
    model.add(AdditiveNoise(power=0.1))
    # Per-frequency normalization.
    model.add(Normalization2D(str_axis='freq'))

    model.add(Conv2D(64, kernel_size=(20, 8), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), dim_ordering="th"))
    model.add(Dropout(0.25))

    # Depthwise-separable block: separable conv + 1x1 pointwise conv, each
    # followed by batch norm, repeated twice.
    model.add(
        SeparableConv2D(64, kernel_size=(5, 5), activation='relu',
                        dim_ordering="th"))
    model.add(BatchNormalization())
    model.add(
        Conv2D(64, kernel_size=(1, 1), activation='relu',
               dim_ordering="th"))
    model.add(BatchNormalization())
    model.add(SeparableConv2D(64, kernel_size=(5, 5), activation='relu'))
    model.add(BatchNormalization())
    model.add(Conv2D(64, kernel_size=(1, 1), activation='relu'))
    model.add(BatchNormalization())

    model.add(AveragePooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(12, activation='softmax'))

    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model
def train(): pool_size = (2, 2) # 350 samples input_shape = (channelCount, sampleCount) sr = 44100 model = Sequential() model.add(Melspectrogram(n_dft=512, n_hop=256, input_shape=input_shape, padding='same', sr=sr, n_mels=128, fmin=0.0, fmax=sr/2, power_melgram=1.0, return_decibel_melgram=False, trainable_fb=False, trainable_kernel=False, name='trainable_stft')) model.add(AdditiveNoise(power=0.2)) model.add(Normalization2D(str_axis='freq')) # or 'channel', 'time', 'batch', 'data_sample' model.add(Convolution2D(32, 3, 3)) model.add(BatchNormalization(axis=1 )) model.add(ELU(alpha=1.0)) model.add(MaxPooling2D(pool_size=pool_size)) model.add(Dropout(0.25)) model.add(Flatten()) model.add(Dense(128)) model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(Dense(len(get_class_names()))) model.add(Activation("softmax")) model.compile('adam', 'categorical_crossentropy') x,y=loadData(trainDataPath) checkpoint_filepath = 'weights.hdf5' print("Looking for previous weights...") if ( os.path.isfile(checkpoint_filepath) ): print ('Checkpoint file detected. Loading weights.') model.load_weights(checkpoint_filepath) else: print ('No checkpoint file detected. Starting from scratch.') checkpointer = ModelCheckpoint(filepath=checkpoint_filepath, verbose=1, save_best_only=True) test_x,test_y=loadData(testDataPath) model.fit(x, y,batch_size=128, nb_epoch=100,verbose=1,validation_data=(test_x, test_y), callbacks=[checkpointer]) model.save(modelName)
def conv1d(input_shape, sr):
    """Build and compile a 1-D-conv-over-time classifier on a mel front end.

    The mel-spectrogram is permuted to (time, mel, channel) and each time
    step is convolved independently via TimeDistributed Conv1D layers,
    interleaved with 2-D max pooling, then globally max-pooled.
    """
    wav_in = layers.Input(shape=input_shape, name='input')
    feat = Melspectrogram(n_dft=N_DFT, n_hop=HOP_LENGTH, padding='same',
                          sr=sr, n_mels=N_MELS, fmin=0.0, fmax=sr / 2,
                          power_melgram=1.0, return_decibel_melgram=True,
                          trainable_fb=False, trainable_kernel=False,
                          name='melbands')(wav_in)
    feat = Normalization2D(str_axis='batch', name='batch_norm')(feat)
    # (mel, time, ch) -> (time, mel, ch) so Conv1D runs along the mel axis.
    feat = layers.Permute((2, 1, 3), name='permute')(feat)
    feat = TimeDistributed(layers.Conv1D(8, kernel_size=4, activation='tanh'),
                           name='td_conv_1d_tanh')(feat)
    feat = layers.MaxPooling2D(pool_size=(2, 2), name='max_pool_2d_1')(feat)
    # Four relu stages with doubling filter counts, each followed by pooling.
    for stage, n_filters in enumerate((16, 32, 64, 128), start=1):
        feat = TimeDistributed(
            layers.Conv1D(n_filters, kernel_size=4, activation='relu'),
            name='td_conv_1d_relu_%d' % stage)(feat)
        feat = layers.MaxPooling2D(pool_size=(2, 2),
                                   name='max_pool_2d_%d' % (stage + 1))(feat)
    # Two final conv stages without pooling.
    for stage, n_filters in ((5, 256), (6, 512)):
        feat = TimeDistributed(
            layers.Conv1D(n_filters, kernel_size=4, activation='relu'),
            name='td_conv_1d_relu_%d' % stage)(feat)
    feat = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(feat)
    feat = layers.Dropout(rate=0.2, name='dropout')(feat)
    feat = layers.Dense(64, activation='relu',
                        activity_regularizer=l2(0.001), name='dense')(feat)
    probs = layers.Dense(NUM_CLASSES, activation='softmax',
                         name='softmax')(feat)
    model = Model(inputs=wav_in, outputs=probs, name='1d_convolution')
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
def convolution_speech_model(num_category, sampling_rate=16000, input_length=16000):
    """Build a 3-stage Conv2D classifier over an on-the-fly mel-spectrogram.

    Input is a raw waveform of `input_length` samples; output is a
    `num_category`-way softmax. Returns an uncompiled Model.
    """
    wav_input = layers.Input((input_length, ))
    net = layers.Reshape((1, -1))(wav_input)
    net = Melspectrogram(n_dft=1024, n_hop=128,
                         input_shape=(1, input_length), padding='same',
                         sr=sampling_rate, n_mels=80, fmin=40.0,
                         fmax=sampling_rate / 2, power_melgram=1.0,
                         return_decibel_melgram=True, trainable_fb=False,
                         trainable_kernel=False, name='mel_stft')(net)
    net = Normalization2D(int_axis=0)(net)
    # (mel, time, ch) -> (time, mel, ch)
    net = layers.Permute((2, 1, 3))(net)
    # Three conv blocks: (filters, kernel, pool, dropout-or-None).
    blocks = [(20, (5, 1), (2, 1), 0.03),
              (40, (3, 3), (2, 2), 0.01),
              (80, (3, 3), (2, 2), None)]
    for n_filters, kernel, pool, drop in blocks:
        net = layers.Conv2D(n_filters, kernel, activation='relu',
                            padding='same')(net)
        net = layers.BatchNormalization()(net)
        net = layers.MaxPooling2D(pool)(net)
        if drop is not None:
            net = layers.Dropout(drop)(net)
    net = layers.Flatten()(net)
    net = layers.Dense(64, activation='relu')(net)
    net = layers.Dense(32, activation='relu')(net)
    probs = layers.Dense(num_category, activation='softmax')(net)
    return Model(inputs=[wav_input], outputs=[probs],
                 name='ConvSpeechModel')
def _test_stereo_same():
    """Tests for - stereo input - same padding - shapes of output
    channel, n_freq, n_frame - save and load a model with it
    """
    n_ch, sr, n_mels = 2, 8000, 64
    fmin, fmax = 200, sr // 2
    n_dft, len_hop, nsp_src = 512, 256, 8000
    channels_last = image_data_format() == 'channels_last'
    # Random stereo source, channel axis placed per backend convention.
    src_shape = (nsp_src, n_ch) if channels_last else (n_ch, nsp_src)
    src = np.random.uniform(-1.0, 1.0, src_shape)
    model = tensorflow.keras.models.Sequential()
    model.add(
        Melspectrogram(
            sr=sr,
            n_mels=n_mels,
            fmin=fmin,
            fmax=fmax,
            n_dft=n_dft,
            n_hop=len_hop,
            padding='same',
            power_melgram=1.0,
            return_decibel_melgram=False,
            image_data_format='default',
            input_shape=src_shape,
        ))
    batch_melgram_kapre = model.predict(src[np.newaxis, ...])
    expected_frames = _num_frame_same(nsp_src, len_hop)
    if channels_last:
        # (batch, mel, frame, channel)
        assert batch_melgram_kapre.shape[3] == n_ch
        assert batch_melgram_kapre.shape[1] == n_mels
        assert batch_melgram_kapre.shape[2] == expected_frames
    else:
        # (batch, channel, mel, frame)
        assert batch_melgram_kapre.shape[1] == n_ch
        assert batch_melgram_kapre.shape[2] == n_mels
        assert batch_melgram_kapre.shape[3] == expected_frames
def build_kapre_model(input_shape, sr, lr, summary=False):
    """Build and compile a separable-conv regression model (3 linear outputs)
    on a kapre mel-spectrogram front end, trained with MSE + Adam(lr).
    """
    model = Sequential()
    model.add(
        Melspectrogram(n_dft=256, n_hop=256, input_shape=input_shape,
                       padding='same', sr=sr, n_mels=96, fmin=0.0,
                       fmax=sr / 2, power_melgram=1.0,
                       return_decibel_melgram=False, trainable_fb=False,
                       trainable_kernel=False, name='mel'))
    # Four separable-conv stages: (filters, pool_size). The first keeps the
    # original (redundant) input_shape kwarg for byte-identical config.
    stages = [(128, (2, 4)), (384, (4, 5)), (768, (3, 8)), (2048, (4, 8))]
    for idx, (n_filters, pool) in enumerate(stages):
        if idx == 0:
            model.add(SeparableConv2D(n_filters, (3, 3), padding='same',
                                      input_shape=input_shape))
        else:
            model.add(SeparableConv2D(n_filters, (3, 3), padding='same'))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=pool))
        model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(3, activation='linear'))
    model.compile(loss=losses.mean_squared_error,
                  optimizer=optimizers.Adam(lr=lr))
    if summary:
        model.summary()
    return model
def test_plot():
    """Visual smoke test: plot a kapre Melspectrogram and a kapre Spectrogram,
    comparing the latter against librosa's STFT rendering side by side.
    """
    SR = 16000
    # Random 3-second mono signal, shape (1 channel, samples).
    src = np.random.random((1, SR * 3))
    # NOTE(review): hard-coded absolute path to one developer's machine —
    # this only runs where that wav file exists.
    src_cute, _ = librosa.load(
        '/Users/admin/Dropbox/workspace/unet/data/audio/abjones_1_01.wav',
        sr=SR, mono=True)
    model = Sequential()
    # Trainable-kernel mel-spectrogram in decibels.
    model.add(
        Melspectrogram(sr=SR, n_mels=128, n_dft=512, n_hop=256,
                       input_shape=src.shape, return_decibel_melgram=True,
                       trainable_kernel=True, name='melgram'))
    check_model(model)
    visualise_model(model)
    SR = 16000
    src = np.random.random((1, SR * 3))
    model = Sequential()
    # Plain power spectrogram (linear, power 2.0), fixed kernels.
    model.add(
        Spectrogram(n_dft=512, n_hop=256, input_shape=src.shape,
                    return_decibel_spectrogram=False, power_spectrogram=2.0,
                    trainable_kernel=False, name='static_stft'))
    check_model(model)
    plt.figure(figsize=(14, 4))
    plt.subplot(1, 2, 1)
    plt.title('log-Spectrogram by Kapre')
    visualise_model(model, logam=True)
    plt.subplot(1, 2, 2)
    # NOTE(review): positional args to librosa.stft (n_fft=512, hop=256) are
    # keyword-only in newer librosa releases — confirm installed version.
    display.specshow(librosa.amplitude_to_db(np.abs(
        librosa.stft(src_cute[:SR * 3], 512, 256))**2, ref=1.0),
                     y_axis='linear', sr=SR)
    plt.title('log-Spectrogram by Librosa')
    plt.show()