def raw_vgg(args, input_length=12000 * 29, tf='melgram', normalize=None,
            decibel=False, last_layer=True, sr=None):
    """Assemble a Sequential net: mel-spectrogram front end plus a conv stack.

    When length = 12000*29 and 512/256 dft/hop, melgram size: (n_mels, 1360).

    Args:
        args: object exposing ``conv_until``, ``trainable_kernel``, ``fmin``,
            ``fmax``, ``n_mels`` and ``trainable_fb``.
        input_length: sample count; the input shape is ``(1, input_length)``.
        tf: must be ``'stft'`` or ``'melgram'``; it is only validated here.
        normalize: one of the accepted flags; a ``Normalization2D`` layer is
            inserted only for 'batch', 'data_sample', 'time', 'freq', 'channel'.
        decibel: bool forwarded as ``return_decibel_melgram``.
        last_layer: when truthy, a ``Dense(1, activation='linear')`` is appended.
        sr: sampling rate; the module-level ``SR`` is used when this is None.

    Returns:
        The Sequential model (not compiled here).
    """
    active_norms = ('batch', 'data_sample', 'time', 'freq', 'channel')
    assert tf in ('stft', 'melgram')
    assert normalize in (None, False, 'no', 0, 0.0) + active_norms
    assert isinstance(decibel, bool)

    sr = SR if sr is None else sr  # SR is assumed to be 12000

    conv_until = args.conv_until
    kernel_is_trainable = args.trainable_kernel
    model = Sequential()

    # Decode the remaining settings carried by args.
    low_freq = args.fmin
    high_freq = args.fmax
    if high_freq == 0.0:
        # An fmax of 0.0 is replaced by half the sampling rate.
        high_freq = sr / 2
    mel_bins = args.n_mels
    fb_is_trainable = args.trainable_fb

    front_end = Melspectrogram(
        n_dft=512,
        n_hop=256,
        power_melgram=2.0,
        input_shape=(1, input_length),
        trainable_kernel=kernel_is_trainable,
        trainable_fb=fb_is_trainable,
        return_decibel_melgram=decibel,
        sr=sr,
        n_mels=mel_bins,
        fmin=low_freq,
        fmax=high_freq,
        name='melgram',
    )
    model.add(front_end)

    poolings = [(2, 4), (3, 4), (2, 5), (2, 4), (4, 4)]
    if normalize in active_norms:
        model.add(Normalization2D(normalize))

    conv_stack = get_convBNeluMPdrop(
        5,
        [32] * 5,
        [(3, 3)] * 5,
        poolings,
        model.output_shape[1:],
        conv_until=conv_until,
    )
    model.add(conv_stack)

    # Flatten only when conv_until is 4; otherwise pool globally.
    model.add(GlobalAveragePooling2D() if conv_until != 4 else Flatten())

    if last_layer:
        model.add(Dense(1, activation='linear'))
    return model
def create_model(self, input_shape, nb_classes, n_filters, dropout, **kwargs):
    """Build a four-convolution classifier with two 1000-unit dense layers.

    Args:
        input_shape: passed to the leading ``Normalization2D`` layer.
        nb_classes: width of the final sigmoid layer.
        n_filters: filter count shared by every convolution.
        dropout: comma-separated string of one or four float rates; a single
            rate is repeated four times.
        **kwargs: handed to ``add_regularization`` together with the layers.

    Returns:
        The Sequential model (not compiled here).

    Raises:
        Exception: if ``dropout`` does not yield one or four values.
    """
    model = Sequential()

    rates = [float(piece) for piece in dropout.split(",")]
    if len(rates) == 1:
        rates = rates * 4
    if len(rates) != 4:
        raise Exception("Unexpected length of dropouts:{0}".format(
            len(rates)))

    # NOTE(review): rates[1] is reused after three convolutions — confirm
    # this is intended.
    feature_layers = [
        Normalization2D(str_axis='batch', input_shape=input_shape),
        Conv2D(filters=n_filters, kernel_size=(9, 9), activation='relu'),
        MaxPool2D((2, 2), strides=(2, 2)),
        Dropout(rates[0]),
        Conv2D(filters=n_filters, kernel_size=(6, 6), activation='relu'),
        Dropout(rates[1]),
        Conv2D(filters=n_filters, kernel_size=(3, 3), activation='relu'),
        MaxPool2D((1, 2), strides=(1, 2)),
        Dropout(rates[1]),
        Conv2D(filters=n_filters, kernel_size=(3, 3), activation='relu'),
        Dropout(rates[1]),
    ]
    classifier_layers = [
        Flatten(),
        Dense(1000, activation='relu'),
        Dropout(rates[2]),
        Dense(1000, activation='relu'),
        Dropout(rates[3]),
        Dense(nb_classes, activation='sigmoid'),
    ]
    stack = feature_layers + classifier_layers

    add_regularization(stack, kwargs)
    for layer in stack:
        model.add(layer)
    return model
def Conv2D(N_CLASSES=10, SR=16000, DT=1.0):
    """Build and compile a 2-D convolutional classifier on a mel-spectrogram.

    Args:
        N_CLASSES: width of the softmax output.
        SR: sampling rate; also sets ``fmax = SR/2``.
        DT: multiplied by ``SR`` to get the input length ``int(SR*DT)``.

    Returns:
        The compiled model (adam, categorical cross-entropy, accuracy).
    """
    audio_in = layers.Input(shape=(1, int(SR*DT)), name='input')
    net = Melspectrogram(
        n_dft=512,
        n_hop=160,
        padding='same',
        sr=SR,
        n_mels=128,
        fmin=0.0,
        fmax=SR/2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='melbands',
    )(audio_in)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)
    net = layers.Conv2D(8, kernel_size=(7, 7), activation='tanh',
                        padding='same', name='conv2d_tanh')(net)

    # Each later stage is a 2x2 max-pool followed by a relu convolution.
    relu_stages = ((16, (5, 5)), (16, (3, 3)), (32, (3, 3)), (32, (3, 3)))
    for stage, (n_filters, kernel) in enumerate(relu_stages, start=1):
        net = layers.MaxPooling2D(
            pool_size=(2, 2), padding='same',
            name='max_pool_2d_{}'.format(stage))(net)
        net = layers.Conv2D(
            n_filters, kernel_size=kernel, activation='relu', padding='same',
            name='conv2d_relu_{}'.format(stage))(net)

    net = layers.Flatten(name='flatten')(net)
    net = layers.Dropout(rate=0.2, name='dropout')(net)
    net = layers.Dense(64, activation='relu',
                       activity_regularizer=l2(0.001), name='dense')(net)
    probs = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(net)

    model = Model(inputs=audio_in, outputs=probs, name='2d_convolution')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
def LSTM(N_CLASSES=10, SR=16000, DT=1.0):
    """Build and compile a bidirectional-LSTM classifier with a skip connection.

    Args:
        N_CLASSES: width of the softmax output.
        SR: sampling rate; also sets ``fmax = SR/2``.
        DT: multiplied by ``SR`` to get the input length ``int(SR*DT)``.

    Returns:
        The compiled model (adam, categorical cross-entropy, accuracy).
    """
    audio_in = layers.Input(shape=(1, int(SR*DT)), name='input')
    mel_layer = Melspectrogram(
        n_dft=512,
        n_hop=160,
        padding='same',
        sr=SR,
        n_mels=128,
        fmin=0.0,
        fmax=SR/2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='melbands',
    )
    net = mel_layer(audio_in)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)

    # Swap the first two non-batch axes, then flatten each step to a vector.
    net = layers.Permute((2, 1, 3), name='permute')(net)
    net = TimeDistributed(layers.Reshape((-1,)), name='reshape')(net)

    skip = TimeDistributed(layers.Dense(64, activation='tanh'),
                           name='td_dense_tanh')(net)
    recurrent = layers.Bidirectional(layers.LSTM(32, return_sequences=True),
                                     name='bidirectional_lstm')(skip)
    net = layers.concatenate([skip, recurrent], axis=2,
                             name='skip_connection')

    net = layers.Dense(64, activation='relu', name='dense_1_relu')(net)
    net = layers.MaxPooling1D(name='max_pool_1d')(net)
    net = layers.Dense(32, activation='relu', name='dense_2_relu')(net)
    net = layers.Flatten(name='flatten')(net)
    net = layers.Dropout(rate=0.2, name='dropout')(net)
    net = layers.Dense(32, activation='relu',
                       activity_regularizer=l2(0.001),
                       name='dense_3_relu')(net)
    probs = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(net)

    model = Model(inputs=audio_in, outputs=probs,
                  name='long_short_term_memory')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
def Conv1D(N_CLASSES=3, SR=5000, DT=0.25):
    """Build and compile a time-distributed 1-D convolutional classifier.

    Args:
        N_CLASSES: width of the softmax output.
        SR: sampling rate; also sets ``fmax = SR/2``.
        DT: multiplied by ``SR`` to get the input length ``int(SR*DT)``.

    Returns:
        The compiled model (adam, categorical cross-entropy, accuracy).
    """
    audio_in = layers.Input(shape=(1, int(SR*DT)), name='input')
    net = Melspectrogram(
        n_dft=512,
        n_hop=160,
        padding='same',
        sr=SR,
        n_mels=128,
        fmin=0.0,
        fmax=SR/2,
        power_melgram=2.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='melbands',
    )(audio_in)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)
    net = layers.Permute((2, 1, 3), name='permute')(net)
    net = TimeDistributed(layers.Conv1D(8, kernel_size=4, activation='tanh'),
                          name='td_conv_1d_tanh')(net)

    # Each later stage is a 2x2 max-pool followed by a relu Conv1D.
    for stage, n_filters in enumerate((16, 32), start=1):
        net = layers.MaxPooling2D(
            pool_size=(2, 2), name='max_pool_2d_{}'.format(stage))(net)
        net = TimeDistributed(
            layers.Conv1D(n_filters, kernel_size=4, activation='relu'),
            name='td_conv_1d_relu_{}'.format(stage))(net)

    net = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(net)
    net = layers.Dropout(rate=0.1, name='dropout')(net)
    net = layers.Dense(64, activation='relu',
                       activity_regularizer=l2(0.001), name='dense')(net)
    probs = layers.Dense(N_CLASSES, activation='softmax', name='softmax')(net)

    model = Model(inputs=audio_in, outputs=probs, name='1d_convolution')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
def AttRNNSpeechModel(nCategories, samplingrate=16000, inputLength=16000,
                      rnn_func=L.LSTM):
    """Build a recurrent speech classifier with dot-product attention.

    Args:
        nCategories: width of the softmax output.
        samplingrate: sampling rate passed to the mel layer; ``fmax`` is half
            of it.
        inputLength: number of samples in each input vector.
        rnn_func: recurrent layer class wrapped by both ``Bidirectional``
            layers; called as ``rnn_func(64, return_sequences=True)``.

    Returns:
        The model (not compiled here).
    """
    inputs = L.Input((inputLength,), name='input')
    net = L.Reshape((1, -1))(inputs)

    mel_layer = Melspectrogram(
        n_dft=1024,
        n_hop=128,
        input_shape=(1, inputLength),
        padding='same',
        sr=samplingrate,
        n_mels=80,
        fmin=40.0,
        fmax=samplingrate / 2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='mel_stft',
    )
    mel_layer.trainable = False
    net = mel_layer(net)
    net = Normalization2D(int_axis=0, name='mel_stft_norm')(net)

    # Melspectrogram puts the sequence in shape
    # (batch_size, melDim, timeSteps, 1); LSTMs want time before mel.
    net = L.Permute((2, 1, 3))(net)

    for n_filters in (10, 1):
        net = L.Conv2D(n_filters, (5, 1), activation='relu',
                       padding='same')(net)
        net = L.BatchNormalization()(net)

    net = L.Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(net)

    # Two stacked bidirectional layers: [b_s, seq_len, vec_dim].
    for _ in range(2):
        net = L.Bidirectional(rnn_func(64, return_sequences=True))(net)

    # The query is projected from the final time step: [b_s, vec_dim].
    last_step = L.Lambda(lambda q: q[:, -1])(net)
    query = L.Dense(128)(last_step)

    # Dot-product attention over the sequence: [b_s, seq_len].
    scores = L.Dot(axes=[1, 2])([query, net])
    scores = L.Softmax(name='attSoftmax')(scores)

    # Attention-weighted sum of the sequence: [b_s, vec_dim].
    context = L.Dot(axes=[1, 1])([scores, net])

    net = L.Dense(64, activation='relu')(context)
    net = L.Dense(32)(net)
    output = L.Dense(nCategories, activation='softmax', name='output')(net)

    return Model(inputs=[inputs], outputs=[output])
def __spec_model(self, input_shape, decibel_gram):
    """Return a Sequential of a Spectrogram layer and per-'freq' normalization.

    Args:
        input_shape: forwarded to the ``Spectrogram`` layer.
        decibel_gram: forwarded as ``return_decibel_spectrogram``.
    """
    front_end = Sequential()
    spectrogram = Spectrogram(return_decibel_spectrogram=decibel_gram,
                              input_shape=input_shape)
    front_end.add(spectrogram)
    front_end.add(Normalization2D(str_axis='freq'))
    return front_end
def build_model_vggish(classes, dropout_final=0.2, shape=(None, 320000),
                       sr=16000, rnn_type='gru', rnn_units=256,
                       focal_alpha=0.95, rnn_layers=1, rnn_dropout=0.2,
                       activation='elu', random_noise=0.2, weights='soundnet'):
    """Build a VGGish feature extractor followed by a recurrent classifier.

    Args:
        classes: sized collection; ``len(classes)`` sets the output count.
        dropout_final, rnn_type, rnn_units, rnn_layers, rnn_dropout: forwarded
            to ``rnn_classifier_branch``.
        shape: input shape including the batch axis; ``shape[1:]`` is used.
        sr: sampling rate for the mel layer.
        focal_alpha, activation: accepted but not used inside this function.
        random_noise: when truthy, the ``power`` of an ``AdditiveNoise`` layer.
        weights: passed as ``load_weights`` to ``VGGish``; when not None the
            VGGish layers are frozen.

    Returns:
        A ``(model, vggish)`` tuple. ``model.summary()`` is printed as a side
        effect.
    """
    inputs = keras.Input(shape=shape[1:])
    net = keras.layers.Reshape(target_shape=(1, -1))(inputs)
    net = Melspectrogram(
        n_dft=512,
        n_hop=256,
        padding='same',
        sr=sr,
        n_mels=64,
        fmin=125,
        fmax=7500,
        power_melgram=1.0,
        return_decibel_melgram=True,
        name='trainable_stft',
    )(net)

    if random_noise:
        net = AdditiveNoise(power=random_noise, random_gain=True)(net)

    net = Normalization2D(str_axis='freq')(net)
    net = Lambda(
        lambda t: K.permute_dimensions(x=t, pattern=(0, 2, 1, 3)),
        name="transpose",
    )(net)

    vggish = VGGish(include_top=False,
                    load_weights=weights,
                    input_shape=net.get_shape().as_list()[1:],
                    pooling=None)

    # Only freeze when using pretrained layers.
    if weights is not None:
        for vgg_layer in vggish.layers:
            vgg_layer.trainable = False

    net = vggish(net)
    net = keras.layers.AveragePooling2D(pool_size=(1, 4))(net)
    net = keras.layers.Reshape(target_shape=(-1, 512))(net)

    outputs = rnn_classifier_branch(
        net,
        name='rnn',
        dropout=rnn_dropout,
        dropout_final=dropout_final,
        rnn_units=rnn_units,
        rnn_type=rnn_type,
        n_classes=len(classes),
        rnn_layers=rnn_layers,
    )

    model = keras.Model(inputs=inputs, outputs=outputs, name='crnn')
    model.summary()
    return model, vggish
def stft_model(audio_len, normalize=True, **kwargs):
    """Build an STFT preprocessing model.

    Pass normalize=False to disable the normalization layer. Extra keyword
    arguments go to the Spectrogram layer; see
    https://github.com/keunwoochoi/kapre/blob/master/kapre/time_frequency.py#L11.
    """
    stages = [Spectrogram(input_shape=(1, audio_len), **kwargs)]
    if normalize:
        stages = stages + [Normalization2D(str_axis='freq')]
    return Sequential(stages)
def ConvSpeechModel(nCategories, samplingrate=16000, inputLength=16000):
    """Base fully convolutional model for speech recognition.

    Args:
        nCategories: width of the softmax output.
        samplingrate: sampling rate for the mel layer; ``fmax`` is half of it.
        inputLength: number of samples in each input vector.

    Returns:
        The model named 'ConvSpeechModel' (not compiled here).
    """
    inputs = Input((inputLength, ))
    net = Reshape((1, -1))(inputs)
    net = Melspectrogram(
        n_dft=1024,
        n_hop=128,
        input_shape=(1, inputLength),
        padding='same',
        sr=samplingrate,
        n_mels=80,
        fmin=40.0,
        fmax=samplingrate / 2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='mel_stft',
    )(net)
    net = Normalization2D(int_axis=0)(net)

    # Melspectrogram puts the sequence in shape
    # (batch_size, melDim, timeSteps, 1); move the time axis first.
    net = Permute((2, 1, 3))(net)

    # (filters, kernel, pool, dropout rate or None) for each conv block.
    block_specs = (
        (20, (5, 1), (2, 1), 0.03),
        (40, (3, 3), (2, 2), 0.01),
        (80, (3, 3), (2, 2), None),
    )
    for n_filters, kernel, pool, drop_rate in block_specs:
        net = Conv2D(n_filters, kernel, activation='relu',
                     padding='same')(net)
        net = BatchNormalization()(net)
        net = MaxPooling2D(pool)(net)
        if drop_rate is not None:
            net = Dropout(drop_rate)(net)

    net = Flatten()(net)
    net = Dense(64, activation='relu')(net)
    net = Dense(32, activation='relu')(net)
    output = Dense(nCategories, activation='softmax')(net)

    return Model(inputs=[inputs], outputs=[output], name='ConvSpeechModel')
def attention_speech_model(num_category, sampling_rate=16000,
                           input_length=16000):
    """Build a bidirectional-LSTM speech classifier with attention.

    The attention query is projected from the middle time step of the
    recurrent output.

    Args:
        num_category: width of the softmax output.
        sampling_rate: sampling rate for the mel layer; ``fmax`` is half of it.
        input_length: number of samples in each input vector.

    Returns:
        The model (not compiled here).
    """
    inputs = layers.Input((input_length, ), name='input')
    net = layers.Reshape((1, -1))(inputs)

    mel_layer = Melspectrogram(
        input_shape=(1, input_length),
        n_dft=1024,
        n_hop=128,
        padding='same',
        sr=sampling_rate,
        n_mels=80,
        fmin=40.0,
        fmax=sampling_rate / 2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='mel_tft',
    )
    mel_layer.trainable = False
    net = mel_layer(net)
    net = Normalization2D(int_axis=0, name='norm')(net)
    net = layers.Permute((2, 1, 3))(net)

    net = layers.Conv2D(10, (5, 1), activation='relu', padding='same')(net)
    net = layers.LeakyReLU()(net)
    net = layers.BatchNormalization()(net)
    net = layers.Conv2D(1, (5, 1), activation='relu', padding='same')(net)
    net = layers.BatchNormalization()(net)

    net = layers.Lambda(lambda t: K.squeeze(t, -1),
                        name='squeeze_last_dim')(net)

    for _ in range(2):
        net = layers.Bidirectional(
            layers.LSTM(64, return_sequences=True))(net)

    # Query comes from the step at index shape[1] // 2 of the sequence.
    middle_step = layers.Lambda(lambda t: t[:, t.shape[1] // 2])(net)
    query = layers.Dense(128)(middle_step)

    scores = layers.Dot([1, 2])([query, net])
    scores = layers.Softmax(name='attention_softmax')(scores)
    context = layers.Dot(axes=[1, 1])([scores, net])

    net = layers.Dense(64)(context)
    net = layers.LeakyReLU()(net)
    net = layers.Dropout(0.5)(net)
    net = layers.Dense(32)(net)
    net = layers.Dropout(0.5)(net)
    out = layers.Dense(num_category, activation='softmax',
                       name="output")(net)

    return Model(inputs=inputs, outputs=out)
def Listen(input_length, parametres_melspectgtom, parametres_CNN,
           parametres_BRNN):
    """Build the encoder: optional mel front end, CNN, stacked bidirectional RNNs.

    Args:
        input_length: length of the raw input audio vectors (used only when
            the inputs are not already MFCCs).
        parametres_melspectgtom: mapping with the Melspectrogram layer
            settings; ``parametres_melspectgtom["mfccs"] == False`` selects
            the raw-audio path, any other value the precomputed-MFCC path
            whose input shape is fixed to ``(517, 13, 1)``.
        parametres_CNN: parameters handed to ``BuildCNN`` for the CNN placed
            right after the inputs.
        parametres_BRNN: sized sequence with one cell-parameter entry per
            bidirectional layer; its length is the number of layers.

    Returns:
        ``(encoder_inputs, encoder_outputs, encoder_state_fbw)`` where the
        last item is ``[forward_h, backward_h, forward_c, backward_c]`` of the
        final bidirectional layer.

    Raises:
        ValueError: if ``parametres_BRNN`` is empty (there would be no
            encoder output or state to return).
    """
    number_layers = len(parametres_BRNN)
    if number_layers == 0:
        raise ValueError(
            "parametres_BRNN must describe at least one bidirectional layer")

    # `== False` is kept on purpose: a value such as None must still select
    # the MFCC path, exactly as before.
    if parametres_melspectgtom["mfccs"] == False:
        # Raw audio: compute the mel-spectrogram inside the graph.
        encoder_inputs = L.Input(shape=(input_length, ))
        encoder = L.Reshape((1, -1))(encoder_inputs)
        m = Build_MelSpectrogram(parametres_melspectgtom, input_length)
        encoder = m(encoder)
        encoder = Normalization2D(name='mel_stft_norm',
                                  str_axis='freq')(encoder)
        # Melspectrogram puts the sequence in shape
        # (batch_size, melDim, timeSteps, 1); LSTMs want
        # (batch_size, timeSteps, melDim, 1).
        encoder = L.Permute((2, 1, 3))(encoder)
    else:
        # The inputs are already MFCCs.
        encoder_inputs = L.Input(shape=(517, 13, 1))
        encoder = encoder_inputs

    encoder = BuildCNN(parametres_CNN, encoder)
    encoder = L.Lambda(lambda q: K.squeeze(q, -1),
                       name='squeeze_last_dim')(encoder)

    inputs = encoder
    encoder_state_fbw = None
    for parametre in parametres_BRNN:
        print(parametre)
        bltsm_layer = Build_Bidiractionnel_layer(parametre)
        (encoder_outputs, forward_h, forward_c,
         backward_h, backward_c) = bltsm_layer(
             inputs, initial_state=encoder_state_fbw)
        state_h = L.Concatenate()([forward_h, backward_h])
        state_c = L.Concatenate()([forward_c, backward_c])
        # NOTE(review): the states are re-ordered to (fw_h, bw_h, fw_c, bw_c)
        # before being fed to the next layer as initial_state — confirm this
        # matches the order Build_Bidiractionnel_layer expects.
        encoder_state_fbw = [forward_h, backward_h, forward_c, backward_c]
        inputs = L.Dropout(0.1)(encoder_outputs)

    print("end")
    print("shape of encoder_outupts ", encoder_outputs.shape)
    print("shape of encode_states ", state_h.shape, state_c.shape)
    return encoder_inputs, encoder_outputs, encoder_state_fbw
def RNNSpeechModel(nCategories, samplingrate=16000, inputLength=16000):
    """Build a simple stacked bidirectional-LSTM speech classifier.

    Args:
        nCategories: width of the softmax output.
        samplingrate: sampling rate for the mel layer; ``fmax`` is half of it.
        inputLength: number of samples in each input vector.

    Returns:
        The model (not compiled here).
    """
    inputs = Input((inputLength, ))
    net = Reshape((1, -1))(inputs)
    net = Melspectrogram(
        n_dft=1024,
        n_hop=128,
        input_shape=(1, inputLength),
        padding='same',
        sr=samplingrate,
        n_mels=80,
        fmin=40.0,
        fmax=samplingrate / 2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='mel_stft',
    )(net)
    net = Normalization2D(int_axis=0)(net)

    # Melspectrogram puts the sequence in shape
    # (batch_size, melDim, timeSteps, 1); LSTMs want time before mel.
    net = Permute((2, 1, 3))(net)

    for n_filters in (10, 1):
        net = Conv2D(n_filters, (5, 1), activation='relu',
                     padding='same')(net)
        net = BatchNormalization()(net)

    net = Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(net)

    # The first layer keeps the sequence; the second returns only its
    # final output.
    net = Bidirectional(LSTM(64, return_sequences=True))(net)
    net = Bidirectional(LSTM(64))(net)

    for width in (64, 32):
        net = Dense(width, activation='relu')(net)
    output = Dense(nCategories, activation='softmax')(net)

    return Model(inputs=[inputs], outputs=[output])
def __mel_spec_model(self, input_shape, n_mels, power_melgram, decibel_gram):
    """Return a Sequential of a Melspectrogram layer and 'freq' normalization.

    The sampling rate is read from ``self._sr`` and the mel filterbank is
    created with ``trainable_fb=False``.

    Args:
        input_shape: forwarded to the ``Melspectrogram`` layer.
        n_mels: forwarded as ``n_mels``.
        power_melgram: forwarded as ``power_melgram``.
        decibel_gram: forwarded as ``return_decibel_melgram``.
    """
    front_end = Sequential()
    mel_layer = Melspectrogram(sr=self._sr,
                               n_mels=n_mels,
                               power_melgram=power_melgram,
                               return_decibel_melgram=decibel_gram,
                               input_shape=input_shape,
                               trainable_fb=False)
    front_end.add(mel_layer)
    front_end.add(Normalization2D(str_axis='freq'))
    return front_end
def Conv1D(input_length, num_classes):
    """Build a time-distributed 1-D convolutional classifier (sr fixed at 16000).

    Args:
        input_length: sample count; the input shape is ``(1, input_length)``.
        num_classes: width of the softmax output.

    Returns:
        The model named '1d_convolution' (not compiled here).
    """
    audio_in = layers.Input(shape=(1, input_length), name='input')
    net = Melspectrogram(
        n_dft=512,
        n_hop=160,
        padding='same',
        sr=16000,
        n_mels=128,
        fmin=0.0,
        fmax=16000 / 2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='melbands',
    )(audio_in)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)
    net = layers.Permute((2, 1, 3), name='permute')(net)
    net = layers.TimeDistributed(
        layers.Conv1D(8, kernel_size=4, activation='tanh'),
        name='td_conv_1d_tanh')(net)

    # Each later stage is a 2x2 max-pool followed by a relu Conv1D.
    for stage, n_filters in enumerate((16, 32, 64, 128), start=1):
        net = layers.MaxPooling2D(
            pool_size=(2, 2), name='max_pool_2d_{}'.format(stage))(net)
        net = layers.TimeDistributed(
            layers.Conv1D(n_filters, kernel_size=4, activation='relu'),
            name='td_conv_1d_relu_{}'.format(stage))(net)

    net = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(net)
    net = layers.Dropout(rate=0.1, name='dropout')(net)
    net = layers.Dense(64, activation='relu',
                       activity_regularizer=l2(0.001), name='dense')(net)
    probs = layers.Dense(num_classes, activation='softmax',
                         name='softmax')(net)

    return Model(inputs=audio_in, outputs=probs, name='1d_convolution')
def depth_separable_cnn(input_shape=(1, 16000),
                        sr=16000,
                        loss=keras.losses.categorical_crossentropy,
                        optimizer=None):
    """Build and compile a depthwise-separable CNN on a mel-spectrogram.

    Args:
        input_shape: shape of one raw audio example, passed to the
            Melspectrogram layer.
        sr: sampling rate; ``fmax`` is set to ``sr / 2``.
        loss: loss passed to ``model.compile``.
        optimizer: optimizer passed to ``model.compile``. When None, a fresh
            ``keras.optimizers.adam()`` is created for this model. The
            previous default built one optimizer at definition time and
            shared it between every model built with defaults.

    Returns:
        The compiled Sequential model with a 12-way softmax output.
    """
    if optimizer is None:
        # One optimizer per model: optimizer objects carry training state.
        optimizer = keras.optimizers.adam()

    model = Sequential()

    # A mel-spectrogram layer.
    model.add(
        Melspectrogram(n_dft=512,
                       n_hop=512,
                       input_shape=input_shape,
                       padding='same',
                       sr=sr,
                       n_mels=128,
                       fmin=0.0,
                       fmax=sr / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='trainable_stft'))
    # Additive white noise.
    model.add(AdditiveNoise(power=0.1))
    # Per-frequency normalisation
    # (or 'channel', 'time', 'batch', 'data_sample').
    model.add(Normalization2D(str_axis='freq'))

    # NOTE(review): some layers below pass dim_ordering="th" and others do
    # not, so the data format differs between layers — confirm this mix is
    # intended.
    model.add(Conv2D(64, kernel_size=(20, 8), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), dim_ordering="th"))
    model.add(Dropout(0.25))

    # Depth-separable section: separable 5x5 then pointwise 1x1, twice.
    model.add(
        SeparableConv2D(64,
                        kernel_size=(5, 5),
                        activation='relu',
                        dim_ordering="th"))
    model.add(BatchNormalization())
    model.add(
        Conv2D(64, kernel_size=(1, 1), activation='relu', dim_ordering="th"))
    model.add(BatchNormalization())
    model.add(SeparableConv2D(64, kernel_size=(5, 5), activation='relu'))
    model.add(BatchNormalization())
    model.add(Conv2D(64, kernel_size=(1, 1), activation='relu'))
    model.add(BatchNormalization())

    model.add(AveragePooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(12, activation='softmax'))

    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model
def train():
    """Build the mel-spectrogram CNN, resume from a checkpoint if any, and fit.

    Reads the module-level ``channelCount``, ``sampleCount``,
    ``trainDataPath``, ``testDataPath`` and ``modelName``. Weights are loaded
    from 'weights.hdf5' when that file exists, the best weights are
    checkpointed there during training, and the final model is saved to
    ``modelName``.
    """
    pool_size = (2, 2)
    # 350 samples
    input_shape = (channelCount, sampleCount)
    sr = 44100

    model = Sequential()
    stack = [
        Melspectrogram(n_dft=512, n_hop=256, input_shape=input_shape,
                       padding='same', sr=sr, n_mels=128,
                       fmin=0.0, fmax=sr/2, power_melgram=1.0,
                       return_decibel_melgram=False, trainable_fb=False,
                       trainable_kernel=False,
                       name='trainable_stft'),
        AdditiveNoise(power=0.2),
        # or 'channel', 'time', 'batch', 'data_sample'
        Normalization2D(str_axis='freq'),
        Convolution2D(32, 3, 3),
        BatchNormalization(axis=1),
        ELU(alpha=1.0),
        MaxPooling2D(pool_size=pool_size),
        Dropout(0.25),
        Flatten(),
        Dense(128),
        Activation('relu'),
        Dropout(0.5),
        Dense(len(get_class_names())),
        Activation("softmax"),
    ]
    for layer in stack:
        model.add(layer)
    model.compile('adam', 'categorical_crossentropy')

    train_x, train_y = loadData(trainDataPath)

    checkpoint_filepath = 'weights.hdf5'
    print("Looking for previous weights...")
    if os.path.isfile(checkpoint_filepath):
        print('Checkpoint file detected. Loading weights.')
        model.load_weights(checkpoint_filepath)
    else:
        print('No checkpoint file detected. Starting from scratch.')
    checkpointer = ModelCheckpoint(filepath=checkpoint_filepath,
                                   verbose=1,
                                   save_best_only=True)

    # The test split doubles as the validation set.
    test_x, test_y = loadData(testDataPath)

    model.fit(train_x, train_y,
              batch_size=128,
              nb_epoch=100,
              verbose=1,
              validation_data=(test_x, test_y),
              callbacks=[checkpointer])
    model.save(modelName)
def conv1d(input_shape, sr):
    """Build and compile a deep time-distributed 1-D convolutional classifier.

    Uses the module-level ``N_DFT``, ``HOP_LENGTH``, ``N_MELS`` and
    ``NUM_CLASSES`` settings.

    Args:
        input_shape: shape of one input example.
        sr: sampling rate; ``fmax`` is set to ``sr / 2``.

    Returns:
        The compiled model (adam, categorical cross-entropy, accuracy).
    """
    audio_in = layers.Input(shape=input_shape, name='input')
    net = Melspectrogram(
        n_dft=N_DFT,
        n_hop=HOP_LENGTH,
        padding='same',
        sr=sr,
        n_mels=N_MELS,
        fmin=0.0,
        fmax=sr / 2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='melbands',
    )(audio_in)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)
    net = layers.Permute((2, 1, 3), name='permute')(net)
    net = TimeDistributed(layers.Conv1D(8, kernel_size=4, activation='tanh'),
                          name='td_conv_1d_tanh')(net)

    # Stages 1-5: a 2x2 max-pool followed by a relu Conv1D.
    for stage, n_filters in enumerate((16, 32, 64, 128, 256), start=1):
        net = layers.MaxPooling2D(
            pool_size=(2, 2), name='max_pool_2d_{}'.format(stage))(net)
        net = TimeDistributed(
            layers.Conv1D(n_filters, kernel_size=4, activation='relu'),
            name='td_conv_1d_relu_{}'.format(stage))(net)

    # The sixth relu convolution follows directly, with no pooling before it.
    net = TimeDistributed(
        layers.Conv1D(512, kernel_size=4, activation='relu'),
        name='td_conv_1d_relu_6')(net)

    net = layers.GlobalMaxPooling2D(name='global_max_pooling_2d')(net)
    net = layers.Dropout(rate=0.2, name='dropout')(net)
    net = layers.Dense(64, activation='relu',
                       activity_regularizer=l2(0.001), name='dense')(net)
    probs = layers.Dense(NUM_CLASSES, activation='softmax',
                         name='softmax')(net)

    model = Model(inputs=audio_in, outputs=probs, name='1d_convolution')
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
def convolution_speech_model(num_category, sampling_rate=16000,
                             input_length=16000):
    """Build a fully convolutional speech classifier on a mel-spectrogram.

    Args:
        num_category: width of the softmax output.
        sampling_rate: sampling rate for the mel layer; ``fmax`` is half of it.
        input_length: number of samples in each input vector.

    Returns:
        The model named 'ConvSpeechModel' (not compiled here).
    """
    inputs = layers.Input((input_length, ))
    net = layers.Reshape((1, -1))(inputs)
    net = Melspectrogram(
        n_dft=1024,
        n_hop=128,
        input_shape=(1, input_length),
        padding='same',
        sr=sampling_rate,
        n_mels=80,
        fmin=40.0,
        fmax=sampling_rate / 2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='mel_stft',
    )(net)
    net = Normalization2D(int_axis=0)(net)
    net = layers.Permute((2, 1, 3))(net)

    # (filters, kernel, pool, dropout rate or None) for each conv block.
    block_specs = (
        (20, (5, 1), (2, 1), 0.03),
        (40, (3, 3), (2, 2), 0.01),
        (80, (3, 3), (2, 2), None),
    )
    for n_filters, kernel, pool, drop_rate in block_specs:
        net = layers.Conv2D(n_filters, kernel, activation='relu',
                            padding='same')(net)
        net = layers.BatchNormalization()(net)
        net = layers.MaxPooling2D(pool)(net)
        if drop_rate is not None:
            net = layers.Dropout(drop_rate)(net)

    net = layers.Flatten()(net)
    net = layers.Dense(64, activation='relu')(net)
    net = layers.Dense(32, activation='relu')(net)
    output = layers.Dense(num_category, activation='softmax')(net)

    return Model(inputs=[inputs], outputs=[output], name='ConvSpeechModel')
def Att_RNN_Speech(x_train, y_train, classes, sampling_rate=16000,
                   input_length=16000, batch_size=32, epochs=3):
    """Build, compile, fit and save an attention-based recurrent speech model.

    The model summary is printed and the trained model is written to
    'Att_RNN_Speech.model'. Nothing is returned.

    Args:
        x_train: first positional argument of ``model.fit``.
        y_train: passed to ``model.fit`` as ``validation_data``.
        classes: width of the softmax output.
        sampling_rate: sampling rate for the mel layer; ``fmax`` is half of it.
        input_length: number of samples in each input vector.
        batch_size: batch size for ``model.fit``.
        epochs: epoch count for ``model.fit``.
    """
    inputs = Input((input_length,))
    net = Reshape((1, -1))(inputs)

    mel_layer = Melspectrogram(
        n_dft=1024,
        n_hop=128,
        input_shape=(1, input_length),
        padding='same',
        sr=sampling_rate,
        n_mels=80,
        fmin=40.0,
        fmax=sampling_rate / 2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
    )
    mel_layer.trainable = False
    net = mel_layer(net)
    net = Normalization2D(int_axis=0)(net)
    net = Permute((2, 1, 3))(net)

    for n_filters in (10, 1):
        net = Conv2D(n_filters, (5, 1), activation='relu',
                     padding='same')(net)
        net = BatchNormalization()(net)

    net = Lambda(lambda q: squeeze(q, -1))(net)

    for _ in range(2):
        net = Bidirectional(LSTM(64, return_sequences=True))(net)

    # The attention query is projected from the final time step.
    last_step = Lambda(lambda q: q[:, -1])(net)
    query = Dense(128)(last_step)
    scores = Dot(axes=[1, 2])([query, net])
    scores = Softmax()(scores)
    context = Dot(axes=[1, 1])([scores, net])

    net = Dense(64, activation='relu')(context)
    net = Dense(32)(net)
    output = Dense(classes, activation='softmax')(net)

    model = Model(inputs=[inputs], outputs=[output])
    model.compile(optimizer='adam',
                  loss=['sparse_categorical_crossentropy'],
                  metrics=['sparse_categorical_accuracy'])
    model.summary()

    # NOTE(review): y_train is supplied as validation_data and no separate
    # targets are given — presumably both arguments are batch generators;
    # confirm against the caller.
    model.fit(x_train,
              validation_data=y_train,
              epochs=epochs,
              batch_size=batch_size,
              use_multiprocessing=False,
              workers=4,
              verbose=2)
    model.save('Att_RNN_Speech.model')
def build_CNN_model(self):
    """Create the three-block spectrogram CNN and store it on ``self.model``.

    Reads ``self.x_augmented_rolled`` (for the input shape), ``self.dropout``,
    ``self.n_classes`` and ``self.MODELNAME``; sets ``self.model`` and
    ``self.saved_model_name``. The model summary is printed.
    """
    # Define the CNN architecture.
    print('Build model...')
    self.model = net = Sequential()

    net.add(Spectrogram(n_dft=128,
                        n_hop=16,
                        input_shape=(self.x_augmented_rolled.shape[1:]),
                        return_decibel_spectrogram=False,
                        power_spectrogram=2.0,
                        trainable_kernel=False,
                        name='static_stft'))
    net.add(Normalization2D(str_axis='freq'))

    # One entry of Conv2D keyword arguments per convolutional block.
    conv_specs = (
        dict(filters=24, kernel_size=(12, 12), strides=(1, 1), name='conv1'),
        dict(filters=48, kernel_size=(8, 8), name='conv2'),
        dict(filters=96, kernel_size=(4, 4), name='conv3'),
    )
    for spec in conv_specs:
        net.add(Conv2D(border_mode='same', **spec))
        net.add(BatchNormalization(axis=1))
        net.add(MaxPooling2D(pool_size=(2, 2),
                             strides=(2, 2),
                             padding='valid',
                             data_format='channels_last'))
        net.add(Activation('relu'))
        net.add(Dropout(self.dropout))

    # Classifier head.
    net.add(Flatten())
    net.add(Dense(self.n_classes))
    net.add(Activation('softmax'))

    print(net.summary())
    self.saved_model_name = self.MODELNAME
def LSTM(input_length, num_classes):
    """Build a bidirectional-LSTM classifier with a skip connection (sr 16000).

    Args:
        input_length: sample count; the input shape is ``(1, input_length)``.
        num_classes: width of the softmax output.

    Returns:
        The model named 'long_short_term_memory' (not compiled here).
    """
    audio_in = layers.Input(shape=(1, input_length), name='input')
    mel_layer = Melspectrogram(
        n_dft=512,
        n_hop=160,
        padding='same',
        sr=16000,
        n_mels=128,
        fmin=0.0,
        fmax=16000 / 2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='melbands',
    )
    net = mel_layer(audio_in)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)

    # Swap the first two non-batch axes, then flatten each step to a vector.
    net = layers.Permute((2, 1, 3), name='permute')(net)
    net = layers.TimeDistributed(layers.Reshape((-1, )),
                                 name='reshape')(net)

    skip = layers.TimeDistributed(layers.Dense(64, activation='tanh'),
                                  name='td_dense_tanh')(net)
    recurrent = layers.Bidirectional(layers.LSTM(32, return_sequences=True),
                                     name='bidirectional_lstm')(skip)
    net = layers.concatenate([skip, recurrent], axis=2,
                             name='skip_connection')

    net = layers.Dense(64, activation='relu', name='dense_1_relu')(net)
    net = layers.MaxPooling1D(name='max_pool_1d')(net)
    net = layers.Dense(32, activation='relu', name='dense_2_relu')(net)
    net = layers.Flatten(name='flatten')(net)
    net = layers.Dropout(rate=0.2, name='dropout')(net)
    net = layers.Dense(32, activation='relu',
                       activity_regularizer=l2(0.001),
                       name='dense_3_relu')(net)
    probs = layers.Dense(num_classes, activation='softmax',
                         name='softmax')(net)

    return Model(inputs=audio_in, outputs=probs,
                 name='long_short_term_memory')
def MLP_model(input_shape, dropout=0.5, print_summary=False):
    """Build and compile a two-class MLP on top of an STFT spectrogram.

    The hidden width and depth come from the module-level
    ``neurons_per_layer`` and ``n_hidden_layers``.

    Args:
        input_shape: shape of one input example, given to the Spectrogram.
        dropout: accepted but not used; every Dropout layer uses 0.2.
            NOTE(review): confirm whether this was meant to set the rate.
        print_summary: when truthy, print ``model.summary()``.

    Returns:
        The compiled Sequential model.
    """
    # The network is a plain Sequential stack.
    model = Sequential()

    # Spectrogram creation using the STFT.
    stft = Spectrogram(n_dft=128,
                       n_hop=16,
                       input_shape=input_shape,
                       return_decibel_spectrogram=False,
                       power_spectrogram=2.0,
                       trainable_kernel=False,
                       name='static_stft')
    model.add(stft)
    model.add(Normalization2D(str_axis='freq'))
    model.add(Flatten())

    # First hidden layer (always present).
    model.add(Dense(neurons_per_layer, activation='relu',
                    input_shape=(784, )))
    model.add(Dropout(0.2))

    # Remaining hidden layers, if more than one was requested.
    extra_hidden = n_hidden_layers - 1
    for _ in range(extra_hidden):
        model.add(Dense(neurons_per_layer, activation='relu'))
        model.add(Dropout(0.2))

    # Two classes only.
    model.add(Dense(2))
    model.add(Activation('softmax'))

    if print_summary:
        print(model.summary())

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
def CNN_model(input_shape, dropout=0.5, print_summary=False):
    """Build and compile a two-class, three-block CNN on an STFT spectrogram.

    Args:
        input_shape: shape of one input example, given to the Spectrogram.
        dropout: rate of the single Dropout layer placed after the last
            convolutional block.
        print_summary: when truthy, print ``model.summary()``.

    Returns:
        The compiled Sequential model.
    """
    # The network is a plain Sequential stack.
    model = Sequential()

    # Spectrogram creation using the STFT.
    stft = Spectrogram(n_dft=128,
                       n_hop=16,
                       input_shape=input_shape,
                       return_decibel_spectrogram=False,
                       power_spectrogram=2.0,
                       trainable_kernel=False,
                       name='static_stft')
    model.add(stft)
    model.add(Normalization2D(str_axis='freq'))

    # One entry of Conv2D keyword arguments per convolutional block.
    conv_specs = (
        dict(filters=24, kernel_size=(12, 12), strides=(1, 1), name='conv1'),
        dict(filters=48, kernel_size=(8, 8), name='conv2'),
        dict(filters=96, kernel_size=(4, 4), name='conv3'),
    )
    for spec in conv_specs:
        model.add(Conv2D(border_mode='same', **spec))
        model.add(BatchNormalization(axis=1))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2),
                               strides=(2, 2),
                               padding='valid',
                               data_format='channels_last'))
    model.add(Dropout(dropout))

    # Classifier head: two classes only.
    model.add(Flatten())
    model.add(Dense(2))
    model.add(Activation('softmax'))

    if print_summary:
        print(model.summary())

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
def attRNN():
    """Build and compile a 9-class attention RNN for 8000-sample inputs.

    Returns:
        The compiled model (adam, sparse categorical cross-entropy). Its
        summary is printed as a side effect.
    """
    sr = 8000
    inputs = Input((8000, 1), name='input')
    net = Reshape((1, -1))(inputs)

    mel_layer = Melspectrogram(
        n_dft=1024,
        n_hop=128,
        input_shape=(1, 8000),
        padding='same',
        sr=sr,
        n_mels=80,
        fmin=40.0,
        fmax=sr / 2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='mel_stft',
    )
    mel_layer.trainable = False
    net = mel_layer(net)
    net = Normalization2D(int_axis=0, name='mel_stft_norm')(net)

    # Melspectrogram puts the sequence in shape
    # (batch_size, melDim, timeSteps, 1); LSTMs want time before mel.
    net = Permute((2, 1, 3))(net)

    for n_filters in (10, 1):
        net = Conv2D(n_filters, (5, 1), activation='relu',
                     padding='same')(net)
        net = BatchNormalization()(net)

    net = Lambda(lambda q: K.squeeze(q, -1), name='squeeze_last_dim')(net)

    # Two stacked bidirectional LSTMs: [b_s, seq_len, vec_dim].
    for _ in range(2):
        net = Bidirectional(LSTM(64, return_sequences=True))(net)

    # The query is projected from the final time step: [b_s, vec_dim].
    last_step = Lambda(lambda q: q[:, -1])(net)
    query = Dense(128)(last_step)

    # Dot-product attention over the sequence: [b_s, seq_len].
    scores = Dot(axes=[1, 2])([query, net])
    scores = Softmax(name='attSoftmax')(scores)

    # Attention-weighted sum of the sequence: [b_s, vec_dim].
    context = Dot(axes=[1, 1])([scores, net])

    net = Dense(64, activation='relu')(context)
    net = Dense(32)(net)
    output = Dense(9, activation='softmax', name='output')(net)

    model = Model(inputs=[inputs], outputs=[output])
    model.compile(optimizer='adam',
                  loss=['sparse_categorical_crossentropy'],
                  metrics=['sparse_categorical_accuracy'])
    model.summary()
    return model
def ConvSpeechModel(nCategories,
                    samplingrate=16000,
                    inputLength=16000,
                    more_blocks=False,
                    bigger_blocks=False,
                    blocks_layers=(20, 40, 80, 160, 320)):
    """Base convolutional model for speech recognition.

    A frozen mel-spectrogram front end (80 mel bands, decibel scaled) is
    followed by three convolution blocks (four when ``more_blocks`` is set),
    a 64-unit dense layer and a softmax classifier.

    Args:
        nCategories: Number of output classes (units of the softmax layer).
        samplingrate: Sampling rate passed to ``Melspectrogram``; ``fmax``
            is set to half of it.
        inputLength: Number of samples per input example.
        more_blocks: Add a fourth convolution block using
            ``blocks_layers[3]`` filters.
        bigger_blocks: Use two Conv2D+BatchNormalization pairs instead of
            one in every 3x3 convolution block.
        blocks_layers: Filter counts per block. Only indices 0-2 are read
            (index 3 as well when ``more_blocks`` is set); index 4 is never
            used by this function. The default is a tuple rather than a
            list so that it cannot be mutated between calls.

    Returns:
        An uncompiled Keras ``Model`` named ``'ConvSpeechModel'``.
    """

    def _conv_bn(tensor, n_filters, kernel_size):
        # One Conv2D (ReLU, 'same' padding) followed by batch normalization.
        tensor = L.Conv2D(n_filters,
                          kernel_size,
                          activation='relu',
                          padding='same')(tensor)
        return L.BatchNormalization()(tensor)

    def _conv_block_3x3(tensor, n_filters):
        # One or two 3x3 conv+BN pairs, depending on `bigger_blocks`.
        tensor = _conv_bn(tensor, n_filters, (3, 3))
        if bigger_blocks:
            tensor = _conv_bn(tensor, n_filters, (3, 3))
        return tensor

    inputs = L.Input((inputLength, ))
    x = L.Reshape((1, -1))(inputs)

    x = Melspectrogram(n_dft=1024,
                       n_hop=128,
                       input_shape=(1, inputLength),
                       padding='same',
                       sr=samplingrate,
                       n_mels=80,
                       fmin=40.0,
                       fmax=samplingrate / 2,
                       power_melgram=1.0,
                       return_decibel_melgram=True,
                       trainable_fb=False,
                       trainable_kernel=False,
                       name='mel_stft')(x)
    x = Normalization2D(int_axis=0)(x)

    # Per the original author's note, Melspectrogram yields
    # (batch_size, melDim, timeSteps, 1); put the time axis first.
    x = L.Permute((2, 1, 3))(x)

    # Block 1: (5, 1) convolution, pooling along the first axis only.
    c1 = _conv_bn(x, blocks_layers[0], (5, 1))
    p1 = L.MaxPooling2D((2, 1))(c1)
    p1 = L.Dropout(0.2)(p1)

    # Block 2
    c2 = _conv_block_3x3(p1, blocks_layers[1])
    p2 = L.MaxPooling2D((2, 2))(c2)
    p2 = L.Dropout(0.3)(p2)

    # Block 3 (no dropout directly after it unless a fourth block follows)
    c3 = _conv_block_3x3(p2, blocks_layers[2])
    p3 = L.MaxPooling2D((2, 2))(c3)

    if more_blocks:
        # Optional block 4
        p3 = L.Dropout(0.3)(p3)
        c4 = _conv_block_3x3(p3, blocks_layers[3])
        p3 = L.MaxPooling2D((2, 2))(c4)

    # Classifier head
    p3 = L.Flatten()(p3)
    p3 = L.Dense(64, activation='relu')(p3)
    p3 = L.Dropout(0.4)(p3)
    output = L.Dense(nCategories, activation='softmax')(p3)

    model = Model(inputs=[inputs], outputs=[output], name='ConvSpeechModel')
    return model
from kapre.utils import Normalization2D
from kapre.augmentation import AdditiveNoise
import keras
from keras import optimizers
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Build the classifier as a Sequential stack: STFT front end, additive noise
# augmentation, frequency-wise normalization, then three 1x1 convolution
# stages. `Sequential`, `Spectrogram`, `Conv2D`, `MaxPooling2D`, `Dense` and
# `input_shape` are expected to be defined earlier in the script.
classifier = Sequential()
classifier.add(
    Spectrogram(n_dft=512,
                n_hop=256,
                padding='same',
                input_shape=input_shape,
                power_spectrogram=2.0,
                return_decibel_spectrogram=False,
                trainable_kernel=False,
                image_data_format='default'))
classifier.add(AdditiveNoise(power=0.2))
classifier.add(Normalization2D(str_axis='freq'))

# NOTE(review): the `input_shape` arguments on the Conv2D layers below are
# given to layers that are not first in the Sequential model; presumably
# Keras ignores them there -- confirm and drop them if so.

# Layer 1
classifier.add(
    Conv2D(24, (1, 1), input_shape=(7192, 11, 1000), activation='relu'))
# The BatchNormalization / Dropout layers used to be constructed as bare
# expressions and thrown away; they must be passed to classifier.add() to
# become part of the model.
classifier.add(
    keras.layers.BatchNormalization(axis=-1,
                                    momentum=0.99,
                                    epsilon=0.001,
                                    center=True,
                                    scale=True,
                                    beta_initializer='zeros',
                                    gamma_initializer='ones',
                                    moving_mean_initializer='zeros',
                                    moving_variance_initializer='ones',
                                    beta_regularizer=None,
                                    gamma_regularizer=None,
                                    beta_constraint=None,
                                    gamma_constraint=None))
classifier.add(MaxPooling2D(pool_size=(2, 2)))
classifier.add(Dense(units=128, activation='relu'))
classifier.add(keras.layers.Dropout(0.5, noise_shape=None, seed=None))

# Layer 2
classifier.add(
    Conv2D(48, (1, 1), input_shape=(32, 32, 24), activation='relu'))
classifier.add(
    keras.layers.BatchNormalization(axis=-1,
                                    momentum=0.99,
                                    epsilon=0.001,
                                    center=True,
                                    scale=True,
                                    beta_initializer='zeros',
                                    gamma_initializer='ones',
                                    moving_mean_initializer='zeros',
                                    moving_variance_initializer='ones',
                                    beta_regularizer=None,
                                    gamma_regularizer=None,
                                    beta_constraint=None,
                                    gamma_constraint=None))
classifier.add(MaxPooling2D(pool_size=(2, 2)))
classifier.add(Dense(units=128, activation='relu'))
classifier.add(keras.layers.Dropout(0.5, noise_shape=None, seed=None))

# Layer 3
classifier.add(
    Conv2D(96, (1, 1), input_shape=(32, 32, 24), activation='relu'))
classifier.add(
    keras.layers.BatchNormalization(axis=-1,
                                    momentum=0.99,
                                    epsilon=0.001,
                                    center=True,
                                    scale=True,
                                    beta_initializer='zeros',
                                    gamma_initializer='ones',
                                    moving_mean_initializer='zeros',
                                    moving_variance_initializer='ones',
                                    beta_regularizer=None,
                                    gamma_regularizer=None,
                                    beta_constraint=None,
                                    gamma_constraint=None))
# Append the mel-spectrogram front end to `melspecModel` (created earlier,
# outside this fragment; `iLen` and `sr` are likewise defined elsewhere).
# The layer takes input of shape (1, iLen), produces 80 mel bands between
# 40 Hz and sr / 2 in decibels, and has both trainable flags switched off.
melspecModel.add(
    Melspectrogram(n_dft=1024,
                   n_hop=128,
                   input_shape=(1, iLen),
                   padding='same',
                   sr=sr,
                   n_mels=80,
                   fmin=40.0,
                   fmax=sr / 2,
                   power_melgram=1.0,
                   return_decibel_melgram=True,
                   trainable_fb=False,
                   trainable_kernel=False,
                   name='mel_stft'))

# Normalize the mel-spectrogram output (Normalization2D with int_axis=0).
melspecModel.add(Normalization2D(int_axis=0))

# Print the layer-by-layer summary of the model built so far.
melspecModel.summary()

# In[45]:

# Disabled check: run the audio through the model and inspect the output
# shape.
#melspec = melspecModel.predict( audios.reshape((-1,1,iLen)) )
#melspec.shape

# # Models
#
# Create Keras models to see if the generators are working properly

# In[46]:

from keras.models import Model, load_model
def add_mel_to_VGGish(content_weights_file_path_og,
                      input_length,
                      sr_hr,
                      n_mels,
                      hoplength,
                      nfft,
                      fmin,
                      fmax,
                      power_melgram,
                      pooling='avg'):
    """Load pretrained VGGish-style weights and prepend a spectrogram front end.

    First a convolutional network taking a (96, 64, 1) spectrogram patch is
    built and its weights are loaded from ``content_weights_file_path_og``.
    Then a new model is assembled that takes raw audio of shape
    ``(1, input_length)``, applies a non-trainable ``Spectrogram`` layer and
    frequency-wise ``Normalization2D``, and re-applies every layer of the
    pretrained network (except its input layer) on top.

    Args:
        content_weights_file_path_og: Path of the weights file passed to
            ``Model.load_weights``.
        input_length: Number of audio samples per example.
        sr_hr, n_mels, fmin, fmax, power_melgram: Accepted for interface
            compatibility but not used by this function.
            NOTE(review): despite the function name, a plain ``Spectrogram``
            (not a mel spectrogram) is inserted -- confirm this is intended.
        hoplength: Hop length (``n_hop``) of the spectrogram layer.
        nfft: FFT size (``n_dft``) of the spectrogram layer.
        pooling: ``'avg'`` (default, the previously hard-coded value) for
            global average pooling, ``'max'`` for global max pooling; any
            other value leaves the last feature map unpooled.

    Returns:
        A Keras ``Model`` mapping raw audio to the pooled embedding.
    """
    NUM_FRAMES = 96  # Frames in input mel-spectrogram patch.
    NUM_BANDS = 64  # Frequency bands in input mel-spectrogram patch.

    # (filters, conv layer names, pool layer name) per block; the names must
    # stay as they are for the pretrained weights to line up.
    blocks = (
        (64, ('conv1', ), 'pool1'),
        (128, ('conv2', ), 'pool2'),
        (256, ('conv3/conv3_1', 'conv3/conv3_2'), 'pool3'),
        (512, ('conv4/conv4_1', 'conv4/conv4_2'), 'pool4'),
    )

    # Pretrained backbone operating on spectrogram patches.
    X = Input(shape=(NUM_FRAMES, NUM_BANDS, 1), name='nob')
    x = X
    for n_filters, conv_names, pool_name in blocks:
        for conv_name in conv_names:
            x = Conv2D(n_filters, (3, 3),
                       strides=(1, 1),
                       activation='relu',
                       padding='same',
                       name=conv_name)(x)
        x = MaxPooling2D((2, 2),
                         strides=(2, 2),
                         padding='same',
                         name=pool_name)(x)

    if pooling == 'avg':
        x = GlobalAveragePooling2D()(x)
    elif pooling == 'max':
        x = GlobalMaxPooling2D()(x)

    model = Model(inputs=X, outputs=x)
    model.load_weights(content_weights_file_path_og)

    # New graph: raw audio -> STFT -> normalization -> pretrained layers.
    X = Input(shape=(1, input_length), name='input_1')
    x = X
    x = Spectrogram(n_dft=nfft,
                    n_hop=hoplength,
                    padding='same',
                    return_decibel_spectrogram=True,
                    trainable_kernel=False,
                    name='stft')(x)
    x = Normalization2D(str_axis='freq')(x)

    # Skip the backbone's own input layer and reuse everything after it.
    for layer in model.layers[1:]:
        x = layer(x)
    return Model(inputs=X, outputs=x)
def create_VGGish(input_length,
                  sr_hr,
                  n_mels,
                  hoplength,
                  nfft,
                  fmin,
                  fmax,
                  power_melgram,
                  pooling='avg'):
    """Wire up a VGGish-style graph on raw audio and return its end tensors.

    The graph reshapes ``(input_length, 1)`` audio to ``(1, input_length)``,
    applies a non-trainable decibel ``Spectrogram`` and frequency-wise
    ``Normalization2D``, then four blocks of 3x3 ReLU convolutions followed
    by 2x2 max pooling, and finally optional global pooling.

    Args:
        input_length: Number of audio samples per example.
        sr_hr, n_mels, fmin, fmax, power_melgram: Accepted but not used by
            this function.
        hoplength: Hop length (``n_hop``) of the spectrogram layer.
        nfft: FFT size (``n_dft``) of the spectrogram layer.
        pooling: ``'avg'`` for global average pooling, ``'max'`` for global
            max pooling; any other value leaves the feature map unpooled.

    Returns:
        A tuple ``(X, x)`` of the input tensor and the output tensor; no
        ``Model`` object is created here.
    """

    def conv3x3(n_filters, layer_name):
        # 3x3 ReLU convolution with unit stride and 'same' padding.
        return Conv2D(n_filters, (3, 3),
                      strides=(1, 1),
                      activation='relu',
                      padding='same',
                      name=layer_name)

    def pool2x2(layer_name):
        # 2x2 max pooling with stride 2 and 'same' padding.
        return MaxPooling2D((2, 2),
                            strides=(2, 2),
                            padding='same',
                            name=layer_name)

    # (filters, conv layer names, pool layer name) for each block.
    blocks = (
        (64, ('conv1', ), 'pool1'),
        (128, ('conv2', ), 'pool2'),
        (256, ('conv3/conv3_1', 'conv3/conv3_2'), 'pool3'),
        (512, ('conv4/conv4_1', 'conv4/conv4_2'), 'pool4'),
    )

    X = Input(shape=(input_length, 1), name='input_1')

    features = Reshape((1, input_length))(X)
    features = Spectrogram(n_dft=nfft,
                           n_hop=hoplength,
                           padding='same',
                           return_decibel_spectrogram=True,
                           trainable_kernel=False,
                           name='stft')(features)
    features = Normalization2D(str_axis='freq')(features)

    for n_filters, conv_names, pool_name in blocks:
        for conv_name in conv_names:
            features = conv3x3(n_filters, conv_name)(features)
        features = pool2x2(pool_name)(features)

    if pooling == 'avg':
        features = GlobalAveragePooling2D()(features)
    elif pooling == 'max':
        features = GlobalMaxPooling2D()(features)

    return X, features