def preProcessingSample(win_length, window=False, preEmphasis=False, spec=False,
                        n_fft=None, n_hop=None, log=False, power=1.0):
    """Build a Keras model that applies optional pre-processing to a waveform.

    Args:
        win_length: number of samples per input frame; input shape is (win_length, 1).
        window: if True, apply the Models.Window lambda (named 'output').
        preEmphasis: if True, apply the Models.PreEmphasis lambda.
        spec: if True, additionally output a kapre Spectrogram of the signal.
        n_fft, n_hop: STFT parameters forwarded to Spectrogram (used only if spec).
        log: if True, Spectrogram returns decibel values.
        power: power applied to the spectrogram magnitudes.

    Returns:
        keras Model with outputs [spectrogram, waveform] when spec is True,
        otherwise [waveform].
    """
    signal_in = Input(shape=(win_length, 1))
    processed = signal_in
    if window:
        processed = Lambda(Models.Window, name='output')(processed)
    if preEmphasis:
        processed = Lambda(Models.PreEmphasis, name='preEmph')(processed)
    if not spec:
        return Model(inputs=[signal_in], outputs=[processed])
    # Spectrogram branch: kapre expects (channel, time), so permute first.
    spec_layer = Spectrogram(n_dft=n_fft, n_hop=n_hop,
                             input_shape=(1, win_length),
                             return_decibel_spectrogram=log,
                             power_spectrogram=power,
                             trainable_kernel=False, name='spec')
    permuted = Lambda(Models.toPermuteDimensions, name='perm_mel')(processed)
    return Model(inputs=[signal_in], outputs=[spec_layer(permuted), processed])
def _test_stereo_same():
    """Tests for
    - stereo input
    - same padding
    - shapes of output channel, n_freq, n_frame
    - save and load a model with it
    """
    num_ch = 2
    dft_size, hop, n_samples = 512, 256, 8000
    waveform = np.random.uniform(-1., 1., (num_ch, n_samples))
    net = keras.models.Sequential()
    net.add(
        Spectrogram(n_dft=dft_size,
                    n_hop=hop,
                    padding='same',
                    power_spectrogram=1.0,
                    return_decibel_spectrogram=False,
                    image_data_format='default',
                    input_shape=(num_ch, n_samples)))
    batch = net.predict(waveform[np.newaxis, :])
    # Which axis carries channel/freq/time depends on the backend layout.
    if image_data_format() == 'channels_last':
        ch_axis, freq_axis, time_axis = 3, 1, 2
    else:
        ch_axis, freq_axis, time_axis = 1, 2, 3
    assert batch.shape[ch_axis] == num_ch
    assert batch.shape[freq_axis] == dft_size // 2 + 1
    assert batch.shape[time_axis] == _num_frame_same(n_samples, hop)
def __spec_model(self, input_shape, decibel_gram):
    """Return a Sequential front-end: kapre Spectrogram followed by
    per-frequency Normalization2D.

    Args:
        input_shape: shape of the raw-audio input fed to the Spectrogram.
        decibel_gram: if True, the spectrogram is returned in decibels.
    """
    net = Sequential()
    spec_layer = Spectrogram(return_decibel_spectrogram=decibel_gram,
                             input_shape=input_shape)
    net.add(spec_layer)
    net.add(Normalization2D(str_axis='freq'))
    return net
def assemble_model(
    src: np.ndarray,
    n_outputs: int,
    arch_layers: list,
    n_dft: int = 512,  # Orig:128
    n_hop: int = 256,  # Orig:64
    data_format: str = "channels_first",
) -> keras.Model:
    """Assemble a spectrogram-based CNN classifier.

    Args:
        src: sample input array; its shape defines the model input.
        n_outputs: number of sigmoid output units.
        arch_layers: conv-layer specs (filters, window_size, strides, activation).
        n_dft, n_hop: STFT parameters for the kapre Spectrogram front-end.
        data_format: 'channels_first' or 'channels_last'.

    Returns:
        Uncompiled keras.Model.
    """
    inputs = keras.Input(shape=src.shape, name="stft")
    # @paper: Spectrogram based CNN that receives the (log) spectrogram matrix as input
    # @kapre:
    #   abs(Spectrogram) in a shape of 2D data, i.e.,
    #   `(None, n_channel, n_freq, n_time)` if `'channels_first'`,
    #   `(None, n_freq, n_time, n_channel)` if `'channels_last'`,
    net = Spectrogram(
        n_dft=n_dft,
        n_hop=n_hop,
        input_shape=src.shape,
        trainable_kernel=True,
        name="static_stft",
        image_data_format=data_format,
        return_decibel_spectrogram=True,
    )(inputs)
    # Swap the freq/time axes to match the paper's layout.
    # TODO: dig in to this (GPU only?)
    perm = (1, 3, 2) if data_format == "channels_first" else (2, 1, 3)
    net = keras.layers.Permute(perm)(net)
    for spec in arch_layers:
        net = keras.layers.Conv2D(
            spec.filters,
            spec.window_size,
            strides=spec.strides,
            activation=spec.activation,
            data_format=data_format,
        )(net)
    # Collapse to a single feature vector for the dense head.
    net = keras.layers.Flatten()(net)
    # @paper: sigmoid activations with binary cross entropy loss
    # @paper: FC-512
    net = keras.layers.Dense(512)(net)
    # @paper: FC-368(sigmoid)
    outputs = keras.layers.Dense(n_outputs, activation="sigmoid",
                                 name="predictions")(net)
    return keras.Model(inputs=inputs, outputs=outputs)
def stft_model(audio_len, normalize=True, **kwargs):
    """Build an STFT preprocessing model.

    Pass normalize=False to disable the normalization layer. Pass arguments to
    https://github.com/keunwoochoi/kapre/blob/master/kapre/time_frequency.py#L11."""
    layers = [Spectrogram(input_shape=(1, audio_len), **kwargs)]
    if normalize:
        layers.append(Normalization2D(str_axis='freq'))
    return Sequential(layers)
def make_kapre_mag_maker(n_fft=1024, hop_length=128, audio_data_len=80000):
    """Return a Sequential model whose single layer computes a fixed
    (non-trainable) power-2 magnitude STFT via kapre.

    Args:
        n_fft: DFT size.
        hop_length: hop between frames.
        audio_data_len: expected number of audio samples (mono, channel-first).
    """
    spec_layer = Spectrogram(n_dft=n_fft,
                             n_hop=hop_length,
                             input_shape=(1, audio_data_len),
                             power_spectrogram=2.0,
                             return_decibel_spectrogram=False,
                             trainable_kernel=False,
                             name='stft')
    front_end = keras.models.Sequential()
    front_end.add(spec_layer)
    return front_end
def _test_correctness():
    """ Tests correctness """
    # Reference audio fixture shipped with the test suite.
    audio_data = np.load('tests/speech_test_file.npz')['audio_data']
    sr = 44100  # NOTE(review): assigned but never used below
    hop_length = 128
    n_fft = 1024
    n_mels = 80  # NOTE(review): assigned but never used below
    # compute with librosa
    S = librosa.core.stft(audio_data, n_fft=n_fft, hop_length=hop_length)
    magnitudes_librosa = librosa.magphase(S, power=2)[0]
    S_DB_librosa = librosa.power_to_db(magnitudes_librosa, ref=np.max)
    # load precomputed
    magnitudes_expected = np.load('tests/test_audio_stft_g0.npy')
    # compute with kapre
    stft_model = tensorflow.keras.models.Sequential()
    # Input shape/orientation depends on the backend image data format.
    stft_model.add(
        Spectrogram(
            n_dft=n_fft,
            n_hop=hop_length,
            input_shape=(len(audio_data), 1) if image_data_format() ==
            'channels_last' else (1, len(audio_data)),
            power_spectrogram=2.0,
            return_decibel_spectrogram=False,
            trainable_kernel=False,
            name='stft',
        ))
    stft = stft_model.predict(
        audio_data.reshape(1, -1, 1) if image_data_format() ==
        'channels_last' else audio_data.reshape(1, 1, -1))
    S = stft
    # Drop the batch and channel axes to get a (freq, time) matrix.
    if image_data_format() == 'channels_last':
        S = S[0, :, :, 0]
    else:
        S = S[0, 0]
    # kapre already returned power-2 magnitudes, so power=1 here keeps them.
    magnitudes_kapre = librosa.magphase(S, power=1)[0]
    S_DB_kapre = librosa.power_to_db(magnitudes_kapre, ref=np.max)
    # Normalised mean decibel difference between the two implementations.
    DB_scale = np.max(S_DB_librosa) - np.min(S_DB_librosa)
    S_DB_dif = np.abs(S_DB_kapre - S_DB_librosa) / DB_scale
    assert np.allclose(magnitudes_expected,
                       magnitudes_kapre,
                       rtol=1e-2,
                       atol=1e-8)
    assert np.mean(S_DB_dif) < 0.015
def test_plot():
    """Visual smoke test: plot a kapre melspectrogram/spectrogram model next to
    a librosa-computed log-spectrogram for comparison."""
    SR = 16000
    src = np.random.random((1, SR * 3))
    # NOTE(review): hard-coded local path — this test only runs on the
    # original author's machine; confirm/replace with a repo-relative fixture.
    src_cute, _ = librosa.load(
        '/Users/admin/Dropbox/workspace/unet/data/audio/abjones_1_01.wav',
        sr=SR,
        mono=True)
    # Trainable mel front-end.
    model = Sequential()
    model.add(
        Melspectrogram(sr=SR,
                       n_mels=128,
                       n_dft=512,
                       n_hop=256,
                       input_shape=src.shape,
                       return_decibel_melgram=True,
                       trainable_kernel=True,
                       name='melgram'))
    check_model(model)
    visualise_model(model)
    # Static (non-trainable) linear STFT front-end.
    SR = 16000
    src = np.random.random((1, SR * 3))
    model = Sequential()
    model.add(
        Spectrogram(n_dft=512,
                    n_hop=256,
                    input_shape=src.shape,
                    return_decibel_spectrogram=False,
                    power_spectrogram=2.0,
                    trainable_kernel=False,
                    name='static_stft'))
    check_model(model)
    # Side-by-side comparison: kapre (left) vs librosa (right).
    plt.figure(figsize=(14, 4))
    plt.subplot(1, 2, 1)
    plt.title('log-Spectrogram by Kapre')
    visualise_model(model, logam=True)
    plt.subplot(1, 2, 2)
    display.specshow(librosa.amplitude_to_db(np.abs(
        librosa.stft(src_cute[:SR * 3], 512, 256))**2,
                                             ref=1.0),
                     y_axis='linear',
                     sr=SR)
    plt.title('log-Spectrogram by Librosa')
    plt.show()
def build_CNN_model(self):
    """Build and store the CNN classifier in ``self.model``.

    Architecture: kapre STFT front-end -> per-frequency normalization ->
    three Conv/BN/MaxPool/ReLU/Dropout blocks -> Flatten -> Dense softmax
    over ``self.n_classes`` classes. Also records ``self.saved_model_name``.

    Reads: self.x_augmented_rolled (training tensor, used for input shape),
    self.dropout, self.n_classes, self.MODELNAME.
    """
    print('Build model...')
    self.model = Sequential()
    # STFT front-end: raw signal -> power-2 spectrogram, fixed kernel.
    self.model.add(Spectrogram(n_dft=128, n_hop=16,
                               input_shape=(self.x_augmented_rolled.shape[1:]),
                               return_decibel_spectrogram=False,
                               power_spectrogram=2.0,
                               trainable_kernel=False,
                               name='static_stft'))
    self.model.add(Normalization2D(str_axis='freq'))

    # Conv Block 1
    # fixed: `border_mode` is the removed Keras 1 keyword; this block already
    # uses the Keras 2 API (`padding=` in MaxPooling2D), so Conv2D must too.
    self.model.add(Conv2D(filters=24, kernel_size=(12, 12), strides=(1, 1),
                          name='conv1', padding='same'))
    # NOTE(review): axis=1 is the channel axis only under 'channels_first',
    # but pooling below declares 'channels_last' — confirm intended axis.
    self.model.add(BatchNormalization(axis=1))
    self.model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                padding='valid', data_format='channels_last'))
    self.model.add(Activation('relu'))
    self.model.add(Dropout(self.dropout))

    # Conv Block 2
    self.model.add(Conv2D(filters=48, kernel_size=(8, 8), name='conv2',
                          padding='same'))
    self.model.add(BatchNormalization(axis=1))
    self.model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                padding='valid', data_format='channels_last'))
    self.model.add(Activation('relu'))
    self.model.add(Dropout(self.dropout))

    # Conv Block 3
    self.model.add(Conv2D(filters=96, kernel_size=(4, 4), name='conv3',
                          padding='same'))
    self.model.add(BatchNormalization(axis=1))
    self.model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                padding='valid', data_format='channels_last'))
    self.model.add(Activation('relu'))
    self.model.add(Dropout(self.dropout))

    # Classifier head.
    self.model.add(Flatten())
    self.model.add(Dense(self.n_classes))
    self.model.add(Activation('softmax'))
    print(self.model.summary())
    self.saved_model_name = self.MODELNAME
def assemble_model(
    src: np.ndarray,
    arch_layers: list,
    n_dft: int = 128,
    n_hop: int = 64,
    data_format: str = 'channels_first',
) -> keras.Model:
    """Assemble a spectrogram-based CNN with a fixed 368-way sigmoid head.

    Args:
        src: sample input array; its shape defines the model input.
        arch_layers: conv-layer specs (filters, window_size, strides, activation).
        n_dft, n_hop: STFT parameters for the kapre Spectrogram front-end.
        data_format: 'channels_first' or 'channels_last'.

    Returns:
        Uncompiled keras.Model.
    """
    inputs = keras.Input(shape=src.shape, name='stft')
    # @paper: Spectrogram based CNN that receives the (log) spectrogram matrix as input
    # @kapre:
    #   abs(Spectrogram) in a shape of 2D data, i.e.,
    #   `(None, n_channel, n_freq, n_time)` if `'channels_first'`,
    #   `(None, n_freq, n_time, n_channel)` if `'channels_last'`,
    # fixed: calling the Spectrogram layer yields a tensor, so the previous
    # `x: Spectrogram` annotation was wrong and has been removed.
    x = Spectrogram(
        n_dft=n_dft,
        n_hop=n_hop,
        input_shape=src.shape,
        trainable_kernel=True,
        name='static_stft',
        image_data_format=data_format,
        return_decibel_spectrogram=True,
    )(inputs)
    for arch_layer in arch_layers:
        x = keras.layers.Conv2D(
            arch_layer.filters,
            arch_layer.window_size,
            strides=arch_layer.strides,
            activation=arch_layer.activation,
            data_format=data_format,
        )(x)
    # NOTE(review): no Flatten() before the dense head, so Dense applies over
    # the last axis of a 4-D tensor — confirm this is intended.
    # @paper: sigmoid activations with binary cross entropy loss
    # @paper: FC-512
    x = keras.layers.Dense(512)(x)
    # @paper: FC-368(sigmoid)
    outputs = keras.layers.Dense(368, activation='sigmoid',
                                 name='predictions')(x)
    return keras.Model(inputs=inputs, outputs=outputs)
def SqueezeNet(input_tensor=None, input_shape=(1, 44100 * 3), classes=len(classes)):
    """SqueezeNet-style audio classifier on a kapre decibel spectrogram.

    NOTE(review): the default ``classes=len(classes)`` is evaluated once at
    definition time from a module-level ``classes`` collection shadowed by
    this parameter — confirm that is intended.
    NOTE(review): ``input_tensor`` is accepted but never used.
    """
    inputs = Input(shape=input_shape)
    # Raw waveform -> decibel spectrogram, with additive-noise augmentation.
    x = Spectrogram(n_dft=512, return_decibel_spectrogram=True)(inputs)
    x = AdditiveNoise(power=0.3, random_gain=True)(x)
    x = Convolution2D(64, (3, 3), strides=(2, 2), padding='valid',
                      name='conv1')(x)
    x = PReLU(name='prelu_conv1')(x)
    x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool1')(x)
    # first simple bypass
    fire2 = fire_module(x, fire_id=2, squeeze=16, expand=64)
    fire3 = fire_module(fire2, fire_id=3, squeeze=16, expand=64)
    x = add([fire2, fire3])
    x = fire_module(x, fire_id=4, squeeze=32, expand=128)
    # second simple bypass
    maxpool1 = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool3')(x)
    fire5 = fire_module(maxpool1, fire_id=5, squeeze=32, expand=128)
    x = add([maxpool1, fire5])
    # third simple bypass
    fire6 = fire_module(x, fire_id=6, squeeze=48, expand=192)
    fire7 = fire_module(fire6, fire_id=7, squeeze=48, expand=192)
    x = add([fire6, fire7])
    x = fire_module(x, fire_id=8, squeeze=64, expand=256)
    maxpool2 = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool5')(x)
    fire9 = fire_module(maxpool2, fire_id=9, squeeze=64, expand=256)
    x = add([maxpool2, fire9])
    # Classification head: 1x1 conv to `classes` maps, global average pool.
    x = Dropout(0.5, name='drop9')(x)
    x = Convolution2D(classes, (1, 1), padding='valid', name='conv10')(x)
    x = PReLU(name='prelu_conv10')(x)
    x = GlobalAveragePooling2D()(x)
    out = Activation('softmax', name='loss')(x)
    model = Model(inputs, out, name='squeezenet')
    return model
def createCNN(networkType, numberOfChannels, frameSize, spectroWindowSize,
              spectroWindowShift, numberOfClasses):
    """Construct the CNN matching ``networkType`` on a shared kapre
    spectrogram front-end.

    Args:
        networkType: a NetworkType enum member selecting the architecture.
        numberOfChannels, frameSize: raw-input dimensions (channels, samples).
        spectroWindowSize, spectroWindowShift: STFT size and hop.
        numberOfClasses: number of output classes for the selected builder.

    Raises:
        ValueError: for an unrecognized ``networkType``.
    """
    spectrogramLayer = Spectrogram(input_shape=(numberOfChannels, frameSize),
                                   n_dft=spectroWindowSize,
                                   n_hop=spectroWindowShift,
                                   padding='same',
                                   power_spectrogram=1.0,
                                   return_decibel_spectrogram=True)
    if networkType == NetworkType.CNN_PROPOSED_MASTER_THESIS:
        return createProposedNet(spectrogramLayer, numberOfClasses)
    if networkType == NetworkType.CNN_PROPOSED_SMALL:
        return createProposedSmall(spectrogramLayer, numberOfClasses)
    if networkType == NetworkType.CNN_RAW:
        return createRawNet(spectrogramLayer, numberOfClasses)
    if networkType == NetworkType.CNN_SHALLOW or networkType == NetworkType.CNN_DEEP:
        # Placeholder branches in the original: no model is built.
        return True
    raise ValueError("NetworkType not recognized! type: ", networkType)
def _test_mono_valid():
    """Tests for
    - mono input
    - valid padding
    - shapes of output channel, n_freq, n_frame
    - save and load a model with it
    """
    num_ch = 1
    dft_size, hop, n_samples = 512, 256, 8000
    waveform = np.random.uniform(-1.0, 1.0, n_samples)
    channels_last = image_data_format() == 'channels_last'
    net = tensorflow.keras.models.Sequential()
    net.add(
        Spectrogram(
            n_dft=dft_size,
            n_hop=hop,
            padding='valid',
            power_spectrogram=1.0,
            return_decibel_spectrogram=False,
            image_data_format='default',
            input_shape=(n_samples, num_ch) if channels_last else (num_ch, n_samples),
        ))
    batch = net.predict(waveform[np.newaxis, ..., np.newaxis]
                        if channels_last else waveform[np.newaxis, np.newaxis, ...])
    # Which axis carries channel/freq/time depends on the backend layout.
    ch_axis, freq_axis, time_axis = (3, 1, 2) if channels_last else (1, 2, 3)
    assert batch.shape[ch_axis] == num_ch
    assert batch.shape[freq_axis] == dft_size // 2 + 1
    assert batch.shape[time_axis] == _num_frame_valid(n_samples, dft_size, hop)
def MLP_model(input_shape, dropout=0.2, print_summary=False):
    """Build and compile an MLP classifier on top of an STFT front-end.

    Args:
        input_shape: shape of the raw-audio input fed to the Spectrogram.
        dropout: dropout rate for the hidden layers. Fixed: this parameter was
            previously ignored (0.2 was hard-coded while the default read 0.5);
            the default is now 0.2 so default-call behavior is unchanged and
            explicit callers are finally honored.
        print_summary: if True, print the Keras model summary.

    Returns:
        Compiled Sequential model (categorical crossentropy, Adam).
    """
    # basis of the MLP is a Sequential network
    model = Sequential()
    # spectrogram creation using STFT
    model.add(
        Spectrogram(n_dft=128,
                    n_hop=16,
                    input_shape=input_shape,
                    return_decibel_spectrogram=False,
                    power_spectrogram=2.0,
                    trainable_kernel=False,
                    name='static_stft'))
    model.add(Normalization2D(str_axis='freq'))
    model.add(Flatten())
    # NOTE(review): `neurons_per_layer` and `n_hidden_layers` are globals not
    # defined in this view — confirm they exist at module level.
    # (removed a spurious `input_shape=(784,)` kwarg here: Keras ignores
    # `input_shape` on any layer that is not the first in the model)
    model.add(Dense(neurons_per_layer, activation='relu'))
    model.add(Dropout(dropout))
    # custom number of hidden layers
    for _ in range(n_hidden_layers - 1):
        model.add(Dense(neurons_per_layer, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(2))  # two classes only
    model.add(Activation('softmax'))
    if print_summary:
        print(model.summary())
    # compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    # assign model and return
    return model
def get_network(args): x_in = Input( shape=args['shape'] ) # Expected 2D array: (audio_channel, audio_length), TODO flip the dimensions! x = Spectrogram(n_dft=args['n_dft'], n_hop=int(args['n_dft'] / 2))(x_in) for _ in range(args['conv']['n_blocks']): x = ResidualConvBlock(args['conv']['n_layers'], args['conv']['n_filters'], args['conv']['kernel_size'])(x) curr_shape = K.int_shape(x) if (curr_shape[1] > args['pool_size'][0] and curr_shape[2] > args['pool_size'][1]): x = MaxPooling2D(pool_size=args['pool_size'])(x) for _ in range(args['dense']['n_layers']): x = Dense(units=args['dense']['n_units'], use_bias=False)(x) x = BatchNormalization()(x) x = LeakyReLU(alpha=0.3)(x) x_out = Dense(units=args['n_genres'], activation='softmax')(x) model = Model(inputs=x_in, outputs=x_out) return model
def construct_tiny_L3_audio_model():
    """
    Constructs a model that implements a small L3 audio subnetwork

    Returns
    -------
    model:  L3 CNN model
            (Type: keras.models.Model)
    inputs: Model inputs
            (Type: list[keras.layers.Input])
    outputs: Model outputs
            (Type: keras.layers.Layer)
    """
    weight_decay = 1e-5
    ####
    # Audio subnetwork
    ####
    n_dft = 512
    n_win = 480
    n_hop = n_win // 2
    asr = 48000
    audio_window_dur = 1

    def _conv_pool_block(t):
        # One Conv(10, 5x5) -> BN -> ReLU -> MaxPool(3x3, stride 3) stage;
        # the original repeated this block three times verbatim.
        t = Conv2D(10, (5, 5), padding='valid', strides=(1, 1),
                   kernel_initializer='he_normal',
                   kernel_regularizer=regularizers.l2(weight_decay))(t)
        t = BatchNormalization()(t)
        t = Activation('relu')(t)
        return MaxPooling2D(pool_size=(3, 3), strides=3)(t)

    # INPUT: 1 channel, 1 second of audio at 48 kHz.
    x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32')

    # SPECTROGRAM PREPROCESSING
    y_a = Spectrogram(n_dft=n_dft, n_win=n_win, n_hop=n_hop,
                      return_decibel_spectrogram=True,
                      padding='valid')(x_a)

    # Three identical conv/pool stages.
    for _ in range(3):
        y_a = _conv_pool_block(y_a)

    y_a = Flatten(name='embedding')(y_a)
    m = Model(inputs=x_a, outputs=y_a)
    # NOTE(review): assigning `.name` directly works in older Keras; in TF2
    # `Model.name` is read-only — confirm the Keras version in use.
    m.name = 'audio_model'
    return m, x_a, y_a
# Imports, grouped: keras layers/models, kapre, framework, sklearn.
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Conv2D  # fixed: was used below but never imported
from keras.models import Sequential  # fixed: was used below but never imported
from kapre.time_frequency import Spectrogram
from kapre.utils import Normalization2D
from kapre.augmentation import AdditiveNoise
import keras
from keras import optimizers
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# NOTE(review): `input_shape` is not defined anywhere in this view — confirm
# it is set before this script runs.
classifier = Sequential()
classifier.add(Spectrogram(n_dft=512, n_hop=256, padding='same',
                           input_shape=input_shape,
                           power_spectrogram=2.0,
                           return_decibel_spectrogram=False,
                           trainable_kernel=False,
                           image_data_format='default'))
classifier.add(AdditiveNoise(power=0.2))
classifier.add(Normalization2D(str_axis='freq'))

# Layer 1
classifier.add(Conv2D(24, (1, 1), input_shape=(7192, 11, 1000),
                      activation='relu'))
# fixed: BatchNormalization/Dropout were bare statements — the layers were
# constructed and immediately discarded, never added to the model. They are
# now added via classifier.add(); all keyword values are the Keras defaults.
classifier.add(keras.layers.BatchNormalization(axis=-1, momentum=0.99,
                                               epsilon=0.001))
classifier.add(MaxPooling2D(pool_size=(2, 2)))
classifier.add(Dense(units=128, activation='relu'))
classifier.add(keras.layers.Dropout(0.5))

# Layer 2
classifier.add(Conv2D(48, (1, 1), input_shape=(32, 32, 24),
                      activation='relu'))
classifier.add(keras.layers.BatchNormalization(axis=-1, momentum=0.99,
                                               epsilon=0.001))
classifier.add(MaxPooling2D(pool_size=(2, 2)))
classifier.add(Dense(units=128, activation='relu'))
classifier.add(keras.layers.Dropout(0.5))
def model_1(win_length, filters, kernel_size_1, learning_rate, batch):
    """Build and compile the context-windowed waveform model (variant 1).

    Input: (batch, 2*kContext+1 frames, win_length samples, 1). Outputs a
    decibel spectrogram ('spec') and the processed waveform ('waveform'),
    trained with weighted MSE + MAE_preEmphasis losses.
    NOTE(review): relies on module-level kSR and kContext — confirm defined.
    """
    # Derived sizes; kN and ini2 are computed but unused below.
    kPs = int((win_length * 2000 / kSR))
    kN = int(win_length)
    ini1 = tf.initializers.random_uniform(minval=-1, maxval=1)
    ini2 = tf.initializers.random_uniform(minval=0, maxval=1)
    x = Input(shape=(kContext * 2 + 1, win_length, 1),
              name='input',
              batch_shape=(batch, kContext * 2 + 1, win_length, 1))
    # Shared layers (declared once, applied per frame via TimeDistributed).
    conv = Conv1D(filters, kernel_size_1, strides=1, padding='same',
                  kernel_initializer='lecun_uniform',
                  input_shape=(win_length, 1))
    activation_abs = Activation(K.abs)
    activation_sp = Activation('softplus')
    max_pooling = MaxPooling1D(pool_size=win_length // 64)
    conv_smoothing = Conv1D_local(filters, kernel_size_1 * 2, strides=1,
                                  padding='same',
                                  kernel_initializer='lecun_uniform')
    dense_sgn = Dense(kPs, activation='tanh', kernel_initializer=ini1,
                      name='dense_l_sgn')
    dense_idx = Dense(kPs, activation='sigmoid', name='dense_l_idx')
    bi_rnn = Bidirectional(LSTM(filters * 2, activation='tanh',
                                stateful=False, return_sequences=True,
                                dropout=0.1, recurrent_dropout=0.1),
                           merge_mode='concat', name='birnn_in')
    bi_rnn1 = Bidirectional(LSTM(filters, activation='tanh',
                                 stateful=False, return_sequences=True,
                                 dropout=0.1, recurrent_dropout=0.1),
                            merge_mode='concat', name='birnn_1')
    bi_rnn2 = Bidirectional(LSTM(filters // 2, activation='linear',
                                 stateful=False, return_sequences=True,
                                 dropout=0.1, recurrent_dropout=0.1),
                            merge_mode='concat', name='birnn_2')
    bi_rnn3 = Bidirectional(LSTM(filters // 2, activation='linear',
                                 stateful=False, return_sequences=True,
                                 dropout=0.1, recurrent_dropout=0.1),
                            merge_mode='concat', name='birnn_3')
    convTensors = Conv1D_localTensor(filters, win_length, batch, strides=1,
                                     padding='same', name='convTensors')
    deconv = Conv1D_tied(1, kernel_size_1, conv, padding='same',
                         name='deconv')
    velvet = VelvetNoise(kPs, batch, input_dim=filters,
                         input_length=win_length, name='velvet')
    # Front-end: conv per frame, magnitude, smoothed envelope M.
    X = TimeDistributed(conv, name='conv')(x)
    X_abs = TimeDistributed(activation_abs, name='conv_activation')(X)
    M = TimeDistributed(conv_smoothing, name='conv_smoothing')(X_abs)
    M = TimeDistributed(activation_sp, name='conv_smoothing_activation')(M)
    P = X  # keep the raw conv output ("phase") for later recombination
    # Pool the envelope, flatten the context frames, run the RNN stack.
    Z = TimeDistributed(max_pooling, name='max_pooling')(M)
    Z = Lambda(lambda inputs: tf.unstack(
        inputs, num=kContext * 2 + 1, axis=1, name='unstack2'))(Z)
    Z = Concatenate(name='concatenate')(Z)
    Z = bi_rnn(Z)
    # Branch 1: sign/index predictions for the velvet-noise generator.
    Z1 = bi_rnn1(Z)
    Z1 = bi_rnn2(Z1)
    Z1 = SAAF(break_points=25, break_range=0.2, magnitude=100, order=2,
              tied_feamap=True, kernel_initializer='random_normal',
              name='saaf_1')(Z1)
    # Branch 2: envelope for the phase/unpooling multiplication.
    Z2 = bi_rnn3(Z)
    Z2 = SAAF(break_points=25, break_range=0.2, magnitude=100, order=2,
              tied_feamap=True, kernel_initializer='random_normal',
              name='saaf_2')(Z2)
    Z1 = Lambda((toPermuteDimensions), name='perm_1')(Z1)
    sgn = dense_sgn(Z1)
    idx = dense_idx(Z1)
    sgn = Lambda((toPermuteDimensions), name='perm_2')(sgn)
    idx = Lambda((toPermuteDimensions), name='perm_3')(idx)
    P = Lambda(lambda inputs: tf.unstack(
        inputs, num=kContext * 2 + 1, axis=1, name='unstack'))(P)
    V = Concatenate(name='concatenate2', axis=-1)([sgn, idx])
    V = velvet(V)
    # Combine the center frame's features with the velvet-noise signal.
    Y = Concatenate(name='concatenate3')([P[kContext], V])
    Y = convTensors(Y)
    Y = SAAF(break_points=25, break_range=0.2, magnitude=100, order=2,
             tied_feamap=True, kernel_initializer='random_normal',
             name='saaf_out_conv')(Y)
    # Naive unpooling of the branch-2 envelope, then gate Y with it.
    M_ = UpSampling1D(size=win_length // 64, name='up_sampling_naive')(Z2)
    Y = Multiply(name='phase_unpool_multiplication')([Y, M_])
    # Residual dense stack with SAAF output nonlinearity.
    Y_ = Dense(filters, activation='tanh', name='dense_in')(Y)
    Y_ = Dense(filters // 2, activation='tanh', name='dense_h1')(Y_)
    Y_ = Dense(filters // 2, activation='tanh', name='dense_h2')(Y_)
    Y_ = Dense(filters, activation='linear', name='dense_out')(Y_)
    Y_ = SAAF(break_points=25, break_range=0.2, magnitude=100, order=2,
              tied_feamap=True, kernel_initializer='random_normal',
              name='saaf_out')(Y_)
    # Squeeze-excitation on both paths, then residual sum.
    Y = se_block_lstm(Y, filters, weight_decay=0., amplifying_ratio=16, idx=1)
    Y_ = se_block_lstm(Y_, filters, weight_decay=0., amplifying_ratio=16, idx=2)
    Y = Add(name='addition')([Y, Y_])
    # Back to one channel with the tied deconvolution, then window.
    Y = deconv(Y)
    Y = Lambda((Window), name='waveform')(Y)
    # Auxiliary spectrogram output used as a spectral loss term.
    loss_output = Spectrogram(n_dft=win_length, n_hop=win_length,
                              input_shape=(1, win_length),
                              return_decibel_spectrogram=True,
                              power_spectrogram=2.0,
                              trainable_kernel=False, name='spec')
    spec = Lambda((toPermuteDimensions), name='perm_spec')(Y)
    spec = loss_output(spec)
    model = Model(inputs=[x], outputs=[spec, Y])
    model.compile(loss={
        'spec': 'mse',
        'waveform': MAE_preEmphasis
    },
                  loss_weights={
                      'spec': 0.0001,
                      'waveform': 1.0
                  },
                  optimizer=Adam(lr=learning_rate))
    return model
def model_2(win_length, filters, kernel_size_1, learning_rate):
    """Build and compile the context-windowed waveform model (variant 2).

    Simpler than model_1: no velvet-noise branch; the pooled RNN envelope is
    unpooled and multiplied with the center frame's conv features. Outputs a
    decibel spectrogram ('spec') and the processed waveform ('waveform').
    """
    kContext = 4  # past and subsequent frames
    x = Input(shape=(kContext * 2 + 1, win_length, 1), name='input')
    # Shared layers (declared once, applied per frame via TimeDistributed).
    conv = Conv1D(filters, kernel_size_1, strides=1, padding='same',
                  kernel_initializer='lecun_uniform',
                  input_shape=(win_length, 1))
    activation_abs = Activation(K.abs)
    activation_sp = Activation('softplus')
    max_pooling = MaxPooling1D(pool_size=win_length // 64)
    conv_smoothing = Conv1D_local(filters, kernel_size_1 * 2, strides=1,
                                  padding='same',
                                  kernel_initializer='lecun_uniform')
    bi_rnn = Bidirectional(LSTM(filters * 2, activation='tanh',
                                stateful=False, return_sequences=True,
                                dropout=0.1, recurrent_dropout=0.1),
                           merge_mode='concat', name='birnn_in')
    bi_rnn1 = Bidirectional(LSTM(filters, activation='tanh',
                                 stateful=False, return_sequences=True,
                                 dropout=0.1, recurrent_dropout=0.1),
                            merge_mode='concat', name='birnn_1')
    bi_rnn2 = Bidirectional(LSTM(filters // 2, activation='linear',
                                 stateful=False, return_sequences=True,
                                 dropout=0.1, recurrent_dropout=0.1),
                            merge_mode='concat', name='birnn_2')
    deconv = Conv1D_tied(1, kernel_size_1, conv, padding='same',
                         name='deconv')
    # Front-end: conv per frame, magnitude, smoothed envelope M.
    X = TimeDistributed(conv, name='conv')(x)
    X_abs = TimeDistributed(activation_abs, name='conv_activation')(X)
    M = TimeDistributed(conv_smoothing, name='conv_smoothing')(X_abs)
    M = TimeDistributed(activation_sp, name='conv_smoothing_activation')(M)
    P = X  # keep the raw conv output ("phase") for later recombination
    # Pool the envelope, flatten the context frames, run the RNN stack.
    Z = TimeDistributed(max_pooling, name='max_pooling')(M)
    Z = Lambda(lambda inputs: tf.unstack(
        inputs, num=kContext * 2 + 1, axis=1, name='unstack2'))(Z)
    Z = Concatenate(name='concatenate')(Z)
    Z = bi_rnn(Z)
    Z = bi_rnn1(Z)
    Z = bi_rnn2(Z)
    Z = SAAF(break_points=25, break_range=0.2, magnitude=100, order=2,
             tied_feamap=True, kernel_initializer='random_normal',
             name='saaf_1')(Z)
    # Naive unpooling of the envelope back to win_length resolution.
    M_ = UpSampling1D(size=win_length // 64, name='up_sampling_naive')(Z)
    P = Lambda(lambda inputs: tf.unstack(
        inputs, num=kContext * 2 + 1, axis=1, name='unstack'))(P)
    # Gate the center frame's conv features with the unpooled envelope.
    Y = Multiply(name='phase_unpool_multiplication')([P[kContext], M_])
    # Residual dense stack with SAAF output nonlinearity.
    Y_ = Dense(filters, activation='tanh', name='dense_in')(Y)
    Y_ = Dense(filters // 2, activation='tanh', name='dense_h1')(Y_)
    Y_ = Dense(filters // 2, activation='tanh', name='dense_h2')(Y_)
    Y_ = Dense(filters, activation='linear', name='dense_out')(Y_)
    Y_ = SAAF(break_points=25, break_range=0.2, magnitude=100, order=2,
              tied_feamap=True, kernel_initializer='random_normal',
              name='saaf_out')(Y_)
    Y_ = se_block(Y_, filters, weight_decay=0., amplifying_ratio=16, idx=1)
    Y = Add(name='addition')([Y, Y_])
    # Back to one channel with the tied deconvolution, then window.
    Y = deconv(Y)
    Y = Lambda((Window), name='waveform')(Y)
    # Auxiliary spectrogram output used as a spectral loss term.
    loss_output = Spectrogram(n_dft=win_length, n_hop=win_length,
                              input_shape=(1, win_length),
                              return_decibel_spectrogram=True,
                              power_spectrogram=2.0,
                              trainable_kernel=False, name='spec')
    spec = Lambda((toPermuteDimensions), name='perm_spec')(Y)
    spec = loss_output(spec)
    model = Model(inputs=[x], outputs=[spec, Y])
    model.compile(loss={
        'spec': 'mse',
        'waveform': MAE_preEmphasis
    },
                  loss_weights={
                      'spec': 0.0001,
                      'waveform': 1.0
                  },
                  optimizer=Adam(lr=learning_rate))
    return model
def add_mel_to_VGGish(content_weights_file_path_og, input_length, sr_hr, n_mels, hoplength, nfft, fmin, fmax, power_melgram): NUM_FRAMES = 96 # Frames in input mel-spectrogram patch. NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. EMBEDDING_SIZE = 128 # Size of embedding layer. pooling = 'avg' X = Input(shape=(NUM_FRAMES, NUM_BANDS, 1), name='nob') x = X x = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv1')(x) x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(x) # Block 2 x = Conv2D(128, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv2')(x) x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(x) # Block 3 x = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv3/conv3_1')(x) x = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv3/conv3_2')(x) x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(x) # Block 4 x = Conv2D(512, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv4/conv4_1')(x) x = Conv2D(512, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv4/conv4_2')(x) x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(x) if pooling == 'avg': x = GlobalAveragePooling2D()(x) elif pooling == 'max': x = GlobalMaxPooling2D()(x) model = Model(inputs=X, outputs=x) model.load_weights(content_weights_file_path_og) X = Input(shape=(1, input_length), name='input_1') x = X x = Spectrogram(n_dft=nfft, n_hop=hoplength, padding='same', return_decibel_spectrogram=True, trainable_kernel=False, name='stft')(x) x = Normalization2D(str_axis='freq')(x) no_input_layers = model.layers[1:] for layer in no_input_layers: x = layer(x) return Model(inputs=X, outputs=x)
def create_VGGish(input_length, sr_hr, n_mels, hoplength, nfft, fmin, fmax,
                  power_melgram, pooling='avg'):
    """Build a VGGish-style conv stack on a kapre STFT front-end.

    Returns the (input_tensor, output_tensor) pair rather than a Model, so
    the caller can extend or wrap the graph.

    NOTE(review): sr_hr, n_mels, fmin, fmax and power_melgram are accepted
    but never used in this body — confirm whether a Melspectrogram front-end
    was intended.
    """
    X = Input(shape=(input_length, 1), name='input_1')
    x = X
    # kapre expects (channel, time): reshape from (time, 1) to (1, time).
    x = Reshape((1, input_length))(x)
    x = Spectrogram(n_dft=nfft, n_hop=hoplength, padding='same',
                    return_decibel_spectrogram=True, trainable_kernel=False,
                    name='stft')(x)
    x = Normalization2D(str_axis='freq')(x)
    # Block 1
    x = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='same',
               name='conv1')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(x)
    # Block 2
    x = Conv2D(128, (3, 3), strides=(1, 1), activation='relu', padding='same',
               name='conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(x)
    # Block 3
    x = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='same',
               name='conv3/conv3_1')(x)
    x = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='same',
               name='conv3/conv3_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(x)
    # Block 4
    x = Conv2D(512, (3, 3), strides=(1, 1), activation='relu', padding='same',
               name='conv4/conv4_1')(x)
    x = Conv2D(512, (3, 3), strides=(1, 1), activation='relu', padding='same',
               name='conv4/conv4_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(x)
    # Global pooling head ('avg' or 'max'; anything else leaves a 4-D tensor).
    if pooling == 'avg':
        x = GlobalAveragePooling2D()(x)
    elif pooling == 'max':
        x = GlobalMaxPooling2D()(x)
    return X, x
seconds = sampling_rate * 5 folder = '/data/p253591/youtube_classification/data2/' with open(f'{folder}/test_data.p3', 'rb') as f: X_test, y_test = pickle.load(f) # normalize spectogram output slope = K.variable(value=1 / 40) intercept = K.variable(value=1) spectogram_model = Sequential() spectogram_model.add( Spectrogram(n_dft=512, n_hop=256, input_shape=(1, seconds), return_decibel_spectrogram=True, power_spectrogram=2.0, trainable_kernel=False, name='static_stft')) spectogram_model.add(Lambda(lambda x: slope * x + intercept)) for sample in numpy.random.permutation(len(X_test))[:4]: y_out = spectogram_model.predict(X_test[sample:sample + 1]) im = (y_out[0] + 1) / 2.0 im = im * [[[1, 0, 0]]] + (1 - im) * [[[0, 0, 1]]] imsave(f'spectrogram-{sample}-label-{y_test[sample]}.png', im) model = load_model(f'{folder}/model-2020-01-21-epoch-21.hd5', custom_objects={ 'Spectrogram': Spectrogram,
def construct_cnn_L3_orig_audio_model():
    """
    Constructs a model that replicates the audio subnetwork  used in Look,
    Listen and Learn

    Relja Arandjelovic and (2017). Look, Listen and Learn. CoRR, abs/1705.08168, .

    Returns
    -------
    model:  L3 CNN model
            (Type: keras.models.Model)
    inputs: Model inputs
            (Type: list[keras.layers.Input])
    outputs: Model outputs
            (Type: keras.layers.Layer)
    """
    weight_decay = 1e-5
    ####
    # Audio subnetwork
    ####
    n_dft = 512
    #n_win = 480
    #n_hop = n_win//2
    n_hop = 242
    asr = 48000
    audio_window_dur = 1
    # INPUT: 1 channel, 1 second of audio at 48 kHz.
    x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32')

    # SPECTROGRAM PREPROCESSING
    # 257 x 199 x 1
    y_a = Spectrogram(
        n_dft=n_dft,
        n_hop=n_hop,
        power_spectrogram=1.0,  # n_win=n_win,
        return_decibel_spectrogram=False,
        padding='valid')(x_a)

    # Apply normalization from L3 paper
    y_a = Lambda(lambda x: tf.log(tf.maximum(x, 1e-12)) / 5.0)(y_a)

    # CONV BLOCK 1
    n_filter_a_1 = 64
    filt_size_a_1 = (3, 3)
    pool_size_a_1 = (2, 2)
    y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=regularizers.l2(weight_decay))(y_a)
    y_a = BatchNormalization()(y_a)
    y_a = Activation('relu')(y_a)
    y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=regularizers.l2(weight_decay))(y_a)
    y_a = BatchNormalization()(y_a)
    y_a = Activation('relu')(y_a)
    y_a = MaxPooling2D(pool_size=pool_size_a_1, strides=2)(y_a)

    # CONV BLOCK 2
    n_filter_a_2 = 128
    filt_size_a_2 = (3, 3)
    pool_size_a_2 = (2, 2)
    y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=regularizers.l2(weight_decay))(y_a)
    y_a = BatchNormalization()(y_a)
    y_a = Activation('relu')(y_a)
    y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=regularizers.l2(weight_decay))(y_a)
    y_a = BatchNormalization()(y_a)
    y_a = Activation('relu')(y_a)
    y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2)(y_a)

    # CONV BLOCK 3
    n_filter_a_3 = 256
    filt_size_a_3 = (3, 3)
    pool_size_a_3 = (2, 2)
    y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=regularizers.l2(weight_decay))(y_a)
    y_a = BatchNormalization()(y_a)
    y_a = Activation('relu')(y_a)
    y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=regularizers.l2(weight_decay))(y_a)
    y_a = BatchNormalization()(y_a)
    y_a = Activation('relu')(y_a)
    y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2)(y_a)

    # CONV BLOCK 4 (second conv is the named embedding layer; large final pool)
    n_filter_a_4 = 512
    filt_size_a_4 = (3, 3)
    pool_size_a_4 = (32, 24)
    y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same',
                 kernel_initializer='he_normal',
                 kernel_regularizer=regularizers.l2(weight_decay))(y_a)
    y_a = BatchNormalization()(y_a)
    y_a = Activation('relu')(y_a)
    y_a = Conv2D(n_filter_a_4, filt_size_a_4,
                 kernel_initializer='he_normal',
                 name='audio_embedding_layer', padding='same',
                 kernel_regularizer=regularizers.l2(weight_decay))(y_a)
    y_a = BatchNormalization()(y_a)
    y_a = Activation('relu')(y_a)
    y_a = MaxPooling2D(pool_size=pool_size_a_4)(y_a)
    y_a = Flatten()(y_a)
    m = Model(inputs=x_a, outputs=y_a)
    # NOTE(review): assigning `.name` directly works in older Keras; in TF2
    # `Model.name` is read-only — confirm the Keras version in use.
    m.name = 'audio_model'
    return m, x_a, y_a
def CNN_model(input_shape, dropout=0.5, print_summary=False):
    """Build and compile a CNN classifier on top of an STFT spectrogram.

    Parameters
    ----------
    input_shape : tuple
        Shape of one raw input sample as expected by kapre's Spectrogram
        layer (presumably (n_channels, n_samples) — TODO confirm).
    dropout : float
        Dropout rate applied after the last conv block.
    print_summary : bool
        If True, print the Keras model summary.

    Returns
    -------
    model : keras.models.Sequential
        Compiled model (categorical cross-entropy, Adam, accuracy metric)
        ending in a 2-class softmax.
    """
    # basis of the CNN_STFT is a Sequential network
    model = Sequential()

    # spectrogram creation using STFT (power spectrogram, non-trainable kernel)
    model.add(
        Spectrogram(n_dft=128, n_hop=16, input_shape=input_shape,
                    return_decibel_spectrogram=False, power_spectrogram=2.0,
                    trainable_kernel=False, name='static_stft'))
    model.add(Normalization2D(str_axis='freq'))

    # FIX: `border_mode` is the Keras 1 spelling; Keras 2's Conv2D — which the
    # `filters=`/`kernel_size=` argument names already imply — only accepts
    # `padding=`, so the original kwarg is rejected as unknown.
    # NOTE(review): BatchNormalization(axis=1) normalizes over axis 1 while
    # the pooling layers declare data_format='channels_last' (channel axis
    # -1) — confirm which channel axis is intended.

    # Conv Block 1
    model.add(Conv2D(filters=24, kernel_size=(12, 12), strides=(1, 1),
                     name='conv1', padding='same'))
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                           padding='valid', data_format='channels_last'))

    # Conv Block 2
    model.add(Conv2D(filters=48, kernel_size=(8, 8),
                     name='conv2', padding='same'))
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                           padding='valid', data_format='channels_last'))

    # Conv Block 3
    model.add(Conv2D(filters=96, kernel_size=(4, 4),
                     name='conv3', padding='same'))
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                           padding='valid', data_format='channels_last'))
    model.add(Dropout(dropout))

    # classificator
    model.add(Flatten())
    model.add(Dense(2))  # two classes only
    model.add(Activation('softmax'))

    if print_summary:
        print(model.summary())

    # compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # assign model and return
    return model
def _construct_linear_audio_network():
    """
    Returns an uninitialized model object for a network with a linear
    spectrogram input (with 257 frequency bins).

    Returns
    -------
    model : keras.models.Model
        Model object.
    """
    l2_reg = 1e-5
    sample_rate = 48000
    window_dur = 1

    def conv_bn_relu(t, n_filters, **conv_kwargs):
        # One 3x3 conv -> batch-norm -> ReLU unit with the shared
        # He-normal / L2-regularized configuration.
        t = Conv2D(n_filters, (3, 3), padding='same',
                   kernel_initializer='he_normal',
                   kernel_regularizer=regularizers.l2(l2_reg),
                   **conv_kwargs)(t)
        t = BatchNormalization()(t)
        return Activation('relu')(t)

    # INPUT: one channel of raw audio samples
    x_a = Input(shape=(1, sample_rate * window_dur), dtype='float32')

    # SPECTROGRAM PREPROCESSING: 257 x 199 x 1 decibel spectrogram,
    # batch-normalized before the conv stack.
    y_a = Spectrogram(n_dft=512, n_hop=242, power_spectrogram=1.0,
                      return_decibel_spectrogram=True,
                      padding='valid')(x_a)
    y_a = BatchNormalization()(y_a)

    # Conv blocks 1-3: two conv units each, then 2x2 max-pooling.
    for n_filters in (64, 128, 256):
        y_a = conv_bn_relu(y_a, n_filters)
        y_a = conv_bn_relu(y_a, n_filters)
        y_a = MaxPooling2D(pool_size=(2, 2), strides=2)(y_a)

    # Conv block 4: one full conv unit, then the bare (no BN/activation)
    # named embedding convolution that ends the network.
    y_a = conv_bn_relu(y_a, 512)
    y_a = Conv2D(512, (3, 3), padding='same',
                 kernel_initializer='he_normal',
                 name='audio_embedding_layer',
                 kernel_regularizer=regularizers.l2(l2_reg))(y_a)

    return Model(inputs=x_a, outputs=y_a)
def raw_vgg(args, input_length=12000 * 29, tf='melgram', normalize=None,
            decibel=False, last_layer=True, sr=None):
    '''Build a VGG-style Sequential model on an STFT or melgram front end.

    when length = 12000*29 and 512/256 dft/hop, melgram size: (n_mels, 1360)

    Parameters
    ----------
    args : namespace
        Must provide conv_until and trainable_kernel; for tf='melgram'
        also fmin, fmax, n_mels and trainable_fb.
    input_length : int
        Number of raw audio samples per example.
    tf : str
        Time-frequency representation: 'stft' or 'melgram'.
    normalize : str or None
        Axis name for Normalization2D, or a no-op value
        (None/False/'no'/0/0.0).
    decibel : bool
        Whether the front end returns decibel-scaled output.
    last_layer : bool
        If True, append the final Dense(50, sigmoid) layer.
    sr : int or None
        Sample rate; defaults to module-level SR (assumes 12000).

    Returns
    -------
    model : keras.models.Sequential
    '''
    assert tf in ('stft', 'melgram')
    assert normalize in (None, False, 'no', 0, 0.0, 'batch', 'data_sample',
                         'time', 'freq', 'channel')
    assert isinstance(decibel, bool)
    if sr is None:
        sr = SR  # assumes 12000
    conv_until = args.conv_until  # for intermediate layer outputting.
    trainable_kernel = args.trainable_kernel
    model = Sequential()
    if tf == 'stft':
        # decode args
        model.add(
            Spectrogram(n_dft=512, n_hop=256, power_spectrogram=2.0,
                        trainable_kernel=trainable_kernel,
                        return_decibel_spectrogram=decibel,
                        input_shape=(1, input_length)))
        poolings = [(2, 4), (4, 4), (4, 5), (2, 4), (4, 4)]
    elif tf == 'melgram':
        # decode args
        fmin = args.fmin
        fmax = args.fmax
        if fmax == 0.0:
            fmax = sr / 2
        n_mels = args.n_mels
        trainable_fb = args.trainable_fb
        model.add(
            Melspectrogram(n_dft=512, n_hop=256, power_melgram=2.0,
                           input_shape=(1, input_length),
                           trainable_kernel=trainable_kernel,
                           trainable_fb=trainable_fb,
                           return_decibel_melgram=decibel,
                           sr=sr, n_mels=n_mels,
                           fmin=fmin, fmax=fmax,
                           name='melgram'))
        # Pooling ladder chosen so the five (freq, time) pools fit the
        # mel-bin count.
        # FIX: the original chain had two identical `elif n_mels >= 18`
        # branches; the second was unreachable dead code and is removed.
        if n_mels >= 256:
            poolings = [(2, 4), (4, 4), (4, 5), (2, 4), (4, 4)]
        elif n_mels >= 128:
            poolings = [(2, 4), (4, 4), (2, 5), (2, 4), (4, 4)]
        elif n_mels >= 96:
            poolings = [(2, 4), (3, 4), (2, 5), (2, 4), (4, 4)]
        elif n_mels >= 72:
            poolings = [(2, 4), (3, 4), (2, 5), (2, 4), (3, 4)]
        elif n_mels >= 64:
            poolings = [(2, 4), (2, 4), (2, 5), (2, 4), (4, 4)]
        elif n_mels >= 48:
            poolings = [(2, 4), (2, 4), (2, 5), (2, 4), (3, 4)]
        elif n_mels >= 32:
            poolings = [(2, 4), (2, 4), (2, 5), (2, 4), (2, 4)]
        elif n_mels >= 24:
            poolings = [(2, 4), (2, 4), (2, 5), (3, 4), (1, 4)]
        elif n_mels >= 18:
            poolings = [(2, 4), (1, 4), (3, 5), (1, 4), (3, 4)]
        elif n_mels >= 16:
            poolings = [(2, 4), (2, 4), (2, 5), (2, 4), (1, 4)]
        elif n_mels >= 12:
            poolings = [(2, 4), (1, 4), (2, 5), (3, 4), (1, 4)]
        elif n_mels >= 8:
            poolings = [(2, 4), (1, 4), (2, 5), (2, 4), (1, 4)]
        elif n_mels >= 6:
            poolings = [(2, 4), (1, 4), (3, 5), (1, 4), (1, 4)]
        elif n_mels >= 4:
            poolings = [(2, 4), (1, 4), (2, 5), (1, 4), (1, 4)]
        elif n_mels >= 2:
            poolings = [(2, 4), (1, 4), (1, 5), (1, 4), (1, 4)]
        else:  # n_mels == 1
            poolings = [(1, 4), (1, 4), (1, 5), (1, 4), (1, 4)]
    else:
        # Unreachable given the assert above, kept as a defensive guard.
        raise RuntimeError('choose between stft or melgram, not %s' %
                           str(tf))

    if normalize in ('batch', 'data_sample', 'time', 'freq', 'channel'):
        model.add(Normalization2D(normalize))

    # FIX: renamed from `args` — the original reassigned the `args`
    # namespace parameter to this list, shadowing it confusingly.
    conv_args = [
        5, [32, 32, 32, 32, 32], 1.0,
        [(3, 3), (3, 3), (3, 3), (3, 3), (3, 3)],
        poolings, 0.0, model.output_shape[1:]
    ]
    model.add(
        get_convBNeluMPdrop(*conv_args, num_nin_layers=1,
                            conv_until=conv_until))
    if conv_until != 4:
        model.add(GlobalAveragePooling2D())
    else:
        model.add(Flatten())
    if last_layer:
        model.add(Dense(50, activation='sigmoid'))
    return model