def model_cnn_spec(timewindow, nfft, hop_length=4):
    """Build a basic CNN classifier on top of a spectrogram front end.

    The input of shape ``(timewindow, 3)`` is passed through an STFT layer,
    converted to magnitude, then to decibels, and rescaled with
    ``MaxABSScaler``. Three Conv2D -> BatchNormalization -> ReLU ->
    MaxPooling2D stages (32, 64 and 128 filters) follow. The head is
    Flatten -> Dropout -> Dense(80) -> Dropout -> Dense(3, softmax).

    :timewindow: length of the first axis of the input
    :nfft: ``n_fft`` value handed to the STFT layer
    :hop_length: ``hop_length`` value handed to the STFT layer
    :returns: keras model object
    """
    inputs = keras.Input(shape=(timewindow, 3))

    # Spectrogram front end.
    spec = STFT(
        n_fft=nfft,
        window_name=None,
        pad_end=False,
        hop_length=hop_length,
        input_data_format='channels_last',
        output_data_format='channels_last',
    )(inputs)
    spec = Magnitude()(spec)
    spec = MagnitudeToDecibel()(spec)
    features = MaxABSScaler()(spec)
    # NOTE: an optional Lambda resize of the spectrogram to (60, 60) was
    # tried at this point and is currently disabled.

    # Convolutional stages: identical except for the number of filters.
    for n_filters in (32, 64, 128):
        features = tf.keras.layers.Conv2D(
            filters=n_filters, kernel_size=(3, 3), padding="same"
        )(features)
        features = tf.keras.layers.BatchNormalization()(features)
        features = tf.keras.activations.relu(features)
        features = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(features)

    # Classification head.
    head = tf.keras.layers.Flatten()(features)
    head = layers.Dropout(0.5)(head)
    head = tf.keras.layers.Dense(80, activation="relu")(head)
    head = layers.Dropout(0.5)(head)
    outputs = tf.keras.layers.Dense(3, activation="softmax")(head)

    return tf.keras.Model(inputs=inputs, outputs=outputs)
def test_spectrogram_correctness_more(data_format, window_name):
    """Check the STFT, Magnitude and Phase layers against a librosa reference.

    The reference STFT is computed with librosa on the mono source, tiled to
    ``n_ch`` channels and transposed when ``data_format`` is
    ``'channels_first'``. The layer outputs for the first batch item are
    then compared with it.
    """
    n_fft = 512
    hop_length = 256
    n_ch = 2
    win_length = n_fft  # window spans the whole FFT frame
    src_mono, batch_src, input_shape = get_audio(data_format=data_format, n_ch=n_ch)

    def _build_model(extra_layer=None):
        """Return a Sequential model: STFT layer plus an optional extra layer."""
        model = tensorflow.keras.models.Sequential()
        model.add(
            STFT(
                n_fft=n_fft,
                win_length=win_length,
                hop_length=hop_length,
                window_name=window_name,
                pad_end=False,
                input_data_format=data_format,
                output_data_format=data_format,
                input_shape=input_shape,
                name='stft',
            )
        )
        if extra_layer is not None:
            model.add(extra_layer)
        return model

    # Reference computed with librosa; it expects the bare window name.
    if window_name:
        librosa_window = window_name.replace('_window', '')
    else:
        librosa_window = 'hann'
    reference = librosa.core.stft(
        src_mono,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        center=False,
        window=librosa_window,
    ).T  # (time, freq)
    reference = np.expand_dims(reference, axis=2)  # (time, freq, ch=1)
    reference = np.tile(reference, [1, 1, n_ch])  # (time, freq, ch=n_ch)
    if data_format == 'channels_first':
        reference = np.transpose(reference, (2, 0, 1))  # (ch, time, freq)

    # STFT(): complex output of the first batch item.
    complex_out = _build_model().predict(batch_src)[0]
    allclose_complex_numbers(reference, complex_out)

    # Magnitude()
    magnitude_out = _build_model(Magnitude()).predict(batch_src)[0]
    np.testing.assert_allclose(np.abs(reference), magnitude_out, atol=2e-4)

    # Phase()
    phase_out = _build_model(Phase()).predict(batch_src)[0]
    allclose_phase(np.angle(complex_out), phase_out)
def __init__(
    self,
    sample_rate=16000,
    n_fft=512,
    win_length=None,
    hop_length=None,
    pad=0,
    power=2,
    normalized=False,
    n_harmonic=6,
    semitone_scale=2,
    bw_Q=1.0,
    learn_bw=None,
):
    """Set up the STFT front end and the harmonic filterbank parameters.

    :sample_rate: sampling rate; used for the FFT bin grid and the filterbank
    :n_fft: FFT size for the STFT layer; also fixes the number of FFT bins
    :win_length: window length forwarded to the STFT layer
    :hop_length: stored on the instance only; the STFT layer is built with
        a hop of ``n_fft // 2`` regardless of this value
    :pad, power, normalized: accepted but not used by this constructor
    :n_harmonic: number of harmonics passed to ``initialize_filterbank``
    :semitone_scale: semitone scale passed to ``initialize_filterbank``
    :bw_Q: initial value of the bandwidth Q factor
    :learn_bw: ``'only_Q'`` makes ``bw_Q`` a trainable variable, ``'fix'``
        makes it a constant; for any other value (including the default
        ``None``) ``self.bw_Q`` is not set here
    """
    super(HarmonicSTFT, self).__init__()

    # Parameters
    self.sample_rate = sample_rate
    self.n_harmonic = n_harmonic
    self.bw_alpha = 0.1079
    self.bw_beta = 24.7
    self.win_length = win_length
    self.hop_length = hop_length
    self.n_fft = n_fft
    self.fft_bins = tf.linspace(0, self.sample_rate // 2, self.n_fft // 2 + 1)

    # Spectrogram
    # NOTE(review): the hop is hard-coded to n_fft // 2 and the input shape
    # to (80000, 1); the ``hop_length`` argument is not consulted here.
    self.stft = STFT(
        n_fft=self.n_fft,
        win_length=self.win_length,
        hop_length=n_fft // 2,
        window_name="hann_window",
        input_shape=(80000, 1),
        pad_begin=True,
    )
    self.magnitude = Magnitude()
    self.to_decibel = MagnitudeToDecibel()
    # NOTE(review): the original statement read
    # ``self.to_decibel = MagnitudeToDecibel() = tf.zeros([1,])``, which is
    # not valid Python (assignment to a call result). The target of the
    # second assignment was lost; ``self.zero`` is an assumed name --
    # confirm against the methods that read this tensor.
    self.zero = tf.zeros([1,])

    # Initialize the filterbank. Equally spaced in MIDI scale.
    harmonic_hz, self.level = initialize_filterbank(
        sample_rate, n_harmonic, semitone_scale
    )

    # Center frequencies to tensor
    self.f0 = tf.constant(harmonic_hz, dtype="float32")

    # Bandwidth parameters
    if learn_bw == 'only_Q':
        self.bw_Q = tf.Variable(np.array([bw_Q]), dtype="float32", trainable=True)
    elif learn_bw == 'fix':
        self.bw_Q = tf.constant(np.array([bw_Q]), dtype="float32")
def test_spectrogram_tflite_correctness(
    n_fft, hop_length, n_ch, data_format, batch_size, win_length, pad_end
):
    """Check that the TFLite-compatible layers agree with the regular ones.

    Compares STFTTflite with STFT, MagnitudeTflite with Magnitude and the
    approximate PhaseTflite with the approximate Phase. It then compares
    the approximate TFLite phase with the exact Phase layer.
    """

    def _build_model(extra_layer=None, tflite_compatible=False):
        """Return a Sequential model: (TFLite) STFT plus an optional layer."""
        stft_cls = STFTTflite if tflite_compatible else STFT
        model = tensorflow.keras.models.Sequential()
        model.add(
            stft_cls(
                n_fft=n_fft,
                win_length=win_length,
                hop_length=hop_length,
                window_name=None,
                pad_end=pad_end,
                input_data_format=data_format,
                output_data_format=data_format,
                input_shape=input_shape,
                name='stft',
            )
        )
        if extra_layer is not None:
            model.add(extra_layer)
        return model

    src_mono, batch_src, input_shape = get_audio(
        data_format=data_format, n_ch=n_ch, batch_size=batch_size
    )
    # tflite requires a known batch size
    batch_size = batch_src.shape[0]

    lite_stft_model = _build_model(tflite_compatible=True)
    plain_stft_model = _build_model(tflite_compatible=False)

    # STFT(): the tflite output is (batch, time, freq, chan, re/imag);
    # fold the last axis into a complex number -> (batch, time, freq, chan).
    lite_parts = predict_using_tflite(lite_stft_model, batch_src)
    lite_complex = tf.complex(lite_parts[..., 0], lite_parts[..., 1])
    plain_complex = plain_stft_model.predict(batch_src)
    allclose_complex_numbers(plain_complex, lite_complex)

    # Magnitude()
    lite_mag_model = _build_model(MagnitudeTflite(), tflite_compatible=True)
    plain_mag_model = _build_model(Magnitude(), tflite_compatible=False)
    lite_mag = predict_using_tflite(lite_mag_model, batch_src)
    plain_mag = plain_mag_model.predict(batch_src)
    np.testing.assert_allclose(plain_mag, lite_mag, atol=1e-4)

    # Approximate Phase(): tflite and non-tflite must agree.
    lite_approx_model = _build_model(
        PhaseTflite(approx_atan_accuracy=500), tflite_compatible=True
    )
    plain_approx_model = _build_model(
        Phase(approx_atan_accuracy=500), tflite_compatible=False
    )
    lite_approx_phase = predict_using_tflite(lite_approx_model, batch_src)
    plain_approx_phase = plain_approx_model.predict(batch_src, batch_size=batch_size)
    assert_approx_phase(
        lite_approx_phase, plain_approx_phase, atol=1e-2, acceptable_fail_ratio=0.01
    )

    # Accuracy of the approximate phase against the exact Phase() layer.
    exact_phase_model = _build_model(Phase(), tflite_compatible=False)
    exact_phase = exact_phase_model.predict(batch_src, batch_size=batch_size)
    assert_approx_phase(
        lite_approx_phase, exact_phase, atol=1e-2, acceptable_fail_ratio=0.01
    )
def seismo_performer_with_spec(
        maxlen=400,
        nfft=64,
        hop_length=16,
        patch_size_1=22,
        patch_size_2=3,
        num_channels=3,
        num_patches=11,
        d_model=48,
        num_heads=2,
        ff_dim_factor=2,
        layers_depth=2,
        num_classes=3,
        drop_out_rate=0.1):
    """
    Model for P/S/N wave classification using a ViT-style approach: the raw
    signal is converted to a spectrogram, which is then treated as the input
    to a PERFORMER.

    Parameters:
    :maxlen: maximum number of waveform samples
    :nfft: number of FFTs in the short-time Fourier transform
    :hop_length: hop length in samples between analysis windows
    :patch_size_1: patch size for the first dimension (depends on nfft/hop_length)
    :patch_size_2: patch size for the second dimension (depends on nfft/hop_length)
    :num_channels: number of channels (usually equal to 3)
    :num_patches: resulting number of patches (must be set manually!)
    :d_model: embedding size for each token
    :num_heads: number of attention heads
    :ff_dim_factor: hidden layer size of the feed-forward network inside the
        transformer, ff_dim = d_model * ff_dim_factor
    :layers_depth: number of transformer blocks
    :num_classes: number of classes to predict
    :drop_out_rate: rate used by the Performer blocks and the head dropouts
    :returns: Keras model object
    """
    ff_dim = d_model * ff_dim_factor

    inputs = layers.Input(shape=(maxlen, num_channels))

    # Spectrogram transform.
    spec = STFT(
        n_fft=nfft,
        window_name=None,
        pad_end=False,
        hop_length=hop_length,
        input_data_format='channels_last',
        output_data_format='channels_last',
    )(inputs)
    spec = Magnitude()(spec)
    spec = MagnitudeToDecibel()(spec)
    # Custom normalization.
    spec = MaxABSScaler()(spec)

    # Cut the spectrogram into patches and embed each one.
    tokens = Rearrange3d(p1=patch_size_1, p2=patch_size_2)(spec)
    tokens = tf.keras.layers.Dense(d_model)(tokens)
    # Prepend the cls token; hence one extra position for the embeddings.
    tokens = ClsToken(d_model)(tokens)
    tokens = PosEmbeding2(num_patches=num_patches + 1, projection_dim=d_model)(tokens)

    # Encoder blocks.
    for _ in range(layers_depth):
        tokens = PerformerBlock(d_model, num_heads, ff_dim, rate=drop_out_rate)(tokens)

    # Only the first token (index 0) goes to the MLP head.
    head = tf.keras.layers.Lambda(lambda seq: seq[:, 0])(tokens)
    head = tf.keras.layers.LayerNormalization(epsilon=1e-6)(head)

    # MLP head.
    head = layers.Dropout(drop_out_rate)(head)
    head = tf.keras.layers.Dense(d_model * ff_dim_factor, activation='gelu')(head)
    head = layers.Dropout(drop_out_rate)(head)
    head = tf.keras.layers.Dense(d_model, activation='gelu')(head)
    head = layers.Dropout(drop_out_rate)(head)
    outputs = layers.Dense(num_classes, activation='softmax')(head)

    return keras.Model(inputs=inputs, outputs=outputs)