def model_cnn_spec(timewindow, nfft, hop_length=4):
    """build very base CNN model on top of spectrogram.
    :returns: keras model object
    """
    # std_dev_input = 0.001
    inputs = keras.Input(shape=(timewindow, 3))
    x = STFT(n_fft=nfft,
             window_name=None,
             pad_end=False,
             hop_length=hop_length,
             input_data_format='channels_last',
             output_data_format='channels_last',)(inputs)
    x = Magnitude()(x)
    x = MagnitudeToDecibel()(x)
    x = MaxABSScaler()(x)
    #x = tf.keras.layers.Lambda(lambda image: tf.image.resize(image, (60,60)))(x)
    x = tf.keras.layers.Conv2D(filters=32, kernel_size=(3,3), padding="same")(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.activations.relu(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2,2))(x)
    x = tf.keras.layers.Conv2D(filters=64, kernel_size=(3,3), padding="same")(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.activations.relu(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2,2))(x)
    x = tf.keras.layers.Conv2D(filters=128, kernel_size=(3,3), padding="same")(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.activations.relu(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2,2))(x)
    x = tf.keras.layers.Flatten()(x)
    x = layers.Dropout(0.5)(x)
    x = tf.keras.layers.Dense(80, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    outputs = tf.keras.layers.Dense(3, activation="softmax")(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model
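A minimal usage sketch for the model above; the 400-sample window and nfft=64 mirror the defaults of seismo_performer_with_spec further down and are purely illustrative, as are the optimizer and loss (kapre's STFT/Magnitude/MagnitudeToDecibel layers and the project-specific MaxABSScaler are assumed to be imported at module level):

import tensorflow as tf

# Hypothetical usage: the training configuration below is illustrative only.
model = model_cnn_spec(timewindow=400, nfft=64, hop_length=16)
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss="categorical_crossentropy",  # matches the 3-way softmax output
    metrics=["accuracy"],
)
model.summary()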
Example #2
def test_spectrogram_correctness_more(data_format, window_name):
    def _get_stft_model(following_layer=None):
        # compute with kapre
        stft_model = tensorflow.keras.models.Sequential()
        stft_model.add(
            STFT(
                n_fft=n_fft,
                win_length=win_length,
                hop_length=hop_length,
                window_name=window_name,
                pad_end=False,
                input_data_format=data_format,
                output_data_format=data_format,
                input_shape=input_shape,
                name='stft',
            )
        )
        if following_layer is not None:
            stft_model.add(following_layer)
        return stft_model

    n_fft = 512
    hop_length = 256
    n_ch = 2

    src_mono, batch_src, input_shape = get_audio(data_format=data_format, n_ch=n_ch)
    win_length = n_fft  # use the full FFT size as the window length
    # compute with librosa
    S_ref = librosa.core.stft(
        src_mono,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        center=False,
        window=window_name.replace('_window', '') if window_name else 'hann',
    ).T  # (time, freq)

    S_ref = np.expand_dims(S_ref, axis=2)  # time, freq, ch=1
    S_ref = np.tile(S_ref, [1, 1, n_ch])  # time, freq, ch=n_ch
    if data_format == 'channels_first':
        S_ref = np.transpose(S_ref, (2, 0, 1))  # ch, time, freq

    stft_model = _get_stft_model()

    S_complex = stft_model.predict(batch_src)[0]  # 3d representation
    allclose_complex_numbers(S_ref, S_complex)

    # test Magnitude()
    stft_mag_model = _get_stft_model(Magnitude())
    S = stft_mag_model.predict(batch_src)[0]  # 3d representation
    np.testing.assert_allclose(np.abs(S_ref), S, atol=2e-4)

    # test Phase()
    stft_phase_model = _get_stft_model(Phase())
    S = stft_phase_model.predict(batch_src)[0]  # 3d representation
    allclose_phase(np.angle(S_complex), S)
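For reference, a self-contained sketch of the same kapre-vs-librosa comparison without the test fixtures (get_audio, allclose_complex_numbers and allclose_phase are helpers of the kapre test suite); the sample rate and tolerance below are assumptions:

import numpy as np
import librosa
import tensorflow as tf
from kapre import STFT

sr, n_fft, hop = 22050, 512, 256
src = np.random.uniform(-1, 1, sr).astype(np.float32)  # 1 second of mono noise
# kapre: Hann window by default, channels_last, no end padding
kapre_model = tf.keras.Sequential(
    [STFT(n_fft=n_fft, hop_length=hop, pad_end=False, input_shape=(sr, 1))]
)
S_kapre = kapre_model.predict(src[np.newaxis, :, np.newaxis])[0, :, :, 0]  # (time, freq)
# librosa: center=False to match pad_end=False above
S_ref = librosa.stft(src, n_fft=n_fft, hop_length=hop, center=False).T  # (time, freq)
np.testing.assert_allclose(np.abs(S_ref), np.abs(S_kapre), atol=2e-4)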
Example #3
    def __init__(
        self,
        sample_rate=16000,
        n_fft=512,
        win_length=None,
        hop_length=None,
        pad=0,
        power=2,
        normalized=False,
        n_harmonic=6,
        semitone_scale=2,
        bw_Q=1.0,
        learn_bw=None,
    ):
        super(HarmonicSTFT, self).__init__()

        # Parameters
        self.sample_rate = sample_rate
        self.n_harmonic = n_harmonic
        self.bw_alpha = 0.1079
        self.bw_beta = 24.7
        self.win_length = win_length
        self.hop_length = hop_length
        self.n_fft = n_fft
        self.fft_bins = tf.linspace(0, self.sample_rate // 2, self.n_fft // 2 + 1)
        
        # Spectrogram
        self.stft = STFT(
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=n_fft//2,
            window_name="hann_window",
            input_shape=(80000, 1),
            pad_begin=True,
        )
        self.magnitude = Magnitude()
        self.to_decibel = MagnitudeToDecibel()
        self.zero = tf.zeros([1,])

        # Initialize the filterbank. Equally spaced in MIDI scale.
        harmonic_hz, self.level = initialize_filterbank(
            sample_rate, n_harmonic, semitone_scale
        )

        # Center frequencies as a tensor
        self.f0 = tf.constant(harmonic_hz, dtype="float32")
        
        # Bandwidth parameters
        if learn_bw == 'only_Q':
            self.bw_Q = tf.Variable(np.array([bw_Q]), dtype="float32", trainable=True)
        elif learn_bw == 'fix':
            self.bw_Q = tf.constant(np.array([bw_Q]), dtype="float32")
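Here bw_alpha = 0.1079 and bw_beta = 24.7 are the coefficients of the ERB bandwidth approximation ERB(f) ≈ 0.1079·f + 24.7 Hz, which the rest of the class presumably uses to size the harmonic filters. A hypothetical instantiation (initialize_filterbank is assumed to be defined elsewhere in the same project, and inputs must be 80000-sample clips because of the hard-coded input_shape):

# Hypothetical usage: 5-second clips at 16 kHz match the hard-coded (80000, 1) input shape.
hstft = HarmonicSTFT(
    sample_rate=16000,
    n_fft=512,
    n_harmonic=6,
    semitone_scale=2,
    learn_bw='only_Q',  # make the bandwidth Q a trainable variable
)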
Example #4
def test_spectrogram_tflite_correctness(
    n_fft, hop_length, n_ch, data_format, batch_size, win_length, pad_end
):
    def _get_stft_model(following_layer=None, tflite_compatible=False):
        # compute with kapre
        stft_model = tensorflow.keras.models.Sequential()
        if tflite_compatible:
            stft_model.add(
                STFTTflite(
                    n_fft=n_fft,
                    win_length=win_length,
                    hop_length=hop_length,
                    window_name=None,
                    pad_end=pad_end,
                    input_data_format=data_format,
                    output_data_format=data_format,
                    input_shape=input_shape,
                    name='stft',
                )
            )
        else:
            stft_model.add(
                STFT(
                    n_fft=n_fft,
                    win_length=win_length,
                    hop_length=hop_length,
                    window_name=None,
                    pad_end=pad_end,
                    input_data_format=data_format,
                    output_data_format=data_format,
                    input_shape=input_shape,
                    name='stft',
                )
            )
        if following_layer is not None:
            stft_model.add(following_layer)
        return stft_model

    src_mono, batch_src, input_shape = get_audio(
        data_format=data_format, n_ch=n_ch, batch_size=batch_size
    )
    # tflite requires a known batch size
    batch_size = batch_src.shape[0]

    stft_model_tflite = _get_stft_model(tflite_compatible=True)
    stft_model = _get_stft_model(tflite_compatible=False)

    # test STFT()
    S_complex_tflite = predict_using_tflite(stft_model_tflite, batch_src)  # predict using tflite
    # (batch, time, freq, chan, re/imag) - convert to complex number:
    S_complex_tflite = tf.complex(
        S_complex_tflite[..., 0], S_complex_tflite[..., 1]
    )  # (batch,time,freq,chan)
    S_complex = stft_model.predict(batch_src)  # predict using tf model
    allclose_complex_numbers(S_complex, S_complex_tflite)

    # test Magnitude()
    stft_mag_model_tflite = _get_stft_model(MagnitudeTflite(), tflite_compatible=True)
    stft_mag_model = _get_stft_model(Magnitude(), tflite_compatible=False)
    S_lite = predict_using_tflite(stft_mag_model_tflite, batch_src)  # predict using tflite
    S = stft_mag_model.predict(batch_src)  # predict using tf model
    np.testing.assert_allclose(S, S_lite, atol=1e-4)

    # test approx Phase(): same for tflite and non-tflite
    stft_approx_phase_model_lite = _get_stft_model(
        PhaseTflite(approx_atan_accuracy=500), tflite_compatible=True
    )
    stft_approx_phase_model = _get_stft_model(
        Phase(approx_atan_accuracy=500), tflite_compatible=False
    )
    S_approx_phase_lite = predict_using_tflite(
        stft_approx_phase_model_lite, batch_src
    )  # predict using tflite
    S_approx_phase = stft_approx_phase_model.predict(
        batch_src, batch_size=batch_size
    )  # predict using tf model
    assert_approx_phase(S_approx_phase_lite, S_approx_phase, atol=1e-2, acceptable_fail_ratio=0.01)

    # test accuracy of approx Phase()
    stft_phase_model = _get_stft_model(Phase(), tflite_compatible=False)
    S_phase = stft_phase_model.predict(batch_src, batch_size=batch_size)  # predict using tf model
    assert_approx_phase(S_approx_phase_lite, S_phase, atol=1e-2, acceptable_fail_ratio=0.01)
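predict_using_tflite is a helper from the kapre test suite; a rough, hypothetical sketch of what such a helper could do with the standard TFLite conversion and interpreter APIs (the actual helper may differ):

import numpy as np
import tensorflow as tf

def predict_using_tflite_sketch(model, batch_src):
    """Convert a Keras model to TFLite and run it sample-by-sample (hypothetical helper)."""
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    tflite_model = converter.convert()
    interpreter = tf.lite.Interpreter(model_content=tflite_model)
    interpreter.allocate_tensors()
    input_detail = interpreter.get_input_details()[0]
    output_detail = interpreter.get_output_details()[0]
    outputs = []
    for sample in batch_src:  # the converted graph expects a fixed batch size of 1
        interpreter.set_tensor(input_detail['index'], sample[np.newaxis].astype(np.float32))
        interpreter.invoke()
        outputs.append(interpreter.get_tensor(output_detail['index'])[0])
    return np.stack(outputs, axis=0)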
def seismo_performer_with_spec(
        maxlen=400,
        nfft=64,
        hop_length=16,
        patch_size_1=22,
        patch_size_2=3,
        num_channels=3,
        num_patches=11,
        d_model=48,
        num_heads=2,
        ff_dim_factor=2,
        layers_depth=2,
        num_classes=3,
        drop_out_rate=0.1):
    """
    The model for P/S/N waves classification using ViT approach
    with converted raw signal to spectrogram and the treat it as input to PERFORMER
    Parameters:
    :maxlen: maximum samples of waveforms
    :nfft: number of FFTs in short-time Fourier transform
    :hop_length: Hop length in sample between analysis windows
    :patch_size_1: patch size for first dimention (depends on nfft/hop_length)
    :patch_size_2: patch size for second dimention (depends on nfft/hop_length)
    :num_channels: number of channels (usually it's equal to 3)
    :num_patches: resulting number of patches (FIX manual setup!)
    :d_model: Embedding size for each token
    :num_heads: Number of attention heads
    :ff_dim_factor: Hidden layer size in feed forward network inside transformer
                    ff_dim = d_model * ff_dim_factor
    :layers_depth: The number of transformer blocks
    :num_classes: The number of classes to predict
    :returns: Keras model object
    """
    ff_dim = d_model * ff_dim_factor
    inputs = layers.Input(shape=(maxlen, num_channels))
    # do transform
    x = STFT(n_fft=nfft,
             window_name=None,
             pad_end=False,
             hop_length=hop_length,
             input_data_format='channels_last',
             output_data_format='channels_last',)(inputs)
    x = Magnitude()(x)
    x = MagnitudeToDecibel()(x)
    # custom normalization
    x = MaxABSScaler()(x)
    # patch the input channel
    x = Rearrange3d(p1=patch_size_1,p2=patch_size_2)(x)
    # embedding
    x = tf.keras.layers.Dense(d_model)(x)
    # add cls token
    x = ClsToken(d_model)(x)
    # positional embeddings
    x = PosEmbeding2(num_patches=num_patches + 1, projection_dim=d_model)(x)
    # encoder block
    for i in range(layers_depth):
        x = PerformerBlock(d_model, num_heads, ff_dim, rate=drop_out_rate)(x)
    # to MLP head
    x = tf.keras.layers.Lambda(lambda x: x[:, 0])(x)
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)
    # MLP-head
    x = layers.Dropout(drop_out_rate)(x)
    x = tf.keras.layers.Dense(d_model*ff_dim_factor, activation='gelu')(x)
    x = layers.Dropout(drop_out_rate)(x)
    x = tf.keras.layers.Dense(d_model, activation='gelu')(x)
    x = layers.Dropout(drop_out_rate)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model
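A hypothetical end-to-end sketch for the Performer model above; the custom layers it uses (MaxABSScaler, Rearrange3d, ClsToken, PosEmbeding2, PerformerBlock) are assumed to be importable from the same project, and the optimizer, loss, and label encoding are illustrative choices:

import tensorflow as tf

# Hypothetical training setup; integer-encoded P/S/N labels are an assumption.
model = seismo_performer_with_spec()  # defaults: 400-sample, 3-channel waveforms
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()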