def test_convert_to_sequences():
    """Check the output shapes of ``FeatureExtractor.convert_to_sequences``.

    Two groups of checks are performed:

    * synthetic ``(frames, 2)`` zero arrays, with the default settings and
      with ``sequence_hop_time`` / ``sequence_time`` set to ``-1``;
    * power spectrograms of zero-valued audio that has first gone through
      ``pad_audio``, to verify that both methods agree on sequence counts.
    """

    def padded_power_spectrogram(extractor, n_frames):
        """Pad silent audio, run an STFT and return its power as (time, bins)."""
        n_samples = n_frames * extractor.audio_hop + extractor.audio_win
        padded = extractor.pad_audio(np.zeros(n_samples))
        stft = librosa.core.stft(
            padded,
            n_fft=1024,
            hop_length=extractor.audio_hop,
            win_length=extractor.audio_win,
            center=False,
        )
        power = np.abs(stft) ** 2
        # Put time on the first axis, which is what the (count, frames, 513)
        # assertions below expect.
        return power.T

    extractor = FeatureExtractor(pad_mode="constant")

    # Input spanning exactly two sequences: (64, 2) -> (3, 32, 2).
    exact_fit = np.zeros((extractor.sequence_frames * 2, 2))
    assert extractor.convert_to_sequences(exact_fit).shape == (3, 32, 2)

    # Ten extra frames, (74, 2): the expected shape is unchanged, i.e. the
    # trailing frames that do not fill a sequence are ignored.
    with_remainder = np.zeros((extractor.sequence_frames * 2 + 10, 2))
    assert extractor.convert_to_sequences(with_remainder).shape == (3, 32, 2)

    # sequence_hop_time=-1 with the default sequence_time: a single sequence
    # of 32 frames is expected from the 74-frame input.
    single_sequence = FeatureExtractor(sequence_hop_time=-1,
                                       pad_mode="constant")
    frames = single_sequence.convert_to_sequences(with_remainder)
    assert frames.shape == (1, 32, 2)

    # Both times set to -1: the whole input is expected back as one sequence.
    whole_input = FeatureExtractor(sequence_hop_time=-1,
                                   sequence_time=-1,
                                   pad_mode="constant")
    frames = whole_input.convert_to_sequences(with_remainder)
    assert frames.shape == (1, 74, 2)

    # Test it together with pad_audio.
    extractor = FeatureExtractor(pad_mode="constant")
    # (audio length in sequences, expected number of output sequences)
    cases = ((1, 1), (2, 3))
    for n_sequences, expected_count in cases:
        spectrogram = padded_power_spectrogram(
            extractor, n_sequences * extractor.sequence_frames)
        frames = extractor.convert_to_sequences(spectrogram)
        # 513 frequency bins == 1024 // 2 + 1 for n_fft=1024.
        assert frames.shape == (expected_count,
                                extractor.sequence_frames,
                                513)
def test_pad_audio():
    """Check the length of the signal returned by ``FeatureExtractor.pad_audio``.

    Every scenario builds an all-ones signal whose length is
    ``n_frames * audio_hop + audio_win`` samples and compares the length of
    the padded result with the same formula evaluated at the expected number
    of frames.
    """

    def samples_for(extractor, n_frames):
        """Sample count for ``n_frames`` hops plus one analysis window."""
        return n_frames * extractor.audio_hop + extractor.audio_win

    def padded_length(extractor, n_frames):
        """Length of ``pad_audio`` output for an all-ones signal."""
        signal = np.ones(samples_for(extractor, n_frames))
        return len(extractor.pad_audio(signal))

    # No sequence slicing (sequence_hop_time=-1).
    # Original notes: sequence_frames = 32 with the default settings.
    unsliced = FeatureExtractor(sequence_hop_time=-1, pad_mode='constant')

    # Audio one frame shorter than a sequence -> exactly one sequence.
    assert padded_length(
        unsliced, unsliced.sequence_frames - 1
    ) == samples_for(unsliced, unsliced.sequence_frames)

    # No sequence slicing, audio larger than sequence_time: the expected
    # length is still that of exactly one sequence.
    assert padded_length(
        unsliced, unsliced.sequence_frames + 1
    ) == samples_for(unsliced, unsliced.sequence_frames)

    # Sequence slicing, audio shorter than one sequence.
    # Original notes: sequence_frames = 32, sequence_hop = 16.
    sliced = FeatureExtractor(pad_mode='constant')
    assert padded_length(
        sliced, sliced.sequence_frames - 1
    ) == samples_for(sliced, sliced.sequence_frames)

    # Sequence slicing, audio larger than one sequence: one more hop of
    # frames is expected on top of the first sequence.
    sliced = FeatureExtractor(pad_mode='constant')
    assert padded_length(
        sliced, sliced.sequence_frames + 1
    ) == samples_for(sliced, sliced.sequence_frames + sliced.sequence_hop)

    # Sequence slicing, audio length equal to two sequences: the length is
    # expected to be unchanged.
    sliced = FeatureExtractor(pad_mode='constant')
    assert padded_length(
        sliced, 2 * sliced.sequence_frames
    ) == samples_for(sliced, 2 * sliced.sequence_frames)