def test_import_tracks(self, corpus):
    """Importing tracks adds them to the corpus and renames colliding ids.

    The assertions expect the ``corpus`` fixture to hold one track before
    the import (four afterwards) and the colliding track to be stored as
    ``existing_file_1``. The returned mapping is looked up by the ids the
    tracks had before the import.
    """
    importing_tracks = [
        tracks.FileTrack('a', '/some/path.wav'),
        tracks.FileTrack('b', '/some/other/path.wav'),
        tracks.FileTrack('existing_file', '/some/otherer/path.wav'),
    ]

    idx_mapping = corpus.import_tracks(importing_tracks)

    assert corpus.num_tracks == 4

    assert 'a' in corpus.tracks
    assert corpus.tracks['a'].path == '/some/path.wav'

    assert 'b' in corpus.tracks
    assert corpus.tracks['b'].path == '/some/other/path.wav'

    assert 'existing_file_1' in corpus.tracks
    assert corpus.tracks['existing_file_1'].path == '/some/otherer/path.wav'

    assert len(idx_mapping) == 3
    # Compare for equality: a substring check would also accept a wrongly
    # renamed id such as 'a_1'.
    assert idx_mapping['a'].idx == 'a'
    assert idx_mapping['b'].idx == 'b'
    assert idx_mapping['existing_file'].idx == 'existing_file_1'
def test_exports_wavs_from_container_tracks(self, writer, tmpdir):
    """Saving a corpus whose audio lives in a container writes wav files.

    The corpus audio is first relocated into a single container, then the
    corpus is saved with ``writer``. Every expected wav file has to exist
    in the ``audio`` sub-folder of the export with the expected duration.
    """
    path = tmpdir.strpath
    container_ds_path = os.path.join(path, 'container_ds')
    out_path = os.path.join(path, 'export')

    ds = resources.create_dataset()
    ds.relocate_audio_to_single_container(container_ds_path)
    writer.save(ds, out_path)

    audio_dir = os.path.join(out_path, 'audio')

    # File names exactly as asserted before; note that the first one uses
    # a dash while the others use an underscore.
    for file_name in ('wav-1.wav', 'wav_2.wav', 'wav_3.wav', 'wav_4.wav'):
        track_path = os.path.join(audio_dir, file_name)
        assert os.path.isfile(track_path)

        track = tracks.FileTrack(None, track_path)
        assert track.duration == pytest.approx(2.5951875)

    # For the first track additionally compare the actual samples.
    exported = tracks.FileTrack(None, os.path.join(audio_dir, 'wav-1.wav'))
    assert np.allclose(exported.read_samples(),
                       ds.tracks['wav-1'].read_samples(),
                       atol=1e-05)
def test_encode_label_ends_at_utterance_end(self):
    """A label with an open end (``inf``) is encoded for all remaining frames."""
    wav_path = resources.sample_wav_file('med_len.wav')
    track = tracks.FileTrack('file1', wav_path)
    utt = tracks.Utterance('utt1', track, start=3, end=14)

    labels = [
        annotations.Label('speech', 0, 4),
        annotations.Label('music', 4, 9),
        annotations.Label('speech', 9, float('inf')),
    ]
    utt.set_label_list(annotations.LabelList(labels=labels))

    frame_settings = units.FrameSettings(32000, 16000)
    enc = encoding.FrameHotEncoder(['music', 'speech', 'noise'],
                                   'default',
                                   frame_settings=frame_settings,
                                   sr=16000)

    actual = enc.encode_utterance(utt)

    # One row per frame, one column per label passed to the encoder.
    speech_only = [0, 1, 0]
    music_only = [1, 0, 0]
    music_and_speech = [1, 1, 0]
    rows = (
        [speech_only] * 3
        + [music_and_speech]
        + [music_only] * 4
        + [music_and_speech]
        + [speech_only]
    )
    expected = np.array(rows).astype(np.float32)

    assert np.array_equal(expected, actual)
def relocate_audio_to_wav_files(self, target_path):
    """
    Copies every track to its own wav file in the given folder.
    Every track will be stored at ``target_path/track_id.wav``.

    Afterwards the track list of the corpus is replaced by file tracks
    pointing to the written files, and every utterance references the new
    track with the same id.

    Parameters:
        target_path (str): Folder to store the wav files in. It is
                           created if it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(target_path, exist_ok=True)

    new_tracks = {}

    # Write every existing track to its own wav file and create a file
    # track that points to it.
    for track in self.tracks.values():
        track_path = os.path.join(target_path, '{}.wav'.format(track.idx))

        sr = track.sampling_rate
        samples = track.read_samples()
        audio.write_wav(track_path, samples, sr=sr)

        new_tracks[track.idx] = tracks.FileTrack(track.idx, track_path)

    # Update track list of corpus
    self._tracks = new_tracks

    # Update utterances to point to new tracks
    for utterance in self.utterances.values():
        utterance.track = self.tracks[utterance.track.idx]
def test_process_track_online(self, processor, tmpdir):
    """Online processing of a track yields the frames chunk by chunk."""
    # NOTE(review): librosa.output was removed in librosa 0.8 - confirm
    # the librosa version this test suite is pinned to.
    wav_path = os.path.join(tmpdir.strpath, 'file.wav')
    wav_content = np.random.random(174)
    librosa.output.write_wav(wav_path, wav_content, 16000)
    track = tracks.FileTrack('idx', wav_path)

    chunk_gen = processor.process_track_online(track,
                                               frame_size=20,
                                               hop_size=10,
                                               chunk_size=8)
    chunks = list(chunk_gen)

    assert len(chunks) == 3

    first_frame = chunks[0][0]
    assert np.allclose(first_frame, wav_content[0:20], atol=0.0001)

    # The final frame is compared against the remaining samples
    # zero-padded by 6.
    last_frame = chunks[2][-1]
    padded_tail = np.pad(wav_content[160:], (0, 6), mode='constant')
    assert np.allclose(last_frame, padded_tail, atol=0.0001)

    assert chunks[0].dtype == np.float32

    # The processor fixture records the arguments of every call.
    assert processor.called_with_sr == [16000, 16000, 16000]
    assert processor.called_with_offset == [0, 8, 16]
    assert processor.called_with_last == [False, False, True]
    assert processor.called_with_utterance == [None, None, None]
    assert processor.called_with_corpus == [None, None, None]
def test_does_utt_match_target_format_with_invalid_format_returns_false(self):
    """An utterance on the mp3 sample is reported as not matching."""
    mp3_path = resources.get_resource_path(
        ('audio_formats', 'mp3_2_44_1k_16b.mp3'))
    utterance = tracks.Utterance('u', tracks.FileTrack('t', mp3_path))
    converter = conversion.WavAudioFileConverter()

    matches = converter._does_utt_match_target_format(utterance)

    assert not matches
def test_does_utt_match_target_format_returns_true(self):
    """An utterance on the ``wav_1.wav`` sample is reported as matching."""
    wav_path = resources.sample_wav_file('wav_1.wav')
    utterance = tracks.Utterance('u', tracks.FileTrack('t', wav_path))
    converter = conversion.WavAudioFileConverter()

    matches = converter._does_utt_match_target_format(utterance)

    assert matches
def test_read_samples(self):
    """A label reads the samples of its own span within the track.

    The utterance starts at 1.0 in the track, so the expected offsets
    passed to librosa are the label starts shifted by 1.0.
    """
    path = resources.sample_wav_file('wav_1.wav')
    track = tracks.FileTrack('wav', path)
    issuer = issuers.Issuer('toni')
    utt = tracks.Utterance('t', track, issuer=issuer, start=1.0, end=2.30)

    l1 = annotations.Label('a', 0.15, 0.448)
    l2 = annotations.Label('a', 0.5, 0.73)
    ll = annotations.LabelList(labels=[l1, l2])
    utt.set_label_list(ll)

    expected, __ = librosa.core.load(path, sr=None, offset=1.15,
                                     duration=0.298)
    assert np.array_equal(l1.read_samples(), expected)

    expected, __ = librosa.core.load(path, sr=None, offset=1.5,
                                     duration=1.73 - 1.5)
    assert np.array_equal(l2.read_samples(), expected)
def test_compute_online(self):
    """Online onset strength of a track matches a librosa reference."""
    test_file_path = resources.sample_wav_file('wav_1.wav')
    y, sr = librosa.load(test_file_path, sr=None)

    # EXPECTED
    # The signal is zero-padded by one hop (1024 samples) at the end
    # before the non-centered STFT.
    y_pad = np.pad(y, (0, 1024), mode='constant', constant_values=0)
    S = np.abs(
        librosa.stft(y_pad, center=False, n_fft=2048, hop_length=1024))**2
    S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
    S = librosa.power_to_db(S)
    exp_onsets = librosa.onset.onset_strength(S=S, center=False).T
    # Reshape to a single column, one row per frame.
    exp_onsets = exp_onsets.reshape(exp_onsets.shape[0], 1)

    # ACTUAL
    test_file = tracks.FileTrack('idx', test_file_path)
    onset = pipeline.OnsetStrength()
    onset_gen = onset.process_track_online(test_file, 2048, 1024,
                                           chunk_size=5)

    chunks = list(onset_gen)
    onsets = np.vstack(chunks)

    assert np.allclose(onsets, exp_onsets)
def test_split_sets_track(self):
    """Both utterances resulting from a split keep the original track."""
    track = tracks.FileTrack('file-1', '/some/path')
    utterance = tracks.Utterance('utt-1', track, start=0.0, end=10.0)

    parts = utterance.split([5.2])

    assert len(parts) == 2
    first, second = parts[0], parts[1]
    assert first.track == track
    assert second.track == track
def test_read_samples_fix_sampling_rate(self, name, audio_path):
    """Reading with ``sr=16000`` equals loading via librosa at 16000 Hz."""
    target_sr = 16000
    full_path = os.path.join(audio_path, name)
    track = tracks.FileTrack('some_idx', full_path)

    expected, __ = librosa.core.load(full_path, sr=target_sr, mono=True)
    actual = track.read_samples(sr=target_sr)

    assert np.array_equal(actual, expected)
def test_import_utterance_no_track(self, corpus):
    """Importing an utterance on the ``notexist`` track raises ValueError."""
    unknown_track = tracks.FileTrack('notexist', 'notexist')
    known_issuer = corpus.issuers['existing_issuer']
    utterance = tracks.Utterance('a', unknown_track, known_issuer, 0, 10)

    with pytest.raises(ValueError):
        corpus.import_utterances([utterance])
def test_process_empty_track_raises_error(self, processor, tmpdir):
    """Processing a track without any samples raises a ValueError."""
    # NOTE(review): librosa.output was removed in librosa 0.8 - confirm
    # the librosa version this test suite is pinned to.
    wav_path = os.path.join(tmpdir.strpath, 'file.wav')
    no_samples = np.random.random(0)
    librosa.output.write_wav(wav_path, no_samples, 16000)

    empty_track = tracks.FileTrack('idx', wav_path)

    with pytest.raises(ValueError):
        processor.process_track(empty_track,
                                frame_size=4096,
                                hop_size=2048,
                                sr=16000)
def test_utt_read_samples(benchmark):
    """Benchmark ``run`` on utterances of a wav track and an mp3 track.

    For each track four utterances are created: the full track, one with
    only a start, one with only an end, and one with both.
    """
    wav_path = resources.get_test_resource_path(('wav_files', 'med_len.wav'))
    wav_track = tracks.FileTrack('idx', wav_path)
    wav_spans = ({}, {'start': 2.8}, {'end': 10.2}, {'start': 2.4, 'end': 9.8})
    utts = [tracks.Utterance('uidx', wav_track, **span) for span in wav_spans]

    mp3_path = resources.get_test_resource_path(
        ('audio_formats', 'mp3_2_44_1k_16b.mp3'))
    mp3_track = tracks.FileTrack('idx', mp3_path)
    mp3_spans = ({}, {'start': 2.8}, {'end': 4.9}, {'start': 0.4, 'end': 4.8})
    utts.extend(
        tracks.Utterance('uidx', mp3_track, **span) for span in mp3_spans)

    benchmark(run, utts)
def _load(self, path):
    """
    Build a corpus from the articles found for ``path``.

    Every distinct reader name becomes one speaker and every distinct
    audio file becomes one file track; both get zero-padded running
    numbers as ids. For every segment with a matching audio file an
    utterance with a word-transcript label list is created, unless its id
    is listed in ``self.invalid_utterance_ids``.

    Parameters:
        path (str): Passed to ``self.get_articles`` to find the articles.

    Returns:
        The result of ``audiomate.Corpus.from_corpus`` for the built corpus.
    """
    corpus = audiomate.Corpus()
    speakers_by_reader = {}
    tracks_by_file = {}

    for article_path in sorted(self.get_articles(path)):
        audio_files = self.get_audio_file_info(article_path)
        reader_name, reader_gender = self.get_reader_info(article_path)
        segments = self.get_segments(article_path)

        if reader_name in speakers_by_reader:
            speaker = speakers_by_reader[reader_name]
        else:
            speaker_idx = '{:0>8}'.format(len(speakers_by_reader))
            speaker = issuers.Speaker(speaker_idx, gender=reader_gender)
            speakers_by_reader[reader_name] = speaker
            corpus.import_issuers(speaker)

        for start, end, text in segments:
            file_path = self.find_audio_file_for_segment(
                start, end, audio_files)

            # Segments without a matching audio file are skipped.
            if file_path is None:
                continue

            if file_path in tracks_by_file:
                track = tracks_by_file[file_path]
            else:
                track_idx = '{:0>10}'.format(len(tracks_by_file))
                track = tracks.FileTrack(track_idx, file_path)
                tracks_by_file[file_path] = track
                corpus.import_tracks(track)

            # presumably the position of the file within the article;
            # segment times are shifted by it to be relative to the track
            track_offset = audio_files[file_path]
            utt_start = start - track_offset
            utt_end = end - track_offset

            utt_idx = '{}_{}_{}_{}'.format(speaker.idx, track.idx,
                                           int(start * 1000),
                                           int(end * 1000))

            if utt_idx in self.invalid_utterance_ids:
                continue

            utt = corpus.new_utterance(utt_idx,
                                       track.idx,
                                       issuer_idx=speaker.idx,
                                       start=utt_start,
                                       end=utt_end)
            transcript = annotations.LabelList.create_single(
                text, audiomate.corpus.LL_WORD_TRANSCRIPT)
            utt.set_label_list(transcript)

    return audiomate.Corpus.from_corpus(corpus)
def test_write_wav(tmpdir):
    """Samples written with ``audio.write_wav`` can be read back again."""
    sr = 16000
    samples = np.random.random(50000)
    path = os.path.join(tmpdir.strpath, 'audio.wav')

    audio.write_wav(path, samples, sr=sr)

    assert os.path.isfile(path)

    # Compared with a tolerance of 1.e-04 rather than exactly.
    written = tracks.FileTrack('idx', path)
    assert np.allclose(samples, written.read_samples(), atol=1.e-04)
def corpus():
    """Create a corpus holding one track, one issuer and one utterance."""
    existing_track = tracks.FileTrack('existing_file', '../any/path.wav')
    existing_issuer = issuers.Issuer('existing_issuer')
    existing_utt = tracks.Utterance('existing_utt',
                                    existing_track,
                                    issuer=existing_issuer)

    corpus = audiomate.Corpus()
    corpus.tracks['existing_file'] = existing_track
    corpus.issuers['existing_issuer'] = existing_issuer
    corpus.utterances['existing_utt'] = existing_utt

    return corpus
def test_read_samples_range(self, name, audio_path):
    """Reading with offset and duration equals the same librosa load."""
    offset, duration = 1.0, 1.7
    full_path = os.path.join(audio_path, name)
    track = tracks.FileTrack('some_idx', full_path)

    expected, __ = librosa.core.load(full_path,
                                     sr=None,
                                     mono=True,
                                     offset=offset,
                                     duration=duration)
    actual = track.read_samples(offset=offset, duration=duration)

    assert np.array_equal(actual, expected)
def generate_tracks(n, rand=None):
    """
    Create ``n`` file tracks with ids ``track-0``, ``track-1``, ... and
    paths ``/fake/<track id>.wav``.

    Parameters:
        n (int): Number of tracks to create.
        rand (random.Random): Kept so existing callers can still pass it.
                              The output does not depend on randomness, so
                              it is not used.

    Returns:
        list: The created file tracks in ascending order of their number.
    """
    track_ids = ['track-{}'.format(i) for i in range(n)]
    return [
        tracks.FileTrack(track_idx, '/fake/{}.wav'.format(track_idx))
        for track_idx in track_ids
    ]
def test_process_track_with_downsampling(self, processor, tmpdir):
    """A track written at rate 4 is processed with a requested ``sr`` of 2."""
    # NOTE(review): librosa.output was removed in librosa 0.8 - confirm
    # the librosa version this test suite is pinned to.
    wav_path = os.path.join(tmpdir.strpath, 'file.wav')
    wav_content = np.random.random(22)
    librosa.output.write_wav(wav_path, wav_content, 4)
    track = tracks.FileTrack('idx', wav_path)

    result = processor.process_track(track, frame_size=4, hop_size=2, sr=2)

    assert result.shape == (5, 4)

    # The processor fixture records the arguments of every call.
    recorded = (
        (processor.called_with_sr, [2]),
        (processor.called_with_offset, [0]),
        (processor.called_with_last, [True]),
        (processor.called_with_utterance, [None]),
        (processor.called_with_corpus, [None]),
    )
    for actual, expected in recorded:
        assert actual == expected
def test_process_track_smaller_than_frame_size(self, processor, tmpdir):
    """A track shorter than one frame yields a single zero-padded frame."""
    # NOTE(review): librosa.output was removed in librosa 0.8 - confirm
    # the librosa version this test suite is pinned to.
    frame_size = 4096
    wav_path = os.path.join(tmpdir.strpath, 'file.wav')
    wav_content = np.random.random(22)
    librosa.output.write_wav(wav_path, wav_content, 16000)
    track = tracks.FileTrack('idx', wav_path)

    result = processor.process_track(track,
                                     frame_size=frame_size,
                                     hop_size=2048,
                                     sr=16000)

    assert result.shape == (1, 4096)

    # 22 samples followed by 4074 zeros fill the frame of 4096.
    padded = np.pad(wav_content, (0, 4074), mode='constant')
    assert np.allclose(result[0], padded, atol=0.0001)

    assert processor.called_with_sr == [16000]
    assert processor.called_with_offset == [0]
    assert processor.called_with_last == [True]
    assert processor.called_with_utterance == [None]
    assert processor.called_with_corpus == [None]
def setup_method(self):
    """Create four label lists, a track, an issuer and an utterance.

    ``ll_duplicate_idx`` and ``ll_3`` deliberately share the idx
    ``charlie``; both are passed to the utterance, in that order.
    """

    def build_label_list(idx, label_specs):
        # Every spec is a (value, start, end) tuple.
        labels = [annotations.Label(*spec) for spec in label_specs]
        return annotations.LabelList(idx=idx, labels=labels)

    self.ll_1 = build_label_list('alpha', [
        ('a', 3.2, 4.5),
        ('b', 5.1, 8.9),
        ('c', 7.2, 10.5),
        ('d', 10.5, 14),
        ('d', 15, 18),
    ])

    self.ll_2 = build_label_list('bravo', [
        ('a', 1.0, 4.2),
        ('e', 4.2, 7.9),
        ('c', 7.2, 10.5),
        ('f', 10.5, 14),
        ('d', 15, 17.3),
    ])

    self.ll_duplicate_idx = build_label_list('charlie', [
        ('t', 1.0, 4.2),
        ('h', 4.2, 7.9),
    ])

    self.ll_3 = build_label_list('charlie', [
        ('a', 1.0, 4.2),
        ('g', 4.2, 7.9),
    ])

    self.track = tracks.FileTrack('wav',
                                  resources.sample_wav_file('wav_1.wav'))
    self.issuer = issuers.Issuer('toni')

    all_label_lists = [
        self.ll_1,
        self.ll_2,
        self.ll_duplicate_idx,
        self.ll_3,
    ]
    self.utt = tracks.Utterance('test',
                                self.track,
                                issuer=self.issuer,
                                start=1.25,
                                end=1.30,
                                label_lists=all_label_lists)
def test_read_frames(self, tmpdir):
    """Frames are read with padding and only the final one is flagged last."""
    # NOTE(review): librosa.output was removed in librosa 0.8 - confirm
    # the librosa version this test suite is pinned to.
    wav_path = os.path.join(tmpdir.strpath, 'file.wav')
    wav_content = np.random.random(10044)
    librosa.output.write_wav(wav_path, wav_content, 16000)
    track = tracks.FileTrack('some_idx', wav_path)

    # Every yielded item is indexed as (frame, is_last).
    data = list(track.read_frames(frame_size=400, hop_size=160))
    frames = np.array([item[0] for item in data])
    last = [item[1] for item in data]

    assert frames.shape == (62, 400)
    assert frames.dtype == np.float32
    assert np.allclose(frames[0], wav_content[:400], atol=0.0001)

    # The final frame holds the remaining samples padded with 116 zeros.
    padded_tail = np.pad(wav_content[9760:], (0, 116), mode='constant')
    assert np.allclose(frames[61], padded_tail, atol=0.0001)

    assert last[:-1] == [False] * (len(data) - 1)
    assert last[-1]
def test_process_track(self, processor, tmpdir):
    """Processing a track offline returns all frames from a single call."""
    # NOTE(review): librosa.output was removed in librosa 0.8 - confirm
    # the librosa version this test suite is pinned to.
    wav_path = os.path.join(tmpdir.strpath, 'file.wav')
    wav_content = np.random.random(22)
    librosa.output.write_wav(wav_path, wav_content, 4)
    track = tracks.FileTrack('idx', wav_path)

    result = processor.process_track(track, frame_size=4, hop_size=2)

    assert result.shape == (10, 4)
    assert result.dtype == np.float32

    first_frame, last_frame = result[0], result[9]
    assert np.allclose(first_frame, wav_content[0:4], atol=0.0001)
    assert np.allclose(last_frame, wav_content[18:22], atol=0.0001)

    # The processor fixture records the arguments of every call.
    assert processor.called_with_sr == [4]
    assert processor.called_with_offset == [0]
    assert processor.called_with_last == [True]
    assert processor.called_with_utterance == [None]
    assert processor.called_with_corpus == [None]
def test_encode_utterance_takes_lower_index_first(self):
    """Frames are encoded with the index of their label in the encoder list.

    The encoder is created with ``['speech', 'music', 'noise']``; the
    expected output is index 1 for the first two frames and index 0 for
    the last two.
    """
    track = tracks.FileTrack('file-idx',
                             resources.sample_wav_file('wav_1.wav'))
    utt = tracks.Utterance('utt-idx', track, start=0, end=5)
    ll = annotations.LabelList(labels=[
        annotations.Label('music', 0, 3),
        annotations.Label('speech', 3, 5)
    ])
    utt.set_label_list(ll)

    enc = encoding.FrameOrdinalEncoder(['speech', 'music', 'noise'],
                                       'default',
                                       frame_settings=units.FrameSettings(
                                           32000, 16000),
                                       sr=16000)

    actual = enc.encode_utterance(utt)

    # ``np.int`` was only an alias of the builtin ``int`` and was removed
    # in NumPy 1.24, so the builtin is used directly.
    expected = np.array([1, 1, 0, 0]).astype(int)

    assert np.array_equal(expected, actual)
def new_file(self, path, track_idx, copy_file=False):
    """
    Adds a new audio file to the corpus with the given data.

    Parameters:
        path (str): Path of the file to add.
        track_idx (str): The id to associate the file-track with.
        copy_file (bool): If True the file is copied to the data set folder,
                          otherwise the given path is used directly.

    Returns:
        FileTrack: The newly added file.

    Raises:
        ValueError: If ``copy_file`` is True, but the corpus has no path
                    or the path is not an existing folder.
    """
    new_file_idx = track_idx
    new_file_path = os.path.abspath(path)

    # Add index to idx if already existing
    if new_file_idx in self._tracks:
        new_file_idx = naming.index_name_if_in_list(
            new_file_idx, self._tracks.keys())

    # Copy file to default file dir
    if copy_file:
        # os.path.isdir(None) raises a TypeError, so a corpus without a
        # path has to be checked explicitly to get the intended error.
        if self.path is None or not os.path.isdir(self.path):
            raise ValueError(
                'To copy file the dataset needs to have a path.')

        __, ext = os.path.splitext(path)

        new_file_folder = os.path.join(self.path, DEFAULT_FILE_SUBDIR)
        new_file_path = os.path.join(new_file_folder,
                                     '{}{}'.format(new_file_idx, ext))
        os.makedirs(new_file_folder, exist_ok=True)
        shutil.copy(path, new_file_path)

    # Create file obj
    new_file = tracks.FileTrack(new_file_idx, new_file_path)
    self._tracks[new_file_idx] = new_file

    return new_file
def test_compute_online(self):
    """Online tempogram of a track matches a librosa reference."""
    # Data: 41523 samples, 16 kHz
    # yields 40 frames with frame-size 2048 and hop-size 1024
    frame_size = 2048
    hop_size = 1024
    test_file_path = resources.sample_wav_file('wav_1.wav')
    y, sr = librosa.load(test_file_path, sr=None)

    # EXPECTED
    y_pad = np.pad(y, (0, 1024), mode='constant', constant_values=0)
    spectrum = librosa.stft(y_pad,
                            center=False,
                            n_fft=frame_size,
                            hop_length=hop_size)
    power = np.abs(spectrum) ** 2
    mel = librosa.feature.melspectrogram(S=power, n_mels=128, sr=sr)
    mel_db = librosa.power_to_db(mel)
    onsets = librosa.onset.onset_strength(S=mel_db, center=False)
    exp_tgram = librosa.feature.tempogram(onset_envelope=onsets,
                                          sr=sr,
                                          win_length=4,
                                          center=True).T

    # ACTUAL
    test_file = tracks.FileTrack('idx', test_file_path)
    tgram_step = pipeline.Tempogram(win_length=4)
    tgram_gen = tgram_step.process_track_online(test_file,
                                                frame_size,
                                                hop_size,
                                                chunk_size=5)

    tgrams = np.vstack(list(tgram_gen))

    assert np.allclose(tgrams, exp_tgram)
def sample_utterance():
    """Return an utterance ``test`` on a track for the ``wav_1.wav`` sample."""
    wav_path = resources.sample_wav_file('wav_1.wav')
    track = tracks.FileTrack('test_file', wav_path)
    return tracks.Utterance('test', track)
def test_read_samples(benchmark):
    """Benchmark ``run`` on a file track for the ``med_len.wav`` resource."""
    resource_parts = ('wav_files', 'med_len.wav')
    wav_path = resources.get_test_resource_path(resource_parts)
    wav_track = tracks.FileTrack('idx', wav_path)

    benchmark(run, wav_track)
def test_contains_track_returns_false(self, ds):
    """A track with id ``wav-1`` and this path is not contained in ``ds``."""
    candidate = tracks.FileTrack('wav-1', '/some/other/path/here')

    is_contained = ds.contains_track(candidate)

    assert not is_contained