def write_labels(path, corpus):
    """
    Write the labels of all utterances, one file per label-list index.

    Every label becomes one record ``(utterance-idx, start, end, value)``.
    An end of ``float('inf')`` is stored as ``-1``. If a label carries
    meta data, it is appended to the value as JSON in square brackets.

    Args:
        path (str): Folder to write the label files to.
        corpus: Corpus whose utterances hold the label lists to write.
    """
    grouped = collections.defaultdict(list)

    for utt in corpus.utterances.values():
        for ll_idx, labels in utt.label_lists.items():
            rows = []

            for label in labels:
                label_end = -1 if label.end == float('inf') else label.end

                if len(label.meta) > 0:
                    text = '{} [{}]'.format(
                        label.value, json.dumps(label.meta, sort_keys=True))
                else:
                    text = label.value

                rows.append((utt.idx, label.start, label_end, text))

            # The key is touched even for an empty label list, so that a
            # file is written for that label-list index as well.
            grouped[ll_idx].extend(rows)

    for ll_idx, rows in grouped.items():
        file_name = '{}_{}.txt'.format(LABEL_FILE_PREFIX, ll_idx)
        textfile.write_separated_lines(os.path.join(path, file_name),
                                       rows,
                                       separator=' ')
def write_tracks(file_path, corpus, path):
    """
    Write the list of tracks (track-idx and audio location) to ``file_path``.

    File tracks are listed with ``KaldiWriter.extended_filename``.
    Container tracks are first exported as 16-bit wav files into the
    ``audio`` folder below ``path`` and listed with the exported path.
    Tracks of any other type are ignored.

    Args:
        file_path (str): Path of the list file to write.
        corpus: Corpus whose tracks are written.
        path (str): Base folder; exported audio goes to ``<path>/audio``.
    """
    export_dir = os.path.join(path, 'audio')
    int16_max = np.iinfo(np.int16).max
    rows = []

    for trk in corpus.tracks.values():
        if isinstance(trk, tracks.FileTrack):
            rows.append([trk.idx, KaldiWriter.extended_filename(trk)])
            continue

        if not isinstance(trk, tracks.ContainerTrack):
            continue

        # The export folder is only created once a container track shows up.
        os.makedirs(export_dir, exist_ok=True)

        wav_path = os.path.join(export_dir, '{}.wav'.format(trk.idx))
        pcm = (trk.read_samples() * int16_max).astype(np.int16)
        scipy.io.wavfile.write(wav_path, trk.sampling_rate, pcm)
        rows.append([trk.idx, wav_path])

    textfile.write_separated_lines(file_path,
                                   rows,
                                   separator=' ',
                                   sort_by_column=0)
def _save(self, corpus, path):
    """
    Write the corpus as list files: ``all.lst`` plus one ``<subview-idx>.lst``
    for every subview that contains at least one utterance.

    Each record consists of utterance-idx, audio path, number of samples
    and transcript. The audio of an utterance is exported to
    ``<path>/audio/<utterance-idx>.wav`` (16 kHz, 16-bit) if the utterance
    does not span its whole track or is not sampled at 16 kHz; otherwise
    the path of the original track is referenced.

    Args:
        corpus: Corpus to save.
        path (str): Folder to write the list files and exported audio to.
    """
    records = []
    # Sets keep the per-utterance subview membership test below cheap.
    subset_utterance_ids = {
        idx: set(subset.utterances.keys())
        for idx, subset in corpus.subviews.items()
    }
    subset_records = collections.defaultdict(list)

    audio_folder = os.path.join(path, 'audio')
    os.makedirs(audio_folder, exist_ok=True)

    int16 = np.iinfo(np.int16)

    for utterance_idx in sorted(corpus.utterances.keys()):
        utterance = corpus.utterances[utterance_idx]

        # Export if the utterance is only a part of the track, or if the
        # sampling rate differs.
        # We force sr=16000, since this is expected from wav2letter
        export_audio = (
            utterance.start != 0
            or utterance.end != float('inf')
            or utterance.sampling_rate != 16000
        )

        if export_audio:
            audio_path = os.path.join(audio_folder,
                                      '{}.wav'.format(utterance.idx))
            data = utterance.read_samples(sr=16000)
            # NOTE(review): samples are presumably floats in [-1.0, 1.0] -
            # confirm. A sample of exactly 1.0 scales to 32768, which does
            # not fit into int16 and would wrap around, so the scaled
            # values are clipped to the int16 range before the cast.
            data = np.clip(data * 32768, int16.min, int16.max)
            data = data.astype(np.int16)
            num_samples = data.size
            scipy.io.wavfile.write(audio_path, 16000, data)
        else:
            audio_path = utterance.track.path
            num_samples = utterance.num_samples()

        transcript = utterance.label_lists[
            self.transcription_label_list_idx].join()

        # Add to the full list
        record = [utterance_idx, audio_path, num_samples, transcript]
        records.append(record)

        # Check / Add to subview lists
        for subset_idx, utt_ids in subset_utterance_ids.items():
            if utterance_idx in utt_ids:
                subset_records[subset_idx].append(record)

    # Write full list
    records_path = os.path.join(path, 'all.lst')
    textfile.write_separated_lines(records_path,
                                   records,
                                   separator=' ',
                                   sort_by_column=-1)

    # Write subset lists
    for subset_idx, subset_rows in subset_records.items():
        if len(subset_rows) > 0:
            subset_file_path = os.path.join(path,
                                            '{}.lst'.format(subset_idx))
            textfile.write_separated_lines(subset_file_path,
                                           subset_rows,
                                           separator=' ',
                                           sort_by_column=-1)
def test_write_separated_lines_sorted(self):
    """
    Check that a dict written with ``sort_by_column=1`` ends up ordered
    by its values.
    """
    data = {
        'hallo-0_103': 'hallo-0_1',
        'hallo-0_122': 'hallo-0',
        'hallo-0_1031': 'hallo-0_1',
        'hallo-0_1322': 'hallo-0',
        'hallo-0_1224': 'hallo-0'
    }

    f, path = tempfile.mkstemp(text=True)
    os.close(f)

    try:
        textfile.write_separated_lines(path,
                                       data,
                                       separator=' ',
                                       sort_by_column=1)

        with open(path, 'r') as f:
            value = f.read()
    finally:
        # mkstemp leaves deletion to the caller; don't leak the temp file.
        os.remove(path)

    lines = value.strip().split('\n')

    self.assertEqual(5, len(lines))

    # 'hallo-0_1' does not end with 'hallo-0', so this pins the order.
    expected_suffixes = ['hallo-0'] * 3 + ['hallo-0_1'] * 2

    for line, suffix in zip(lines, expected_suffixes):
        self.assertTrue(line.endswith(suffix))
def write_files(file_path, corpus, path):
    """
    Write the list of files (file-idx and file path) to ``file_path``.

    The file paths are stored relative to ``path``.

    Args:
        file_path (str): Path of the list file to write.
        corpus: Corpus whose files are written.
        path (str): Folder the stored paths are made relative to.
    """
    rows = []

    for entry in corpus.files.values():
        rel_path = os.path.relpath(entry.path, path)
        rows.append([entry.idx, rel_path])

    textfile.write_separated_lines(file_path,
                                   rows,
                                   separator=' ',
                                   sort_by_column=0)
def _download(self, target_path):
    """
    Download the sentence and audio lists, write the meta file and
    download the audio files.

    The list archives are downloaded and extracted in a ``temp`` folder
    inside ``target_path``. Only sentence ids that occur in both lists are
    kept. The temporary folder is removed when the method is left, also
    if one of the steps raises.

    Args:
        target_path (str): Folder to download the data to.
    """
    temp_path = os.path.join(target_path, 'temp')
    os.makedirs(temp_path, exist_ok=True)

    try:
        sentence_ark = os.path.join(temp_path, 'sentences.tar.bz2')
        sentence_list = os.path.join(temp_path, 'sentences.csv')
        audio_ark = os.path.join(temp_path, 'sentences_with_audio.tar.bz2')
        audio_list = os.path.join(temp_path, 'sentences_with_audio.csv')

        download.download_file(SENTENCE_LIST_URL, sentence_ark)
        download.download_file(AUDIO_LIST_URL, audio_ark)

        download.extract_tar(sentence_ark, temp_path)
        download.extract_tar(audio_ark, temp_path)

        audio_entries = self._load_audio_list(audio_list)
        sentences = self._load_sentence_list(sentence_list)

        valid_sentence_ids = set(audio_entries.keys()).intersection(
            set(sentences.keys()))

        # sent-id, username, lang, transcript
        all_records = [
            (k, audio_entries[k][0], sentences[k][0], sentences[k][1])
            for k in valid_sentence_ids
        ]

        meta_path = os.path.join(target_path, META_FILENAME)
        textfile.write_separated_lines(meta_path,
                                       all_records,
                                       separator='\t',
                                       sort_by_column=0)

        self._download_audio_files(all_records, target_path)
    finally:
        # Never leave the downloaded archives behind, not even on failure.
        shutil.rmtree(temp_path, ignore_errors=True)
def write_utt_to_issuer_mapping(utt_issuer_path, corpus):
    """
    Write the mapping utterance-idx -> issuer-idx to ``utt_issuer_path``.

    Utterances without an issuer are left out.

    Args:
        utt_issuer_path (str): Path of the mapping file to write.
        corpus: Corpus whose utterances are written.
    """
    mapping = {
        utt.idx: utt.issuer.idx
        for utt in corpus.utterances.values()
        if utt.issuer is not None
    }

    textfile.write_separated_lines(utt_issuer_path,
                                   mapping,
                                   separator=' ',
                                   sort_by_column=0)
def write_feature_containers(container_path, corpus):
    """
    Write the list of feature containers (container-idx and path) to
    ``container_path``.

    Args:
        container_path (str): Path of the list file to write.
        corpus: Corpus whose feature containers are written.
    """
    rows = []

    for container_idx, container in corpus.feature_containers.items():
        rows.append((container_idx, container.path))

    textfile.write_separated_lines(container_path, rows, separator=' ')
def _save(self, corpus, path):
    """
    Write the corpus as CSV lists: ``all.csv`` plus one
    ``<subview-idx>.csv`` for every subview that contains at least one
    utterance.

    Every list starts with the header ``wav_filename, wav_filesize,
    transcript``. Utterances with ``start == 0`` and ``end == -1``
    reference the original file; all others are exported as 16-bit wav
    to ``<path>/audio/<utterance-idx>.wav``.

    Args:
        corpus: Corpus to save.
        path (str): Folder to write the lists and exported audio to.
    """
    records = []
    # Sets keep the per-utterance subview membership test below cheap.
    subset_utterance_ids = {
        idx: set(subset.utterances.keys())
        for idx, subset in corpus.subviews.items()
    }
    subset_records = collections.defaultdict(list)

    audio_folder = os.path.join(path, 'audio')
    os.makedirs(audio_folder, exist_ok=True)

    int16 = np.iinfo(np.int16)

    for utterance_idx in sorted(corpus.utterances.keys()):
        utterance = corpus.utterances[utterance_idx]

        if utterance.start == 0 and utterance.end == -1:
            audio_path = utterance.file.path
        else:
            audio_path = os.path.join(audio_folder,
                                      '{}.wav'.format(utterance.idx))
            sampling_rate = utterance.sampling_rate
            data = utterance.read_samples()
            # NOTE(review): samples are presumably floats in [-1.0, 1.0] -
            # confirm. A sample of exactly 1.0 scales to 32768, which does
            # not fit into int16 and would wrap around, so the scaled
            # values are clipped to the int16 range before the cast.
            data = np.clip(data * 32768, int16.min, int16.max)
            data = data.astype(np.int16)
            scipy.io.wavfile.write(audio_path, sampling_rate, data)

        size = os.stat(audio_path).st_size
        transcript = utterance.label_lists[
            self.transcription_label_list_idx][0].value

        # Add to the full list
        record = [audio_path, size, transcript]
        records.append(record)

        # Check / Add to subview lists
        for subset_idx, utt_ids in subset_utterance_ids.items():
            if utterance_idx in utt_ids:
                subset_records[subset_idx].append(record)

    # Write full list
    records.insert(0, ['wav_filename', 'wav_filesize', 'transcript'])
    records_path = os.path.join(path, 'all.csv')
    textfile.write_separated_lines(records_path,
                                   records,
                                   separator=',',
                                   sort_by_column=-1)

    # Write subset lists
    for subset_idx, subset_rows in subset_records.items():
        if len(subset_rows) > 0:
            subset_rows.insert(
                0, ['wav_filename', 'wav_filesize', 'transcript'])
            subset_file_path = os.path.join(path,
                                            '{}.csv'.format(subset_idx))
            textfile.write_separated_lines(subset_file_path,
                                           subset_rows,
                                           separator=',',
                                           sort_by_column=-1)
def write_utterances(utterance_path, corpus):
    """
    Write the utterances (utterance-idx, file-idx, start, end) to
    ``utterance_path``.

    Args:
        utterance_path (str): Path of the utterance file to write.
        corpus: Corpus whose utterances are written.
    """
    rows = {}

    for utt in corpus.utterances.values():
        rows[utt.idx] = [utt.file.idx, utt.start, utt.end]

    textfile.write_separated_lines(utterance_path,
                                   rows,
                                   separator=' ',
                                   sort_by_column=0)
def write_file_tracks(file_path, corpus, path):
    """
    Write all file tracks (track-idx and file path) to ``file_path``.

    Only tracks of type ``tracks.FileTrack`` are written. The file paths
    are stored relative to ``path``.

    Args:
        file_path (str): Path of the list file to write.
        corpus: Corpus whose tracks are written.
        path (str): Folder the stored paths are made relative to.
    """
    rows = [
        [trk.idx, os.path.relpath(trk.path, path)]
        for trk in corpus.tracks.values()
        if isinstance(trk, tracks.FileTrack)
    ]

    textfile.write_separated_lines(file_path,
                                   rows,
                                   separator=' ',
                                   sort_by_column=0)
def write_container_tracks(audio_path, corpus, path):
    """
    Write all container tracks (track-idx, container path, key) to
    ``audio_path``.

    Only tracks of type ``tracks.ContainerTrack`` are written. The
    container paths are stored relative to ``path``.

    Args:
        audio_path (str): Path of the list file to write.
        corpus: Corpus whose tracks are written.
        path (str): Folder the stored paths are made relative to.
    """
    rows = {
        (trk.idx, os.path.relpath(trk.container.path, path), trk.key)
        for trk in corpus.tracks.values()
        if isinstance(trk, tracks.ContainerTrack)
    }

    textfile.write_separated_lines(audio_path,
                                   rows,
                                   separator=' ',
                                   sort_by_column=0)
def write_label_list(path, label_list):
    """
    Write the labels of ``label_list`` to an audacity label file.

    One tab-separated entry (start, end, value) is passed on per label.

    Args:
        path (str): Path to write the file to.
        label_list (audiomate.corpus.assets.LabelList): Label list
    """
    rows = [[lbl.start, lbl.end, lbl.value] for lbl in label_list]

    textfile.write_separated_lines(path, rows, separator='\t')
def _write_transcriptions(self, text_path, corpus):
    """
    Write the transcription of every utterance to ``text_path``.

    The transcription is built by joining the values of all labels of the
    main label list with spaces. Utterances that have no label list with
    index ``self.main_label_list_idx`` are left out.

    Args:
        text_path (str): Path of the transcription file to write.
        corpus: Corpus whose utterances are written.
    """
    texts = {}

    for utt in corpus.utterances.values():
        if self.main_label_list_idx not in utt.label_lists.keys():
            continue

        labels = utt.label_lists[self.main_label_list_idx]
        texts[utt.idx] = ' '.join([lbl.value for lbl in labels])

    textfile.write_separated_lines(text_path,
                                   texts,
                                   separator=' ',
                                   sort_by_column=0)
def _write_utt_to_issuer_mapping(self, utt_issuer_path, corpus):
    """
    Write the mapping utterance-idx -> issuer-idx to ``utt_issuer_path``.

    The utterance-idx is obtained via ``self._get_utt_idx``. An utterance
    without issuer is mapped to its own idx if
    ``self.use_utt_idx_if_no_speaker_available`` is set, and left out
    otherwise.

    Args:
        utt_issuer_path (str): Path of the mapping file to write.
        corpus: Corpus whose utterances are written.
    """
    mapping = {}

    for utt in corpus.utterances.values():
        key = self._get_utt_idx(utt)

        if utt.issuer is not None:
            mapping[key] = utt.issuer.idx
            continue

        if self.use_utt_idx_if_no_speaker_available:
            mapping[key] = key

    textfile.write_separated_lines(utt_issuer_path,
                                   mapping,
                                   separator=' ',
                                   sort_by_column=0)
def _save(self, corpus, path):
    """
    Write the corpus as CSV lists: ``all.csv`` plus one
    ``<subview-idx>.csv`` for every subview that contains at least one
    utterance.

    Unless ``self.no_audio_check`` is set, the corpus is first passed
    through ``self.converter.convert`` with ``<path>/audio`` as target.
    Every list starts with the header ``wav_filename, wav_filesize,
    transcript``.

    Args:
        corpus: Corpus to save.
        path (str): Folder to write the lists (and converted audio) to.
    """
    header = ['wav_filename', 'wav_filesize', 'transcript']

    audio_dir = os.path.join(path, 'audio')
    os.makedirs(audio_dir, exist_ok=True)

    # Convert all files
    if not self.no_audio_check:
        corpus = self.converter.convert(corpus, audio_dir)

    ids_per_subview = {}

    for subview_idx, subview in corpus.subviews.items():
        ids_per_subview[subview_idx] = set(subview.utterances.keys())

    all_rows = []
    rows_per_subview = collections.defaultdict(list)

    for utt_idx in sorted(corpus.utterances.keys()):
        utt = corpus.utterances[utt_idx]
        text = utt.label_lists[self.transcription_label_list_idx].join()
        wav_path = utt.track.path
        row = [wav_path, os.path.getsize(wav_path), text]
        all_rows.append(row)

        # Check / Add to subview lists
        for subview_idx, member_ids in ids_per_subview.items():
            if utt_idx in member_ids:
                rows_per_subview[subview_idx].append(row)

    # Write full list
    textfile.write_separated_lines(os.path.join(path, 'all.csv'),
                                   [header] + all_rows,
                                   separator=',',
                                   sort_by_column=-1)

    # Write subset lists
    for subview_idx, rows in rows_per_subview.items():
        if not rows:
            continue

        csv_path = os.path.join(path, '{}.csv'.format(subview_idx))
        textfile.write_separated_lines(csv_path,
                                       [header] + rows,
                                       separator=',',
                                       sort_by_column=-1)
def write_utterances(utterance_path, corpus):
    """
    Write the utterances (utterance-idx, track-idx, start, end) to
    ``utterance_path``.

    An end of ``float('inf')`` is stored as ``-1``.

    Args:
        utterance_path (str): Path of the utterance file to write.
        corpus: Corpus whose utterances are written.
    """
    rows = {
        utt.idx: [
            utt.track.idx,
            utt.start,
            -1 if utt.end == float('inf') else utt.end,
        ]
        for utt in corpus.utterances.values()
    }

    textfile.write_separated_lines(utterance_path,
                                   rows,
                                   separator=' ',
                                   sort_by_column=0)
def _write_genders(self, gender_path, corpus):
    """
    Write the gender (``m`` / ``f``) of every speaker to ``gender_path``.

    Only issuers that are speakers with gender ``MALE`` or ``FEMALE`` are
    considered. No file is written if there is no such speaker.

    Args:
        gender_path (str): Path of the gender file to write.
        corpus: Corpus whose issuers are written.
    """
    genders = {}

    for issuer in corpus.issuers.values():
        # isinstance instead of an exact type comparison, so that
        # subclasses of Speaker are not silently skipped.
        if not isinstance(issuer, issuers.Speaker):
            continue

        if issuer.gender == issuers.Gender.MALE:
            genders[issuer.idx] = 'm'
        elif issuer.gender == issuers.Gender.FEMALE:
            genders[issuer.idx] = 'f'

    if genders:
        textfile.write_separated_lines(gender_path,
                                       genders,
                                       separator=' ',
                                       sort_by_column=0)
def _save(self, corpus, path):
    """
    Write the corpus as list files: ``all.lst`` plus one ``<subview-idx>.lst``
    for every subview that contains at least one utterance.

    The corpus is first passed through ``self.converter.convert`` with
    ``<path>/audio`` as target. Each record consists of utterance-idx,
    audio path, number of samples (utterance duration times
    ``self.sampling_rate``) and transcript.

    Args:
        corpus: Corpus to save.
        path (str): Folder to write the lists and converted audio to.
    """
    target_audio_path = os.path.join(path, 'audio')
    os.makedirs(target_audio_path, exist_ok=True)

    # Convert all files
    corpus = self.converter.convert(corpus, target_audio_path)
    records = []

    # Sets keep the per-utterance subview membership test below cheap.
    subset_utterance_ids = {
        idx: set(subset.utterances.keys())
        for idx, subset in corpus.subviews.items()
    }
    subset_records = collections.defaultdict(list)

    for utterance_idx in sorted(corpus.utterances.keys()):
        utterance = corpus.utterances[utterance_idx]
        transcript = utterance.label_lists[
            self.transcription_label_list_idx].join()
        audio_path = utterance.track.path
        num_samples = int(utterance.duration * self.sampling_rate)

        # Add to the full list
        record = [utterance_idx, audio_path, num_samples, transcript]
        records.append(record)

        # Check / Add to subview lists
        for subset_idx, utt_ids in subset_utterance_ids.items():
            if utterance_idx in utt_ids:
                subset_records[subset_idx].append(record)

    # Write full list
    records_path = os.path.join(path, 'all.lst')
    textfile.write_separated_lines(records_path,
                                   records,
                                   separator=' ',
                                   sort_by_column=-1)

    # Write subset lists
    for subset_idx, subset_rows in subset_records.items():
        if len(subset_rows) > 0:
            subset_file_path = os.path.join(path,
                                            '{}.lst'.format(subset_idx))
            textfile.write_separated_lines(subset_file_path,
                                           subset_rows,
                                           separator=' ',
                                           sort_by_column=-1)
def write_segments(utterance_path, corpus):
    """
    Write the segments (utterance-idx, track-idx, start, end) to
    ``utterance_path``.

    The end is taken from ``end_abs`` of the utterance. If it is
    ``float('inf')``, ``-1`` is stored instead.

    Args:
        utterance_path (str): Path of the segments file to write.
        corpus: Corpus whose utterances are written.
    """
    rows = {}

    for utt in corpus.utterances.values():
        seg_end = utt.end_abs

        if seg_end == float('inf'):
            seg_end = -1

        rows[utt.idx] = [utt.track.idx, utt.start, seg_end]

    textfile.write_separated_lines(utterance_path,
                                   rows,
                                   separator=' ',
                                   sort_by_column=0)
def write_label_file(path, entries):
    """
    Writes an audacity label file. Start and end times are in seconds.

    The entries are handed to ``textfile.write_separated_lines`` with a
    tab as separator.

    Args:
        path (str): Path to write the file to.
        entries (list): List with entries to write.
                        (entries -> start (seconds), end (seconds), label)

    Example::

        >>> data = [
        ...     [0.0, 0.2, 'sie'],
        ...     [0.2, 2.2, 'hallo']
        ... ]
        >>>
        >>> write_label_file('/some/path/to/file.txt', data)
    """
    textfile.write_separated_lines(path, entries, separator='\t')
def write_file(path, entries):
    """
    Writes a ctm file.

    The entries are handed to ``textfile.write_separated_lines`` with a
    single space as separator.

    Args:
        path (str): Path to write the file to.
        entries (list): List with entries to write.
                        (entries -> wave-file, channel, start (seconds),
                        duration (seconds), label)

    Example::

        >>> data = [
        ...     ["wave-ab", '1', 0.0, 0.82, "duda"],
        ...     ["wave-xy", '1', 0.82, 0.57, "Jacques"],
        ... ]
        >>>
        >>> write_file('/path/to/file.txt', data)
    """
    textfile.write_separated_lines(path, entries, separator=' ')
def _write_segments(self, utterance_path, corpus):
    """
    Write the segments (utterance-idx, track-idx, start, end) to
    ``utterance_path``.

    The utterance-idx is obtained via ``self._get_utt_idx``. An end of
    ``float('inf')`` is replaced by ``end_abs`` of the utterance if
    ``self.use_absolute_times`` is set, and by ``-1`` otherwise.

    Args:
        utterance_path (str): Path of the segments file to write.
        corpus: Corpus whose utterances are written.
    """
    rows = {}

    for utt in corpus.utterances.values():
        key = self._get_utt_idx(utt)
        seg_end = utt.end

        if seg_end == float('inf'):
            seg_end = utt.end_abs if self.use_absolute_times else -1

        rows[key] = [utt.track.idx, utt.start, seg_end]

    textfile.write_separated_lines(utterance_path,
                                   rows,
                                   separator=' ',
                                   sort_by_column=0)