def read_genders(genders_path, corpus):
    """
    Read the speaker-gender file and import one speaker per entry.

    Args:
        genders_path: Path to a key/value file with one
            ``<speaker-idx> <gender-code>`` pair per line.
        corpus: Corpus the speakers are imported into.

    Does nothing if the file does not exist.
    """
    if not os.path.isfile(genders_path):
        return

    entries = textfile.read_key_value_lines(genders_path, separator=' ')

    for spk_idx, gender_code in entries.items():
        if gender_code == 'm':
            spk_gender = issuers.Gender.MALE
        else:
            # Every non-'m' code is mapped to FEMALE.
            spk_gender = issuers.Gender.FEMALE

        corpus.import_issuers(issuers.Speaker(spk_idx, gender=spk_gender))
def load_books_of_speaker(self, corpus, path, speaker):
    """
    Load all utterances for the speaker at the given path.

    Args:
        corpus: Corpus the utterances are added to.
        path: Path to the speaker folder containing book folders.
        speaker: Speaker instance, or ``None`` to create one speaker
            per utterance.

    Returns:
        list: Identifiers of all created utterances.
    """
    utt_ids = []

    for book_path in MailabsReader.get_folders(path):
        meta_path = os.path.join(book_path, 'metadata.csv')
        wavs_path = os.path.join(book_path, 'wavs')
        meta = textfile.read_separated_lines(meta_path, separator='|',
                                             max_columns=3)

        for record in meta:
            basename = record[0]
            raw_text = record[1]
            clean_text = record[2]

            if speaker is None:
                # No folder-level speaker: every utterance gets its own.
                # The speaker is imported even if the wav turns out to be
                # missing or invalid below.
                utt_idx = basename
                spk_idx = basename
                corpus.import_issuers(issuers.Speaker(basename))
            else:
                utt_idx = '{}-{}'.format(speaker.idx, basename)
                spk_idx = speaker.idx

            wav_path = os.path.join(wavs_path, '{}.wav'.format(basename))

            if not os.path.isfile(wav_path):
                continue

            if utt_idx in self.invalid_utterance_ids:
                continue

            corpus.new_file(wav_path, utt_idx)

            raw_ll = annotations.LabelList.create_single(
                raw_text, idx=audiomate.corpus.LL_WORD_TRANSCRIPT_RAW)
            clean_ll = annotations.LabelList.create_single(
                clean_text, idx=audiomate.corpus.LL_WORD_TRANSCRIPT)

            utterance = corpus.new_utterance(utt_idx, utt_idx, spk_idx)
            utterance.set_label_list(raw_ll)
            utterance.set_label_list(clean_ll)

            utt_ids.append(utterance.idx)

    return utt_ids
def _create_or_get_speech_issuer(corpus, file_idx, labels):
    """
    Create a Speaker for the given file and import it into the corpus.

    The gender is derived from the first label of the file
    ('m' -> male, 'f' -> female, anything else -> left unknown).

    Args:
        corpus: Corpus the speaker is imported into.
        file_idx: Identifier of the file, also used as speaker idx.
        labels: Dict mapping file-idx to its labels.

    Returns:
        str: ``file_idx`` if a speaker was created, ``None`` if the
        file has no entry in ``labels``.
    """
    if file_idx not in labels:
        return None

    issuer = issuers.Speaker(file_idx)

    # The guard above already guarantees the key exists, so the former
    # second ``file_idx in labels`` check was redundant and is removed.
    gender_code = labels[file_idx][0]

    if gender_code == 'm':
        issuer.gender = issuers.Gender.MALE
    elif gender_code == 'f':
        issuer.gender = issuers.Gender.FEMALE

    corpus.import_issuers(issuer)

    return file_idx
def parse_speaker_info(readme_path):
    """
    Parse speaker info and return a Speaker instance.

    Reads ``key:value`` lines from the README and extracts user name,
    gender, age range and native language. Unknown or missing fields
    keep their UNKNOWN/None defaults.

    Args:
        readme_path: Path to the speaker README file.

    Returns:
        issuers.Speaker: Speaker with the parsed attributes.
    """
    idx = None
    gender = issuers.Gender.UNKNOWN
    age_group = issuers.AgeGroup.UNKNOWN
    native_lang = None

    with open(readme_path, 'r', errors='ignore') as f:
        for raw_line in f:
            line = raw_line.strip()

            # str.strip never returns None, so only skip empty lines.
            if line == '':
                continue

            line = line.rstrip(';.')
            parts = line.split(':', maxsplit=1)

            if len(parts) <= 1:
                continue

            key = parts[0].strip().lower()
            value = parts[1].strip()

            if key == 'user name':
                # Keep the original casing for the speaker idx.
                idx = value

            # All further comparisons are case-insensitive.
            value = value.lower()

            if key == 'gender':
                if value in ['männlich', 'male', 'mnnlich']:
                    gender = issuers.Gender.MALE
                elif value in ['weiblich', 'female', '[female]']:
                    gender = issuers.Gender.FEMALE

            if key == 'age range':
                if value in ['erwachsener', 'adult',
                             '[adult]', '[erwachsener]']:
                    age_group = issuers.AgeGroup.ADULT
                elif value in ['senior', '[senior', '[senior]']:
                    # '[senior]' added: the list previously contained only
                    # the typo '[senior' (missing closing bracket), so
                    # bracketed senior values were never matched. The old
                    # literal is kept for backward compatibility.
                    age_group = issuers.AgeGroup.SENIOR
                elif value in ['youth', 'jugendlicher',
                               '[youth]', '[jugendlicher]']:
                    age_group = issuers.AgeGroup.YOUTH
                elif value in ['kind', 'child']:
                    age_group = issuers.AgeGroup.CHILD

            if key == 'language':
                if value in ['de', 'ger', 'deu', '[de]']:
                    native_lang = 'deu'
                elif value in ['en', 'eng', '[en]']:
                    native_lang = 'eng'

    return issuers.Speaker(idx, gender=gender,
                           age_group=age_group,
                           native_language=native_lang)
def load_speaker(corpus, path):
    """
    Create a speaker instance for the given path and import it.

    The folder name itself is used as the speaker idx; its parent
    folder name supplies the gender ('male'/'female', anything else
    stays unknown).

    Args:
        corpus: Corpus the speaker is imported into.
        path: Path of the speaker folder.

    Returns:
        issuers.Speaker: The imported speaker.
    """
    base_path, speaker_name = os.path.split(path)
    _, gender_desc = os.path.split(base_path)

    # Two further os.path.split calls on base_path were removed:
    # their results were never used (dead code).

    gender = issuers.Gender.UNKNOWN

    if gender_desc == 'male':
        gender = issuers.Gender.MALE
    elif gender_desc == 'female':
        gender = issuers.Gender.FEMALE

    speaker = issuers.Speaker(speaker_name, gender=gender)
    corpus.import_issuers(speaker)

    return speaker
def _load_folder(folder_entry, corpus):
    """ Load the given subfolder into the corpus (e.g. bed, one, ...) """
    # The folder name is the spoken command and becomes the transcript.
    command = folder_entry.name

    for wav_path in glob.glob(os.path.join(folder_entry.path, '*.wav')):
        base = os.path.splitext(os.path.basename(wav_path))[0]
        file_idx = '{}_{}'.format(base, command)
        issuer_idx = base.split('_', maxsplit=1)[0]

        corpus.new_file(wav_path, file_idx)

        if issuer_idx not in corpus.issuers.keys():
            corpus.import_issuers(issuers.Speaker(issuer_idx))

        utt = corpus.new_utterance(file_idx, file_idx, issuer_idx)
        utt.set_label_list(annotations.LabelList.create_single(
            command, idx=audiomate.corpus.LL_WORD_TRANSCRIPT))
def _load(self, path):
    """
    Build a corpus from all wav files under ``recordings/``.

    The file basename is split on '_': the first part is the digit
    label, the middle parts form the speaker idx.
    """
    corpus = audiomate.Corpus(path=path)
    pattern = os.path.join(path, 'recordings', '*.wav')

    for wav_path in glob.glob(pattern):
        name = os.path.splitext(os.path.basename(wav_path))[0]
        corpus.new_file(wav_path, name)

        parts = name.split('_')
        digit = parts[0]
        speaker_idx = '_'.join(parts[1:-1])

        if speaker_idx not in corpus.issuers.keys():
            corpus.import_issuers(issuers.Speaker(speaker_idx))

        utt = corpus.new_utterance(name, name, speaker_idx)
        utt.set_label_list(annotations.LabelList.create_single(
            str(digit), idx=audiomate.corpus.LL_WORD_TRANSCRIPT))

    return corpus
def create_assets_if_needed(corpus, path, entry):
    """ Create File/Utterance/Issuer, if they not already exist and return utt-idx. """
    file_name = entry[1]
    file_idx, _ = os.path.splitext(file_name)

    if file_idx in INVALID_UTTS:
        return None

    # Already created on a previous call - nothing to do.
    if file_idx in corpus.utterances.keys():
        return file_idx

    speaker_idx = entry[0]
    transcription = entry[2]

    # Age/gender columns are optional; fall back to UNKNOWN.
    if len(entry) >= 6:
        age = CommonVoiceReader.map_age(entry[5])
    else:
        age = issuers.AgeGroup.UNKNOWN

    if len(entry) >= 7:
        gender = CommonVoiceReader.map_gender(entry[6])
    else:
        gender = issuers.Gender.UNKNOWN

    corpus.new_file(os.path.join(path, 'clips', file_name), file_idx)

    if speaker_idx in corpus.issuers.keys():
        issuer = corpus.issuers[speaker_idx]
    else:
        issuer = issuers.Speaker(speaker_idx, gender=gender, age_group=age)
        corpus.import_issuers(issuer)

    utterance = corpus.new_utterance(file_idx, file_idx, issuer.idx)
    utterance.set_label_list(annotations.LabelList.create_single(
        transcription, idx=audiomate.corpus.LL_WORD_TRANSCRIPT))

    return file_idx
def _load(self, path):
    """
    Build a corpus from the speaker folders under ``data/``.

    One Speaker is created per folder (with gender/age-group from the
    meta data); the first underscore-separated part of each wav
    basename is used as the digit transcript.
    """
    corpus = audiomate.Corpus(path=path)
    data_path = os.path.join(path, 'data')
    meta_data = AudioMNISTReader.load_speaker_meta(path)

    for speaker_idx in os.listdir(data_path):
        speaker_path = os.path.join(data_path, speaker_idx)

        if not os.path.isdir(speaker_path):
            continue

        for wav_path in glob.glob(os.path.join(speaker_path, '*.wav')):
            file_idx = os.path.splitext(os.path.basename(wav_path))[0]
            corpus.new_file(wav_path, file_idx)

            digit = file_idx.split('_')[0]

            if speaker_idx not in corpus.issuers.keys():
                speaker = issuers.Speaker(
                    speaker_idx,
                    gender=AudioMNISTReader.get_gender(
                        meta_data, speaker_idx),
                    age_group=AudioMNISTReader.get_age_group(
                        meta_data, speaker_idx))
                corpus.import_issuers(speaker)

            utt = corpus.new_utterance(file_idx, file_idx, speaker_idx)
            utt.set_label_list(annotations.LabelList.create_single(
                str(digit), idx=audiomate.corpus.LL_WORD_TRANSCRIPT))

    return corpus
def generate_issuers(n, rand=None):
    """
    Generate ``n`` issuers of randomly chosen types
    (Speaker, Artist or plain Issuer).

    Args:
        n: Number of issuers to create.
        rand: Optional ``random.Random`` instance for reproducibility;
            a fresh one is created when omitted.

    Returns:
        list: The generated issuers.
    """
    rng = random.Random() if rand is None else rand
    generated = []

    for index in range(n):
        idx = 'issuer-{}'.format(index)
        choice = rng.randint(1, 3)

        if choice == 1:
            new_issuer = issuers.Speaker(idx,
                                         gender=issuers.Gender.UNKNOWN,
                                         age_group=issuers.AgeGroup.CHILD,
                                         native_language='de')
        elif choice == 2:
            new_issuer = issuers.Artist(idx, 'badam')
        else:
            new_issuer = issuers.Issuer(idx)

        generated.append(new_issuer)

    return generated
def load_file(self, folder_path, idx, corpus):
    """ Load speaker, file, utterance, labels for the file with the given id. """
    xml_path = os.path.join(folder_path, '{}.xml'.format(idx))
    wav_paths = []

    # Collect every existing wav variant (one candidate per suffix in
    # WAV_FILE_SUFFIXES), skipping utterances marked as invalid.
    for wav_suffix in WAV_FILE_SUFFIXES:
        wav_path = os.path.join(folder_path,
                                '{}_{}.wav'.format(idx, wav_suffix))
        wav_name = os.path.split(wav_path)[1]
        wav_idx = os.path.splitext(wav_name)[0]

        if os.path.isfile(
                wav_path) and wav_idx not in self.invalid_utterance_ids:
            wav_paths.append(wav_path)

    # Without any usable wav file there is nothing to load.
    if len(wav_paths) == 0:
        return []

    with open(xml_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Pull the metadata fields out of the raw xml text via the
    # module-level regex patterns.
    transcription = TudaReader.extract_value(text, TRANSCRIPTION_PATTERN,
                                             'transcription', xml_path)
    transcription_raw = TudaReader.extract_value(
        text, RAW_TRANSCRIPTION_PATTERN, 'raw_transcription', xml_path)
    gender = TudaReader.extract_value(text, GENDER_PATTERN, 'gender',
                                      xml_path)
    is_native = TudaReader.extract_value(text, NATIVE_PATTERN, 'native',
                                         xml_path)
    age_class = TudaReader.extract_value(text, AGE_PATTERN, 'age', xml_path)
    speaker_idx = TudaReader.extract_value(text, SPEAKER_IDX_PATTERN,
                                           'speaker_idx', xml_path)

    # Create the speaker only once across all files.
    if speaker_idx not in corpus.issuers.keys():
        # age_class is expected like '18-65' (dash-separated); the lower
        # bound decides the age group.
        start_age_class = int(age_class.split('-')[0])

        if start_age_class < 12:
            age_group = issuers.AgeGroup.CHILD
        elif start_age_class < 18:
            age_group = issuers.AgeGroup.YOUTH
        elif start_age_class < 65:
            age_group = issuers.AgeGroup.ADULT
        else:
            age_group = issuers.AgeGroup.SENIOR

        native_lang = None
        if is_native == 'Ja':
            native_lang = 'deu'

        issuer = issuers.Speaker(speaker_idx,
                                 gender=issuers.Gender(gender),
                                 age_group=age_group,
                                 native_language=native_lang)
        corpus.import_issuers(issuer)

    utt_ids = []

    # One file + utterance per wav variant; all variants share the same
    # transcript label lists.
    for wav_path in wav_paths:
        wav_name = os.path.split(wav_path)[1]
        wav_idx = os.path.splitext(wav_name)[0]

        corpus.new_file(wav_path, wav_idx)
        utt = corpus.new_utterance(wav_idx, wav_idx, speaker_idx)
        utt.set_label_list(
            annotations.LabelList.create_single(
                transcription, idx=audiomate.corpus.LL_WORD_TRANSCRIPT))
        utt.set_label_list(
            annotations.LabelList.create_single(
                transcription_raw,
                idx=audiomate.corpus.LL_WORD_TRANSCRIPT_RAW))
        utt_ids.append(wav_idx)

    return utt_ids
def load_file(folder_path, idx, corpus):
    """
    Load speaker, file, utterance, labels for the file with the given id.

    Args:
        folder_path: Folder containing the xml and wav files.
        idx: Identifier of the recording (basename of the xml file).
        corpus: Corpus the assets are added to.

    Returns:
        list: Identifiers of all created utterances
        (empty if no matching wav files exist).
    """
    xml_path = os.path.join(folder_path, '{}.xml'.format(idx))
    wav_paths = glob.glob(os.path.join(folder_path, '{}_*.wav'.format(idx)))

    if len(wav_paths) == 0:
        return []

    # Fix: open the xml via a context manager so the file handle is
    # closed again - it previously leaked (open() without close()).
    with open(xml_path, 'r', encoding='utf-8') as xml_file:
        soup = BeautifulSoup(xml_file, 'lxml')

    transcription = soup.recording.cleaned_sentence.string
    transcription_raw = soup.recording.sentence.string
    gender = soup.recording.gender.string
    is_native = soup.recording.muttersprachler.string
    age_class = soup.recording.ageclass.string
    speaker_idx = soup.recording.speaker_id.string

    # Create the speaker only once across all files.
    if speaker_idx not in corpus.issuers.keys():
        # age_class is dash-separated (e.g. '18-65'); the lower bound
        # decides the age group.
        start_age_class = int(age_class.split('-')[0])

        if start_age_class < 12:
            age_group = issuers.AgeGroup.CHILD
        elif start_age_class < 18:
            age_group = issuers.AgeGroup.YOUTH
        elif start_age_class < 65:
            age_group = issuers.AgeGroup.ADULT
        else:
            age_group = issuers.AgeGroup.SENIOR

        native_lang = None
        if is_native == 'Ja':
            native_lang = 'deu'

        issuer = issuers.Speaker(speaker_idx,
                                 gender=issuers.Gender(gender),
                                 age_group=age_group,
                                 native_language=native_lang)
        corpus.import_issuers(issuer)

    utt_ids = []

    # One file + utterance per wav; all share the same label lists.
    for wav_path in wav_paths:
        wav_name = os.path.split(wav_path)[1]
        wav_idx = os.path.splitext(wav_name)[0]

        corpus.new_file(wav_path, wav_idx)
        utt = corpus.new_utterance(wav_idx, wav_idx, speaker_idx)
        utt.set_label_list(
            annotations.LabelList.create_single(
                transcription, idx=audiomate.corpus.LL_WORD_TRANSCRIPT))
        utt.set_label_list(
            annotations.LabelList.create_single(
                transcription_raw,
                idx=audiomate.corpus.LL_WORD_TRANSCRIPT_RAW))
        utt_ids.append(wav_idx)

    return utt_ids
def _load(self, path):
    """
    Load the corpus from the given path.

    Walks the TEST/TRAIN partitions, creating one speaker per speaker
    folder and one file/utterance per wav, with raw-text, word and
    phone label lists. Each partition becomes a subview of the corpus.
    """
    corpus = audiomate.Corpus(path=path)

    for part in ['TEST', 'TRAIN']:
        part_path = os.path.join(path, part)
        # Collect the utterance-ids of this partition for the subview.
        part_utt_ids = set()

        for region in os.listdir(part_path):
            region_path = os.path.join(part_path, region)

            if os.path.isdir(region_path):
                for speaker_abbr in os.listdir(region_path):
                    speaker_path = os.path.join(region_path, speaker_abbr)
                    # Folder name: first char is the gender marker
                    # (M/F), the rest is the speaker idx.
                    speaker_idx = speaker_abbr[1:]

                    if speaker_idx not in corpus.issuers.keys():
                        issuer = issuers.Speaker(speaker_idx)

                        if speaker_abbr[:1] == 'M':
                            issuer.gender = issuers.Gender.MALE
                        elif speaker_abbr[:1] == 'F':
                            issuer.gender = issuers.Gender.FEMALE

                        corpus.import_issuers(issuer)

                    for wav_path in glob.glob(
                            os.path.join(speaker_path, '*.WAV')):
                        sentence_idx = os.path.splitext(
                            os.path.basename(wav_path))[0]
                        utt_idx = '{}-{}-{}'.format(
                            region, speaker_abbr, sentence_idx).lower()
                        part_utt_ids.add(utt_idx)

                        # Sidecar files next to the wav: .TXT holds the
                        # raw sentence, .WRD word alignments, .PHN
                        # phone alignments.
                        raw_text_path = os.path.join(
                            speaker_path, '{}.TXT'.format(sentence_idx))
                        # Row format: start, end, text; the text is in
                        # the third column of the single row.
                        raw_text = textfile.read_separated_lines(
                            raw_text_path, separator=' ',
                            max_columns=3)[0][2]
                        words_path = os.path.join(
                            speaker_path, '{}.WRD'.format(sentence_idx))
                        words = textfile.read_separated_lines(
                            words_path, separator=' ', max_columns=3)
                        phones_path = os.path.join(
                            speaker_path, '{}.PHN'.format(sentence_idx))
                        phones = textfile.read_separated_lines(
                            phones_path, separator=' ', max_columns=3)

                        corpus.new_file(wav_path, utt_idx)
                        utt = corpus.new_utterance(utt_idx, utt_idx,
                                                   speaker_idx)

                        raw_ll = annotations.LabelList.create_single(
                            raw_text,
                            idx=audiomate.corpus.LL_WORD_TRANSCRIPT_RAW)
                        utt.set_label_list(raw_ll)

                        word_ll = annotations.LabelList(
                            idx=audiomate.corpus.LL_WORD_TRANSCRIPT)

                        # Alignment times are in samples; divide by the
                        # 16000 sample-rate to get seconds.
                        for record in words:
                            start = int(record[0]) / 16000
                            end = int(record[1]) / 16000
                            word_ll.addl(record[2], start=start, end=end)

                        utt.set_label_list(word_ll)

                        phone_ll = annotations.LabelList(
                            idx=audiomate.corpus.LL_PHONE_TRANSCRIPT)

                        for record in phones:
                            start = int(record[0]) / 16000
                            end = int(record[1]) / 16000
                            phone_ll.addl(record[2], start=start, end=end)

                        utt.set_label_list(phone_ll)

        # Expose the partition as a subview over its utterances.
        utt_filter = subset.MatchingUtteranceIdxFilter(
            utterance_idxs=part_utt_ids)
        subview = subset.Subview(corpus, filter_criteria=[utt_filter])
        corpus.import_subview(part, subview)

    return corpus
def _load(self, path):
    """
    Build the corpus from all articles at the given path.

    Speakers are deduplicated by reader name, tracks by audio file
    path; each segment becomes one utterance with a word transcript.
    """
    corpus = audiomate.Corpus()
    article_paths = sorted(self.get_articles(path))

    # reader name -> Speaker, audio file path -> FileTrack
    speakers_by_name = {}
    tracks_by_path = {}

    for article_path in article_paths:
        audio_files = self.get_audio_file_info(article_path)
        reader_name, reader_gender = self.get_reader_info(article_path)
        segments = self.get_segments(article_path)

        if reader_name in speakers_by_name.keys():
            speaker = speakers_by_name[reader_name]
        else:
            # Speaker idx is a zero-padded running number.
            speaker = issuers.Speaker(
                '{:0>8}'.format(len(speakers_by_name)),
                gender=reader_gender
            )
            speakers_by_name[reader_name] = speaker
            corpus.import_issuers(speaker)

        for start, end, text in segments:
            file_path = self.find_audio_file_for_segment(start, end,
                                                         audio_files)

            if file_path is None:
                continue

            if file_path in tracks_by_path.keys():
                track = tracks_by_path[file_path]
            else:
                # Track idx is a zero-padded running number.
                track = tracks.FileTrack(
                    '{:0>10}'.format(len(tracks_by_path)),
                    file_path
                )
                tracks_by_path[file_path] = track
                corpus.import_tracks(track)

            # Segment times are absolute; shift them into the track.
            track_offset = audio_files[file_path]
            utt_start = start - track_offset
            utt_end = end - track_offset

            utt_idx = '{}_{}_{}_{}'.format(
                speaker.idx, track.idx,
                int(start * 1000), int(end * 1000)
            )

            if utt_idx in self.invalid_utterance_ids:
                continue

            utt = corpus.new_utterance(
                utt_idx, track.idx,
                issuer_idx=speaker.idx,
                start=utt_start, end=utt_end
            )
            utt.set_label_list(annotations.LabelList.create_single(
                text, audiomate.corpus.LL_WORD_TRANSCRIPT))

    return audiomate.Corpus.from_corpus(corpus)