def _load(self, path):
    corpus = audiomate.Corpus(path=path)

    meta_file = os.path.join(path, META_FILENAME)
    records = textfile.read_separated_lines_generator(meta_file,
                                                      separator='\t',
                                                      max_columns=4)

    for record in records:
        idx = record[0]
        speaker_idx = record[1]
        language = record[2]
        transcript = record[3]

        file_path = os.path.join(path, 'audio', language, '{}.mp3'.format(idx))
        corpus.new_file(file_path, idx)

        if speaker_idx not in corpus.issuers.keys():
            issuer = assets.Speaker(speaker_idx)
            corpus.import_issuers(issuer)

        utterance = corpus.new_utterance(idx, idx, speaker_idx)
        utterance.set_label_list(assets.LabelList.create_single(
            transcript,
            idx=audiomate.corpus.LL_WORD_TRANSCRIPT_RAW))

    return corpus
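# A minimal sketch of the meta file consumed above, assuming tab-separated
# records of (utterance id, speaker id, language, raw transcript); the concrete
# values are made up for illustration:
#
#   141 <TAB> spk-7 <TAB> eng <TAB> I have to go to sleep.
#
# For such a record the loader registers 'audio/eng/141.mp3' as file '141',
# creates speaker 'spk-7' on first sight and attaches the transcript as an
# LL_WORD_TRANSCRIPT_RAW label list.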
def read_issuers(file_path, corpus):
    if not os.path.isfile(file_path):
        return

    data = jsonfile.read_json_file(file_path)

    for issuer_idx, issuer_data in data.items():
        issuer_type = issuer_data.get('type', None)
        issuer_info = issuer_data.get('info', {})

        if issuer_type == 'speaker':
            gender = assets.Gender(issuer_data.get('gender', 'unknown').lower())
            age_group = assets.AgeGroup(issuer_data.get('age_group', 'unknown').lower())
            native_language = issuer_data.get('native_language', None)

            issuer = assets.Speaker(issuer_idx,
                                    gender=gender,
                                    age_group=age_group,
                                    native_language=native_language,
                                    info=issuer_info)
        elif issuer_type == 'artist':
            name = issuer_data.get('name', None)
            issuer = assets.Artist(issuer_idx, name=name, info=issuer_info)
        else:
            issuer = assets.Issuer(issuer_idx, info=issuer_info)

        corpus.import_issuers(issuer)
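# A sketch of the issuer JSON this parser expects, reconstructed from the keys
# accessed above (the ids and values are illustrative, not from a real corpus):
#
# {
#     "speaker-1": {
#         "type": "speaker",
#         "info": {"comment": "recorded outdoors"},
#         "gender": "male",
#         "age_group": "adult",
#         "native_language": "eng"
#     },
#     "artist-1": {
#         "type": "artist",
#         "name": "Some Band"
#     }
# }
#
# Entries without a recognized 'type' fall back to a plain assets.Issuer.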
def load_subset(corpus, path, subset_idx):
    """ Load subset into corpus. """
    csv_file = os.path.join(path, '{}.csv'.format(subset_idx))
    utt_ids = []

    for entry in textfile.read_separated_lines_generator(
            csv_file,
            separator=',',
            max_columns=8,
            ignore_lines_starting_with=['filename']):
        rel_file_path = entry[0]
        filename = os.path.split(rel_file_path)[1]
        basename = os.path.splitext(filename)[0]
        transcription = entry[1]
        age = CommonVoiceReader.map_age(entry[4])
        gender = CommonVoiceReader.map_gender(entry[5])
        idx = '{}-{}'.format(subset_idx, basename)
        file_path = os.path.join(path, rel_file_path)

        corpus.new_file(file_path, idx)
        issuer = assets.Speaker(idx, gender=gender, age_group=age)
        corpus.import_issuers(issuer)

        utterance = corpus.new_utterance(idx, idx, issuer.idx)
        utterance.set_label_list(assets.LabelList.create_single(
            transcription,
            idx=audiomate.corpus.LL_WORD_TRANSCRIPT))

        utt_ids.append(idx)

    utt_filter = subset.MatchingUtteranceIdxFilter(utterance_idxs=set(utt_ids))
    subview = subset.Subview(corpus, filter_criteria=[utt_filter])
    corpus.import_subview(subset_idx, subview)
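# The per-subset CSV read above ('{subset_idx}.csv') is assumed to start with a
# 'filename' header row (skipped via ignore_lines_starting_with) and to carry
# the relative clip path in column 0, the transcription in column 1, the age
# bucket in column 4 and the gender in column 5, matching the Common Voice v1
# layout (filename, text, up_votes, down_votes, age, gender, accent, duration).
# An illustrative row (values are made up):
#
#   cv-valid-train/sample-000000.mp3,learn to recognize omens,1,0,twenties,female,england,
#
# map_age/map_gender translate the raw CSV values into audiomate AgeGroup and
# Gender values; the remaining columns are ignored by this loader.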
def load_file(folder_path, idx, corpus):
    """ Load speaker, file, utterance and labels for the file with the given id. """
    xml_path = os.path.join(folder_path, '{}.xml'.format(idx))
    wav_path = os.path.join(folder_path, '{}{}.wav'.format(idx, WAV_SUFFIX))

    # Parse the metadata XML that accompanies every recording.
    with open(xml_path, 'r', encoding='utf-8') as xml_file:
        soup = BeautifulSoup(xml_file, 'lxml')

    transcription = soup.recording.cleaned_sentence.string
    transcription_raw = soup.recording.sentence.string
    gender = soup.recording.gender.string
    is_native = soup.recording.muttersprachler.string
    age_class = soup.recording.ageclass.string
    speaker_idx = soup.recording.speaker_id.string

    if speaker_idx not in corpus.issuers.keys():
        # The age class is given as a range (e.g. '21-30');
        # map its lower bound onto the audiomate age groups.
        start_age_class = int(age_class.split('-')[0])

        if start_age_class < 12:
            age_group = assets.AgeGroup.CHILD
        elif start_age_class < 18:
            age_group = assets.AgeGroup.YOUTH
        elif start_age_class < 65:
            age_group = assets.AgeGroup.ADULT
        else:
            age_group = assets.AgeGroup.SENIOR

        native_lang = None

        # 'Muttersprachler' (native speaker) is answered with 'Ja'/'Nein'.
        if is_native == 'Ja':
            native_lang = 'deu'

        issuer = assets.Speaker(speaker_idx,
                                gender=assets.Gender(gender),
                                age_group=age_group,
                                native_language=native_lang)
        corpus.import_issuers(issuer)

    corpus.new_file(wav_path, idx)

    utt = corpus.new_utterance(idx, idx, speaker_idx)
    utt.set_label_list(assets.LabelList.create_single(
        transcription, idx=audiomate.corpus.LL_WORD_TRANSCRIPT))
    utt.set_label_list(assets.LabelList.create_single(
        transcription_raw, idx=audiomate.corpus.LL_WORD_TRANSCRIPT_RAW))
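# A sketch of the per-recording XML this loader parses, assuming only the tag
# names accessed above (the content is illustrative):
#
# <recording>
#     <speaker_id>1023</speaker_id>
#     <gender>male</gender>
#     <ageclass>21-30</ageclass>
#     <muttersprachler>Ja</muttersprachler>
#     <sentence>Heute ist schönes Wetter.</sentence>
#     <cleaned_sentence>heute ist schoenes wetter</cleaned_sentence>
# </recording>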
def _create_or_get_speech_issuer(corpus, file_idx, annotations):
    if file_idx not in annotations:
        return None

    issuer = assets.Speaker(file_idx)

    # The first annotation field holds the speaker's gender ('m'/'f').
    if annotations[file_idx][0] == 'm':
        issuer.gender = assets.Gender.MALE
    elif annotations[file_idx][0] == 'f':
        issuer.gender = assets.Gender.FEMALE

    corpus.import_issuers(issuer)

    return file_idx
def parse_speaker_info(readme_path):
    """ Parse the speaker info from the given README and return a Speaker. """
    idx = None
    gender = assets.Gender.UNKNOWN
    age_group = assets.AgeGroup.UNKNOWN
    native_lang = None

    with open(readme_path, 'r', errors='ignore') as f:
        for raw_line in f:
            line = raw_line.strip()

            if line != '':
                line = line.rstrip(';.')
                parts = line.split(':', maxsplit=1)

                if len(parts) > 1:
                    key = parts[0].strip().lower()
                    value = parts[1].strip()

                    if key == 'user name':
                        idx = value

                    value = value.lower()

                    if key == 'gender':
                        # 'mnnlich' covers 'männlich' when the umlaut byte
                        # was dropped by errors='ignore'.
                        if value in ['männlich', 'male', 'mnnlich']:
                            gender = assets.Gender.MALE
                        elif value in ['weiblich', 'female', '[female]']:
                            gender = assets.Gender.FEMALE

                    if key == 'age range':
                        if value in ['erwachsener', 'adult', '[adult]', '[erwachsener]']:
                            age_group = assets.AgeGroup.ADULT
                        elif value in ['senior', '[senior]']:
                            age_group = assets.AgeGroup.SENIOR
                        elif value in ['youth', 'jugendlicher', '[youth]', '[jugendlicher]']:
                            age_group = assets.AgeGroup.YOUTH
                        elif value in ['kind', 'child']:
                            age_group = assets.AgeGroup.CHILD

                    if key == 'language':
                        if value in ['de', 'ger', 'deu', '[de]']:
                            native_lang = 'deu'
                        elif value in ['en', 'eng', '[en]']:
                            native_lang = 'eng'

    return assets.Speaker(idx, gender=gender, age_group=age_group,
                          native_language=native_lang)
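# A sketch of the kind of Voxforge-style README lines this parser understands,
# assuming 'key: value' pairs (the values below are illustrative); gender, age
# range and language accept English and German spellings, with or without
# surrounding brackets:
#
#   User Name: anonymous-20140301
#   Gender: männlich
#   Age Range: Erwachsener
#   Language: de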
def _load(self, path):
    corpus = audiomate.Corpus(path=path)

    for file_path in glob.glob(os.path.join(path, 'recordings', '*.wav')):
        file_idx = os.path.splitext(os.path.basename(file_path))[0]
        corpus.new_file(file_path, file_idx)

        idx_parts = file_idx.split('_')
        digit = idx_parts[0]
        issuer_idx = '_'.join(idx_parts[1:-1])

        if issuer_idx not in corpus.issuers.keys():
            issuer = assets.Speaker(issuer_idx)
            corpus.import_issuers(issuer)

        utterance = corpus.new_utterance(file_idx, file_idx, issuer_idx)
        utterance.set_label_list(assets.LabelList.create_single(
            str(digit), idx=audiomate.corpus.LL_WORD_TRANSCRIPT))

    return corpus
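# The loader above assumes recording names of the form
# '{digit}_{speaker}_{take}.wav' (as used by the Free Spoken Digit Dataset),
# e.g. '7_jackson_32.wav': the leading part becomes the word transcript and
# everything between the first and last underscore becomes the speaker id.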
def _load(self, path):
    corpus = audiomate.Corpus(path=path)

    for part in ['TEST', 'TRAIN']:
        part_path = os.path.join(path, part)
        part_utt_ids = set()

        for region in os.listdir(part_path):
            region_path = os.path.join(part_path, region)

            if os.path.isdir(region_path):
                for speaker_abbr in os.listdir(region_path):
                    speaker_path = os.path.join(region_path, speaker_abbr)
                    speaker_idx = speaker_abbr[1:]

                    if speaker_idx not in corpus.issuers.keys():
                        issuer = assets.Speaker(speaker_idx)

                        if speaker_abbr[:1] == 'M':
                            issuer.gender = assets.Gender.MALE
                        elif speaker_abbr[:1] == 'F':
                            issuer.gender = assets.Gender.FEMALE

                        corpus.import_issuers(issuer)

                    for wav_path in glob.glob(os.path.join(speaker_path, '*.WAV')):
                        sentence_idx = os.path.splitext(os.path.basename(wav_path))[0]
                        utt_idx = '{}-{}-{}'.format(region, speaker_abbr, sentence_idx).lower()
                        part_utt_ids.add(utt_idx)

                        raw_text_path = os.path.join(speaker_path, '{}.TXT'.format(sentence_idx))
                        raw_text = textfile.read_separated_lines(
                            raw_text_path, separator=' ', max_columns=3)[0][2]

                        words_path = os.path.join(speaker_path, '{}.WRD'.format(sentence_idx))
                        words = textfile.read_separated_lines(
                            words_path, separator=' ', max_columns=3)

                        phones_path = os.path.join(speaker_path, '{}.PHN'.format(sentence_idx))
                        phones = textfile.read_separated_lines(
                            phones_path, separator=' ', max_columns=3)

                        corpus.new_file(wav_path, utt_idx)
                        utt = corpus.new_utterance(utt_idx, utt_idx, speaker_idx)

                        raw_ll = assets.LabelList.create_single(
                            raw_text, idx=audiomate.corpus.LL_WORD_TRANSCRIPT_RAW)
                        utt.set_label_list(raw_ll)

                        word_ll = assets.LabelList(idx=audiomate.corpus.LL_WORD_TRANSCRIPT)

                        for record in words:
                            start = int(record[0]) / 16000
                            end = int(record[1]) / 16000
                            word_ll.append(assets.Label(record[2], start=start, end=end))

                        utt.set_label_list(word_ll)

                        phone_ll = assets.LabelList(idx=audiomate.corpus.LL_PHONE_TRANSCRIPT)

                        for record in phones:
                            start = int(record[0]) / 16000
                            end = int(record[1]) / 16000
                            phone_ll.append(assets.Label(record[2], start=start, end=end))

                        utt.set_label_list(phone_ll)

        part_filter = subset.MatchingUtteranceIdxFilter(utterance_idxs=part_utt_ids)
        subview = subset.Subview(corpus, filter_criteria=[part_filter])
        corpus.import_subview(part, subview)

    return corpus
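# Directory layout assumed by the loader above (the standard TIMIT structure,
# with speaker folders prefixed by the gender letter; names illustrative):
#
#   TRAIN/DR1/FCJF0/SA1.WAV    audio
#   TRAIN/DR1/FCJF0/SA1.TXT    raw sentence transcript
#   TRAIN/DR1/FCJF0/SA1.WRD    word boundaries
#   TRAIN/DR1/FCJF0/SA1.PHN    phone boundaries
#
# Start/end columns in the .WRD/.PHN files are sample indices at 16 kHz, hence
# the division by 16000 when building the word and phone label lists.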
def create_dataset():
    temp_path = tempfile.mkdtemp()
    ds = audiomate.Corpus(temp_path)

    wav_1_path = sample_wav_file('wav_1.wav')
    wav_2_path = sample_wav_file('wav_2.wav')
    wav_3_path = sample_wav_file('wav_3.wav')
    wav_4_path = sample_wav_file('wav_4.wav')

    file_1 = ds.new_file(wav_1_path, file_idx='wav-1')
    file_2 = ds.new_file(wav_2_path, file_idx='wav_2')
    file_3 = ds.new_file(wav_3_path, file_idx='wav_3')
    file_4 = ds.new_file(wav_4_path, file_idx='wav_4')

    issuer_1 = assets.Speaker('spk-1', gender=assets.Gender.MALE)
    issuer_2 = assets.Speaker('spk-2', gender=assets.Gender.FEMALE)
    issuer_3 = assets.Issuer('spk-3')
    ds.import_issuers([issuer_1, issuer_2, issuer_3])

    utt_1 = ds.new_utterance('utt-1', file_1.idx, issuer_idx=issuer_1.idx)
    utt_2 = ds.new_utterance('utt-2', file_2.idx, issuer_idx=issuer_1.idx)
    utt_3 = ds.new_utterance('utt-3', file_3.idx, issuer_idx=issuer_2.idx, start=0, end=1.5)
    utt_4 = ds.new_utterance('utt-4', file_3.idx, issuer_idx=issuer_2.idx, start=1.5, end=2.5)
    utt_5 = ds.new_utterance('utt-5', file_4.idx, issuer_idx=issuer_3.idx)

    utt_1.set_label_list(assets.LabelList(
        audiomate.corpus.LL_WORD_TRANSCRIPT,
        labels=[assets.Label('who am i')]))
    utt_2.set_label_list(assets.LabelList(
        audiomate.corpus.LL_WORD_TRANSCRIPT,
        labels=[assets.Label('who are you', meta={'a': 'hey', 'b': 2})]))
    utt_3.set_label_list(assets.LabelList(
        audiomate.corpus.LL_WORD_TRANSCRIPT,
        labels=[assets.Label('who is he')]))
    utt_4.set_label_list(assets.LabelList(
        audiomate.corpus.LL_WORD_TRANSCRIPT,
        labels=[assets.Label('who are they')]))
    utt_5.set_label_list(assets.LabelList(
        audiomate.corpus.LL_WORD_TRANSCRIPT,
        labels=[assets.Label('who is she')]))

    train_filter = subview.MatchingUtteranceIdxFilter(utterance_idxs={'utt-1', 'utt-2', 'utt-3'})
    sv_train = subview.Subview(ds, filter_criteria=[train_filter])
    dev_filter = subview.MatchingUtteranceIdxFilter(utterance_idxs={'utt-4', 'utt-5'})
    sv_dev = subview.Subview(ds, filter_criteria=[dev_filter])
    ds.import_subview('train', sv_train)
    ds.import_subview('dev', sv_dev)

    ds.new_feature_container('mfcc', '/some/dummy/path')
    ds.new_feature_container('mel', '/some/dummy/path_mel')

    return ds
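# A short usage sketch of the fixture above, assuming the usual audiomate
# accessors (corpus.subviews, view.utterances, utterance.label_lists):
#
# ds = create_dataset()
# train = ds.subviews['train']
#
# for utt in train.utterances.values():
#     transcript = utt.label_lists[audiomate.corpus.LL_WORD_TRANSCRIPT]
#     print(utt.idx, ' '.join(label.value for label in transcript.labels))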