def extract_segment_infos(index_file, transcript_file, src_rate, language):
    # segmentation = collect_segmentation(segmentation_file)
    speeches = collect_speeches(index_file)
    transcript = Path(transcript_file).read_text(encoding='utf-8')

    # merge information from index file (speech parts) with segmentation information
    segment_infos = []
    for speech_meta in speeches:
        start_text = speech_meta['start_text']
        end_text = speech_meta['end_text'] + 1  # end index is inclusive, hence the odd +1
        speech_transcript = normalize(transcript[start_text:end_text], language)
        if len(speech_transcript.strip()) == 0:
            continue
        segment_infos.append({
            'start_frame': resample_frame(speech_meta['start_frame'], src_rate=src_rate),
            'end_frame': resample_frame(speech_meta['end_frame'], src_rate=src_rate),
            'transcript': speech_transcript
        })
    return segment_infos
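# The resample_frame helper used above is not shown here. A minimal sketch,
# assuming it maps a frame index recorded at src_rate onto the 16'000 Hz
# target rate used throughout the corpus (default rate and rounding are
# assumptions, not the project's actual implementation):
def resample_frame(src_frame, src_rate, target_rate=16000):
    """Convert a frame index from src_rate to target_rate."""
    return int(src_frame * target_rate / src_rate)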
def extract_segment_infos(segments_file, transcript_file):
    transcripts = {}
    with open(transcript_file, 'r') as f_transcript:
        for line in f_transcript.readlines():
            segment_id, transcript = line.split(' ', 1)
            transcripts[segment_id] = transcript.replace('\n', '')

    line_pattern = re.compile(r'(?P<segment_id>.*)\s(?P<segment_start>.*)\s(?P<segment_end>.*)\n')
    segment_infos = []
    with open(segments_file, 'r') as f_segments:
        for line in f_segments.readlines():
            result = line_pattern.search(line)
            if result:
                segment_id = result.group('segment_id')
                segment_infos.append({
                    'start_frame': seconds_to_frame(result.group('segment_start')),
                    'end_frame': seconds_to_frame(result.group('segment_end')),
                    'transcript': normalize(transcripts[segment_id], 'en') if segment_id in transcripts else ''
                })
    return segment_infos
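# seconds_to_frame is assumed but not shown. A minimal sketch, assuming the
# segment start/end times are strings in seconds and frames are counted at
# the 16'000 Hz target rate (the default rate is an assumption):
def seconds_to_frame(seconds, sample_rate=16000):
    """Convert a time in seconds (string or float) to a frame index."""
    return int(float(seconds) * sample_rate)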
def preprocess(audio_path, transcript_path, language=None, norm_transcript=False):
    """
    Pipeline Stage 1: Preprocessing
    This stage prepares the input by converting it to the expected format and normalizing it

    :param audio_path: path to an MP3 or WAV file containing the speech recording
    :param transcript_path: path to a text file containing the transcript for the audio file
    :param language: (optional) hint for a language. If not set, the language is detected from the transcript.
    :param norm_transcript: whether to normalize the transcript
    :return:
        raw audio bytes: the audio samples as byte array (mono, PCM-16)
        sample rate: number of samples per second (usually 16'000)
        transcript: normalized transcript (normalization depends on language!)
        language: inferred language (if the argument was omitted), else the unchanged argument
    """
    print("""PIPELINE STAGE #1 (preprocessing): Converting audio to 16-bit PCM wave and normalizing transcript""")

    extension = splitext(audio_path)[-1]
    if extension not in ['.wav', '.mp3']:
        raise ValueError('ERROR: can only handle MP3 and WAV files!')

    if extension == '.mp3':
        print(f'converting {audio_path}')
        tmp_file = 'tmp.wav'
        to_wav(audio_path, tmp_file)
        audio_bytes, rate = read_pcm16_wave(tmp_file)
        remove(tmp_file)
    else:
        audio_bytes, rate = read_pcm16_wave(audio_path)

    if rate != 16000:
        print(f"Resampling from {rate} Hz to 16'000 Hz/mono")
        # audio, rate = librosa.load(audio_path, sr=16000, mono=True)
        tmp_file = 'tmp.wav'
        to_wav(audio_path, tmp_file)
        # write_pcm16_wave(tmp_file, audio, rate)
        audio_bytes, rate = read_pcm16_wave(tmp_file)
        remove(tmp_file)

    with open(transcript_path, 'r') as f:
        transcript = f.read()
        if norm_transcript:
            transcript = normalize(transcript, language)

    if not language:
        language = langdetect.detect(transcript)
        print(f'detected language from transcript: {language}')

    print(f"""STAGE #1 COMPLETED: Got {len(audio_bytes)} audio samples and {len(transcript)} labels""")
    return audio_bytes, rate, transcript, language
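# read_pcm16_wave is assumed but not shown. A minimal sketch based on the
# stdlib wave module; the project's actual helper may additionally validate
# sample width and channel count:
import wave

def read_pcm16_wave(path):
    """Read a PCM-16 WAV file and return its raw sample bytes and sample rate."""
    with wave.open(path, 'rb') as wav_file:
        rate = wav_file.getframerate()
        audio_bytes = wav_file.readframes(wav_file.getnframes())
    return audio_bytes, rate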
def test_normalize(self):
    assert_that(normalize('Foo', 'en'), is_('foo'), 'normalization should make text lowercase')
    assert_that(normalize('Foo, bar', 'en'), is_('foo bar'), 'normalization should remove punctuation')
    assert_that(normalize('$Foo, bar!', 'en'), is_('foo bar'), 'normalization should remove special characters')
    assert_that(normalize('Färöer Straße', 'en'), is_('faroer strasse'), 'English should only consider ASCII')
    assert_that(normalize('Färöer Straße', 'de'), is_('färöer strasse'), 'German should also consider umlauts')
    assert_that(normalize("won't doesn't", 'en'), is_("won't doesn't"), 'English should keep apostrophe')
    assert_that(normalize("won't doesn't", 'de'), is_("won t doesn t"), 'German should remove apostrophe')
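# A normalize() consistent with these tests might look as follows. This is a
# hedged sketch (normalize_sketch is a hypothetical name; the real
# implementation may differ): lowercase everything, fold to ASCII for English,
# keep umlauts for German, and treat apostrophes per language.
import re
from unidecode import unidecode

def normalize_sketch(text, language):
    text = text.lower().replace('ß', 'ss')
    if language == 'en':
        text = unidecode(text)                   # Färöer -> faroer
        text = re.sub(r"[^a-z' ]", ' ', text)    # keep apostrophes
    else:
        text = text.replace("'", ' ')            # German: drop apostrophes
        text = re.sub(r'[^a-zäöü ]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()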
def main(args):
    print(create_args_str(args))
    target_dir, keras_path, lm_path, vocab_path, gpu = setup(args)
    print(f'all results will be written to {target_dir}')

    lm = load_lm(lm_path) if lm_path else None
    vocab = load_vocab(vocab_path) if vocab_path else None

    corpus = get_corpus('rl', 'de')
    corpus.summary()
    test_entries = list(set(segment.entry for segment in corpus.test_set()))

    # add 6 entries from PodClub corpus
    corpus = get_corpus('pc', 'de')
    corpus.summary()
    test_entries += [corpus['record1058'], corpus['record1063'], corpus['record1076'],
                     corpus['record1523'], corpus['record1548'], corpus['record1556']]

    stats = []
    for i, entry in enumerate(test_entries):
        print(f'entry {i + 1}/{len(test_entries)}')
        audio_file = entry.audio_path
        sample_rate = entry.rate

        with open(entry.transcript_path, encoding='utf-8') as f:
            transcript = f.read()
            if args.norm_transcript:
                transcript = normalize(transcript, 'de')

        demo_id = splitext(basename(audio_file))[0]
        target_dir_entry = join(target_dir, demo_id)
        if not exists(target_dir_entry):
            makedirs(target_dir_entry)

        voiced_segments = [Voice(s.audio, s.rate, s.start_frame, s.end_frame) for s in entry]
        df_alignments = pipeline(voiced_segments=voiced_segments,
                                 sample_rate=sample_rate,
                                 transcript=transcript,
                                 language='de',
                                 keras_path=keras_path,
                                 lm=lm,
                                 vocab=vocab,
                                 force_realignment=args.force_realignment,
                                 align_endings=args.align_endings,
                                 target_dir=target_dir_entry)

        df_stats = calculate_stats(df_alignments, keras_path, transcript)

        # calculate average similarity between Keras alignments and original alignments
        original_alignments = [s.transcript for s in entry.segments]
        av_similarity = np.mean([levenshtein_similarity(ka, oa)
                                 for ka, oa in zip(df_alignments['alignment'], original_alignments)])
        df_stats['similarity'] = av_similarity

        create_demo_files(target_dir_entry, audio_file, transcript, df_alignments, df_stats)
        stats.append(df_stats)

    df_keras = pd.concat(stats)
    csv_keras = join(target_dir, 'performance.csv')
    df_keras.to_csv(csv_keras)
    print(f'summary saved to {csv_keras}')

    visualize_pipeline_performance(csv_keras, csv_ds=None, silent=True)
    update_index(target_dir, lang='de', num_aligned=len(test_entries),
                 df_keras=df_keras, keras_path=keras_path,
                 lm_path=lm_path, vocab_path=vocab_path)
    K.clear_session()
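# levenshtein_similarity is used above but not shown. A minimal sketch,
# assuming similarity is defined as 1 - edit_distance / max(len(a), len(b)):
def levenshtein_similarity(a, b):
    """Normalized similarity between two strings based on edit distance."""
    m, n = len(a), len(b)
    if max(m, n) == 0:
        return 1.0
    prev = list(range(n + 1))
    for i in range(1, m + 1):
        curr = [i] + [0] * n
        for j in range(1, n + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            curr[j] = min(prev[j] + 1,         # deletion
                          curr[j - 1] + 1,     # insertion
                          prev[j - 1] + cost)  # substitution
        prev = curr
    return 1 - prev[n] / max(m, n)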
def create_segments(source_dir, target_dir, limit):
    audio_root = join(source_dir, 'audio')
    books_root = join(source_dir, 'books')

    chapters_file = find_file_by_suffix(audio_root, 'CHAPTERS.TXT')
    chapters = collect_chapter_meta(chapters_file)
    books = collect_book_texts(books_root)

    directories = [root for root, subdirs, files in walk(audio_root)
                   if not subdirs and basename(root) in chapters.keys()][:limit]
    progress = tqdm(directories, total=min(len(directories), limit or math.inf), file=sys.stderr, unit='entries')

    segments = []
    for chapter_dir in progress:
        progress.set_description(f'{chapter_dir:{100}}')

        chapter_id = basename(chapter_dir)
        speaker_id = basename(abspath(join(chapter_dir, pardir)))

        if chapter_id not in chapters:
            print(f'WARNING: chapter {chapter_id} unknown or not in train-clean-xxx. Skipping corpus entry...')
            continue

        book_id = chapters[chapter_id]['book_id']
        if not book_id:
            print(f'WARNING: no book information available for chapter {chapter_id}. Skipping corpus entry...')
            continue

        if book_id not in books:
            print(f'WARNING: no book text available for chapter {chapter_id}. Skipping corpus entry...')
            continue

        segments_file = find_file_by_suffix(chapter_dir, f'{speaker_id}-{chapter_id}.seg.txt')
        if not segments_file:
            print(f'no segmentation found in {chapter_dir}. Skipping corpus entry...')
            continue

        transcript_file = find_file_by_suffix(chapter_dir, f'{speaker_id}-{chapter_id}.trans.txt')
        if not transcript_file:
            print(f'no transcript found in {chapter_dir}. Skipping corpus entry...')
            continue

        mp3_file = find_file_by_suffix(chapter_dir, f'{chapter_id}.mp3')
        if not mp3_file:
            print(f'no MP3 file found in {chapter_dir}. Skipping corpus entry...')
            continue

        segment_infos = extract_segment_infos(segments_file, transcript_file)
        crop_start, crop_end = crop_segments(segment_infos)

        # resample audio if necessary
        wav_file = join(target_dir, basename(splitext(mp3_file)[0] + '.wav'))
        if not exists(wav_file) or args.overwrite:
            resample(mp3_file, wav_file, crop_start, crop_end)

        # write full transcript
        with open(join(target_dir, f'{chapter_id}.txt'), 'w') as f:
            book_text = normalize(books[book_id], 'en')

            first_transcript = segment_infos[0]['transcript']
            if first_transcript in book_text:
                text_start = book_text.index(first_transcript)
            else:
                # try to find the first transcript by searching for the longest matching prefix
                text_start = 0
                for i in range(1, len(first_transcript) - 1):
                    if first_transcript[:i] not in book_text:
                        text_start = book_text.index(first_transcript[:i - 1])
                        break

            last_transcript = segment_infos[-1]['transcript']
            if last_transcript in book_text:
                text_end = book_text.index(last_transcript) + len(last_transcript)
            else:
                # try to find the last transcript by searching for the longest matching suffix
                text_end = len(book_text) - 1
                for i in range(1, len(last_transcript) - 1):
                    if last_transcript[-i:] not in book_text:
                        text_end = book_text.index(last_transcript[-i + 1:]) + i - 1
                        break

            f.write(book_text[text_start:text_end])

        # create segments
        for segment_info in segment_infos:
            entry_id = chapter_id
            subset = chapters[chapter_id]['subset']
            audio_file = basename(wav_file)
            start_frame = segment_info['start_frame']
            end_frame = segment_info['end_frame']
            transcript = segment_info['transcript']
            duration = (end_frame - start_frame) / 16000
            numeric = contains_numeric(transcript)
            segments.append([entry_id, subset, 'en', audio_file, start_frame, end_frame,
                             duration, transcript, numeric])

    columns = ['entry_id', 'subset', 'language', 'audio_file', 'start_frame', 'end_frame', 'duration',
               'transcript', 'numeric']
    return pd.DataFrame(segments, columns=columns)
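# crop_segments is used above but not shown. A plausible sketch, assuming it
# returns the frame range covered by speech and shifts all segment boundaries
# so they are relative to the cropped audio (this behavior is an assumption):
def crop_segments(segment_infos):
    crop_start = min(info['start_frame'] for info in segment_infos)
    crop_end = max(info['end_frame'] for info in segment_infos)
    for info in segment_infos:
        info['start_frame'] -= crop_start
        info['end_frame'] -= crop_start
    return crop_start, crop_end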