Example #1
def extract_segment_infos(index_file, transcript_file, src_rate, language):
    # segmentation = collect_segmentation(segmentation_file)
    speeches = collect_speeches(index_file)
    transcript = Path(transcript_file).read_text(encoding='utf-8')

    # merge information from index file (speech parts) with segmentation information
    segment_infos = []
    for speech_meta in speeches:
        start_text = speech_meta['start_text']
        end_text = speech_meta['end_text'] + 1  # odd indexing: end index is inclusive, hence the +1
        speech_transcript = normalize(transcript[start_text:end_text], language)
        if len(speech_transcript.strip()) == 0:
            continue

        segment_infos.append({
            'start_frame': resample_frame(speech_meta['start_frame'], src_rate=src_rate),
            'end_frame': resample_frame(speech_meta['end_frame'], src_rate=src_rate),
            'transcript': speech_transcript
        })

    return segment_infos
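
# Note: the snippet above assumes a `resample_frame` helper that maps frame
# indices from the source sample rate to the target rate. A minimal sketch,
# assuming the 16 kHz target rate used throughout these examples:
def resample_frame(frame, src_rate, dst_rate=16000):
    """Map a frame index recorded at src_rate to the equivalent index at dst_rate."""
    return int(frame * dst_rate / src_rate)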
Example #2
def extract_segment_infos(segments_file, transcript_file):
    transcripts = {}
    with open(transcript_file, 'r') as f_transcript:
        for line in f_transcript:
            segment_id, transcript = line.split(' ', 1)
            transcripts[segment_id] = transcript.rstrip('\n')

    line_pattern = re.compile(
        r'(?P<segment_id>.*)\s(?P<segment_start>.*)\s(?P<segment_end>.*)\n')

    segment_infos = []
    with open(segments_file, 'r') as f_segments:
        for line in f_segments:
            result = line_pattern.search(line)
            if result:
                segment_id = result.group('segment_id')

                segment_infos.append({
                    'start_frame': seconds_to_frame(result.group('segment_start')),
                    'end_frame': seconds_to_frame(result.group('segment_end')),
                    'transcript': normalize(transcripts[segment_id], 'en')
                    if segment_id in transcripts else ''
                })
    return segment_infos
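
# Note: `seconds_to_frame` is not shown above. A minimal sketch, assuming it
# converts the string timestamps from the segments file into sample indices
# at the 16 kHz rate used elsewhere:
def seconds_to_frame(seconds, rate=16000):
    """Convert a timestamp in seconds (read as a string) to a sample index."""
    return int(float(seconds) * rate)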
Example #3
def preprocess(audio_path,
               transcript_path,
               language=None,
               norm_transcript=False):
    """
    Pipeline Stage 1: Preprocessing
    This stage prepares the input by converting it to the expected format and normalizing it

    :param audio_path: path to a MP3 or WAV file containing the speech recording
    :param transcript_path: path to a text file containing the transcript for the audio file
    :param language: (optional) hint for a language. If not set, the language is detected from the transcript.
    :param norm_transcript: normalize transcript
    :return:
        raw audio bytes: the audio samples as a byte array (mono, PCM-16)
        sample rate: number of samples per second (usually 16,000)
        transcript: normalized transcript (normalization depends on language!)
        language: inferred language (if argument was omitted), else unchanged argument
    """
    print('PIPELINE STAGE #1 (preprocessing): '
          'Converting audio to 16-bit PCM wave and normalizing transcript')
    extension = splitext(audio_path)[-1]
    if extension not in ['.wav', '.mp3']:
        raise ValueError('can only handle MP3 and WAV files!')

    if extension == '.mp3':
        print(f'converting {audio_path}')
        tmp_file = 'tmp.wav'
        to_wav(audio_path, tmp_file)
        audio_bytes, rate = read_pcm16_wave(tmp_file)
        remove(tmp_file)
    else:
        audio_bytes, rate = read_pcm16_wave(audio_path)

    if rate != 16000:
        print(f'Resampling from {rate} Hz to 16,000 Hz mono')
        # audio, rate = librosa.load(audio_path, sr=16000, mono=True)
        tmp_file = 'tmp.wav'
        to_wav(audio_path, tmp_file)
        # write_pcm16_wave(tmp_file, audio, rate)
        audio_bytes, rate = read_pcm16_wave(tmp_file)
        remove(tmp_file)

    with open(transcript_path, 'r') as f:
        transcript = f.read()
        if norm_transcript:
            transcript = normalize(transcript, language)

    if not language:
        language = langdetect.detect(transcript)
        print(f'detected language from transcript: {language}')

    print(f'STAGE #1 COMPLETED: got {len(audio_bytes)} bytes of audio '
          f'and {len(transcript)} characters of transcript')
    return audio_bytes, rate, transcript, language
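
# Note: `read_pcm16_wave` and `to_wav` are used above but not shown. Minimal
# sketches, assuming the reader wraps the standard-library `wave` module and
# the converter shells out to ffmpeg (both signatures are assumptions):
import subprocess
import wave


def read_pcm16_wave(wav_path):
    """Read a PCM-16 WAV file and return (raw audio bytes, sample rate)."""
    with wave.open(wav_path, 'rb') as wav:
        if wav.getsampwidth() != 2:
            raise ValueError('expected 16-bit PCM')
        return wav.readframes(wav.getnframes()), wav.getframerate()


def to_wav(src_path, dst_path, rate=16000):
    """Convert any ffmpeg-readable audio file to 16-bit mono PCM WAV at `rate`."""
    subprocess.run([
        'ffmpeg', '-y', '-i', src_path, '-ac', '1', '-ar', str(rate),
        '-acodec', 'pcm_s16le', dst_path
    ], check=True)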
Example #4
    def test_normalize(self):
        assert_that(normalize('Foo', 'en'), is_('foo'),
                    'normalization should make text lowercase')
        assert_that(normalize('Foo, bar', 'en'), is_('foo bar'),
                    'normalization should remove punctuation')
        assert_that(normalize('$Foo, bar!', 'en'), is_('foo bar'),
                    'normalization should remove special characters')

        assert_that(normalize('Färöer Straße', 'en'), is_('faroer strasse'),
                    'English should only consider ASCII')
        assert_that(normalize('Färöer Straße', 'de'), is_('färöer strasse'),
                    'German should also consider umlauts')

        assert_that(normalize("won't doesn't", 'en'), is_("won't doesn't"),
                    'English should keep apostrophe')
        assert_that(normalize("won't doesn't", 'de'), is_("won t doesn t"),
                    'German should remove apostrophe')
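
# Note: the assertions above pin down the contract for `normalize`. A sketch
# that satisfies exactly these test cases (the real implementation may differ):
import re
import unicodedata


def normalize(text, language):
    """Lowercase, fold ß to ss, handle umlauts/apostrophes per language,
    drop remaining punctuation and collapse whitespace."""
    text = text.lower().replace('ß', 'ss')
    if language == 'de':
        # German keeps umlauts but treats apostrophes as separators
        text = text.replace("'", ' ')
        kept = re.sub(r'[^a-zäöü\s]', ' ', text)
    else:
        # English folds accented characters to ASCII and keeps apostrophes
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
        kept = re.sub(r"[^a-z'\s]", ' ', text)
    return re.sub(r'\s+', ' ', kept).strip()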
Example #5
def main(args):
    print(create_args_str(args))
    target_dir, keras_path, lm_path, vocab_path, gpu = setup(args)
    print(f'all results will be written to {target_dir}')

    lm = load_lm(lm_path) if lm_path else None
    vocab = load_vocab(vocab_path) if vocab_path else None

    corpus = get_corpus('rl', 'de')
    corpus.summary()
    test_entries = list(set(segment.entry for segment in corpus.test_set()))
    # add 6 entries from PodClub corpus
    corpus = get_corpus('pc', 'de')
    corpus.summary()
    test_entries += [
        corpus['record1058'], corpus['record1063'], corpus['record1076'],
        corpus['record1523'], corpus['record1548'], corpus['record1556']
    ]
    stats = []
    for i, entry in enumerate(test_entries):
        print(f'entry {i + 1}/{len(test_entries)}')
        audio_file = entry.audio_path
        sample_rate = entry.rate
        with open(entry.transcript_path, encoding='utf-8') as f:
            transcript = f.read()
            if args.norm_transcript:
                transcript = normalize(transcript, 'de')

        demo_id = splitext(basename(audio_file))[0]
        target_dir_entry = join(target_dir, demo_id)
        makedirs(target_dir_entry, exist_ok=True)

        voiced_segments = [
            Voice(s.audio, s.rate, s.start_frame, s.end_frame) for s in entry
        ]
        df_alignments = pipeline(voiced_segments=voiced_segments,
                                 sample_rate=sample_rate,
                                 transcript=transcript,
                                 language='de',
                                 keras_path=keras_path,
                                 lm=lm,
                                 vocab=vocab,
                                 force_realignment=args.force_realignment,
                                 align_endings=args.align_endings,
                                 target_dir=target_dir_entry)

        df_stats = calculate_stats(df_alignments, keras_path, transcript)

        # calculate average similarity between Keras alignments and original alignments
        original_alignments = [s.transcript for s in entry.segments]
        av_similarity = np.mean([
            levenshtein_similarity(ka, oa)
            for ka, oa in zip(df_alignments['alignment'], original_alignments)
        ])
        df_stats['similarity'] = av_similarity
        create_demo_files(target_dir_entry, audio_file, transcript,
                          df_alignments, df_stats)

        stats.append(df_stats)

    df_keras = pd.concat(stats)
    csv_keras = join(target_dir, 'performance.csv')
    df_keras.to_csv(csv_keras)
    print(f'summary saved to {csv_keras}')

    visualize_pipeline_performance(csv_keras, csv_ds=None, silent=True)
    update_index(target_dir,
                 lang='de',
                 num_aligned=len(test_entries),
                 df_keras=df_keras,
                 keras_path=keras_path,
                 lm_path=lm_path,
                 vocab_path=vocab_path)
    K.clear_session()
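
# Note: `levenshtein_similarity` is used above to compare alignments against
# the reference transcripts. A self-contained sketch, assuming the common
# definition of one minus the normalized edit distance:
def levenshtein_similarity(a, b):
    """Return a similarity score in [0, 1]: 1 - edit_distance / max(len(a), len(b))."""
    if not a and not b:
        return 1.0
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return 1.0 - prev[-1] / max(len(a), len(b))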
Example #6
def create_segments(source_dir, target_dir, limit):
    audio_root = join(source_dir, 'audio')
    books_root = join(source_dir, 'books')

    chapters_file = find_file_by_suffix(audio_root, 'CHAPTERS.TXT')
    chapters = collect_chapter_meta(chapters_file)

    books = collect_book_texts(books_root)

    directories = [
        root for root, subdirs, files in walk(audio_root)
        if not subdirs and basename(root) in chapters
    ][:limit]
    progress = tqdm(directories,
                    total=len(directories),
                    file=sys.stderr,
                    unit='entries')

    segments = []
    for source_dir in progress:
        progress.set_description(f'{source_dir:100}')

        chapter_id = basename(source_dir)
        speaker_id = basename(abspath(join(source_dir, pardir)))

        if chapter_id not in chapters:
            print(
                f'WARNING: chapter {chapter_id} unknown or not in train-clean-xxx. Skipping corpus entry...'
            )
            continue

        book_id = chapters[chapter_id]['book_id']

        if not book_id:
            print(
                f'WARNING: no book information available for chapter {chapter_id}. Skipping corpus entry...'
            )
            continue

        if book_id not in books:
            print(
                f'WARNING: no book text available for chapter {chapter_id}. Skipping corpus entry...'
            )
            continue

        segments_file = find_file_by_suffix(
            source_dir, f'{speaker_id}-{chapter_id}.seg.txt')
        if not segments_file:
            print(
                f'no segmentation file found for chapter {chapter_id}. Skipping corpus entry...'
            )
            continue

        transcript_file = find_file_by_suffix(
            source_dir, f'{speaker_id}-{chapter_id}.trans.txt')
        if not transcript_file:
            print(
                f'no transcript file found for chapter {chapter_id}. Skipping corpus entry...'
            )
            continue

        mp3_file = find_file_by_suffix(source_dir, f'{chapter_id}.mp3')
        if not mp3_file:
            print(f'no MP3 file found for chapter {chapter_id}. Skipping corpus entry...')
            continue

        segment_infos = extract_segment_infos(segments_file, transcript_file)
        crop_start, crop_end = crop_segments(segment_infos)

        # resample audio if necessary
        wav_file = join(target_dir, basename(splitext(mp3_file)[0] + ".wav"))
        if not exists(wav_file) or args.overwrite:  # args comes from the enclosing script's global scope
            resample(mp3_file, wav_file, crop_start, crop_end)

        # write full transcript
        with open(join(target_dir, f'{chapter_id}.txt'), 'w') as f:
            book_text = normalize(books[book_id], 'en')
            first_transcript = segment_infos[0]['transcript']
            if first_transcript in book_text:
                text_start = book_text.index(first_transcript)
            else:
                text_start = 0
                # find the longest prefix of the first transcript that occurs in the book text
                for i in range(1, len(first_transcript) - 1):
                    if first_transcript[:i] not in book_text:
                        text_start = book_text.index(first_transcript[:i - 1])
                        break

            last_transcript = segment_infos[-1]['transcript']
            if last_transcript in book_text:
                text_end = book_text.index(last_transcript) + len(
                    last_transcript)
            else:
                # find the longest suffix of the last transcript that occurs in the book text
                text_end = len(book_text) - 1
                for i in range(1, len(last_transcript) - 1):
                    if last_transcript[-i:] not in book_text:
                        text_end = book_text.index(
                            last_transcript[-i + 1:]) + i - 1
                        break

            f.write(book_text[text_start:text_end])

        # create segments
        for segment_info in segment_infos:
            entry_id = chapter_id
            subset = chapters[chapter_id]['subset']
            audio_file = basename(wav_file)
            start_frame = segment_info['start_frame']
            end_frame = segment_info['end_frame']
            transcript = segment_info['transcript']
            duration = (end_frame - start_frame) / 16000
            numeric = contains_numeric(transcript)
            segments.append([
                entry_id, subset, 'en', audio_file, start_frame, end_frame,
                duration, transcript, numeric
            ])

    columns = [
        'entry_id', 'subset', 'language', 'audio_file', 'start_frame',
        'end_frame', 'duration', 'transcript', 'numeric'
    ]
    return pd.DataFrame(segments, columns=columns)
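
# Note: `find_file_by_suffix` and `crop_segments` are used above but not
# shown. Plausible sketches only: the crop helper is assumed to rebase the
# segment frames relative to the cropped audio, which matches how `resample`
# and the segment rows are used above.
from os import walk
from os.path import join


def find_file_by_suffix(directory, suffix):
    """Return the path of the first file under `directory` ending in `suffix`, else None."""
    for root, _, files in walk(directory):
        for file_name in files:
            if file_name.endswith(suffix):
                return join(root, file_name)
    return None


def crop_segments(segment_infos):
    """Compute the frame span covered by all segments and shift the segments
    so that their frames are relative to the start of the cropped audio."""
    crop_start = min(s['start_frame'] for s in segment_infos)
    crop_end = max(s['end_frame'] for s in segment_infos)
    for s in segment_infos:
        s['start_frame'] -= crop_start
        s['end_frame'] -= crop_start
    return crop_start, crop_end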