Example #1
def segment_and_asr(audio_path):
    """
    Reads the audio file, calculates log-probabilities of tokens using a trained nemo model and segments the audio file into smaller sections

    :returns
    log_probabilities of tokens in individual frames
    step size of frames (in seconds)
    vocabulary (list of tokens)
    list of segments in the audio file (start, end in seconds)
    """

    sr = sox.file_info.sample_rate(audio_path)
    nsamples = np.round(sox.file_info.duration(audio_path) * sr)
    _, log_prob, vocab = infer(model_path, [audio_path], 1)

    # Samples per acoustic-model output frame.
    step = np.round(nsamples / len(log_prob[0]))

    log_prob_step_sec = step / sr
    # Run VAD on the input file
    frames = read_frames_from_file(audio_path,
                                   model_format,
                                   frame_duration_ms=20)
    split = vad_split(frames, model_format, threshold=0.5, aggressiveness=2)
    segments = []
    for i, segment in enumerate(split):
        segment_buffer, time_start, time_end = segment
        time_length = time_end - time_start
        if stt_min_duration_ms and time_length < stt_min_duration_ms:
            print('Fragment {}: Audio too short for STT'.format(i))
            continue
        if stt_max_duration_ms and time_length > stt_max_duration_ms:
            print('Fragment {}: Audio too long for STT'.format(i))
            continue
        segments.append((time_start, time_end))

    return log_prob[0], log_prob_step_sec, vocab, segments
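
A minimal usage sketch for the example above (the audio path is
hypothetical; the module-level model_path, model_format and
stt_*_duration_ms globals used by the snippet must already be set):

log_probs, step_sec, vocab, segments = segment_and_asr('speech.wav')
print('{} frames, {:.4f}s per frame, {} tokens, {} VAD segments'.format(
    len(log_probs), step_sec, len(vocab), len(segments)))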
Example #2
def main():
    # Debug helpers
    logging.basicConfig()
    logging.root.setLevel(args.loglevel if args.loglevel else logging.INFO)

    def progress(it=None, desc='Processing', total=None):
        logging.info(desc)
        return it if args.no_progress else log_progress(
            it, interval=args.progress_interval, total=total)

    def resolve(base_path, spec_path):
        if spec_path is None:
            return None
        if not path.isabs(spec_path):
            spec_path = path.join(base_path, spec_path)
        return spec_path

    def exists(file_path):
        if file_path is None:
            return False
        return os.path.isfile(file_path)

    to_prepare = []

    def enqueue_or_fail(audio, tlog, script, aligned, prefix=''):
        if exists(aligned) and not args.force:
            fail(
                prefix +
                'Alignment file "{}" already exists - use --force to overwrite'
                .format(aligned))
        if tlog is None:
            if args.ignore_missing:
                return
            fail(prefix + 'Missing transcription log path')
        if not exists(audio) and not exists(tlog):
            if args.ignore_missing:
                return
            fail(prefix +
                 'Both audio file "{}" and transcription log "{}" are missing'.
                 format(audio, tlog))
        if not exists(script):
            if args.ignore_missing:
                return
            fail(prefix + 'Missing script "{}"'.format(script))
        to_prepare.append((audio, tlog, script, aligned))

    if (args.audio or
            args.tlog) and args.script and args.aligned and not args.catalog:
        enqueue_or_fail(args.audio, args.tlog, args.script, args.aligned)
    elif args.catalog:
        if not exists(args.catalog):
            fail('Unable to load catalog file "{}"'.format(args.catalog))
        catalog = path.abspath(args.catalog)
        catalog_dir = path.dirname(catalog)
        with open(catalog, 'r', encoding='utf-8') as catalog_file:
            catalog_entries = json.load(catalog_file)
        for entry in progress(catalog_entries, desc='Reading catalog'):
            enqueue_or_fail(
                resolve(catalog_dir, entry['audio']),
                resolve(catalog_dir, entry['tlog']),
                resolve(catalog_dir, entry['script']),
                resolve(catalog_dir, entry['aligned']),
                prefix='Problem loading catalog "{}" - '.format(catalog))
    else:
        fail(
            'You have to specify either the combination "--audio/--tlog, --script, --aligned" or "--catalog"'
        )

    logging.debug('Start')

    to_align = []
    output_graph_path = None
    for audio_path, tlog_path, script_path, aligned_path in to_prepare:
        if not exists(tlog_path):
            generated_scorer = False
            if output_graph_path is None:
                logging.debug(
                    'Looking for model files in "{}"...'.format(model_dir))
                output_graph_path = glob(model_dir + "/*.pbmm")[0]
                lang_scorer_path = glob(model_dir + "/*.scorer")[0]
            kenlm_path = 'dependencies/kenlm/build/bin'
            if not path.exists(kenlm_path):
                kenlm_path = None
            deepspeech_path = 'dependencies/deepspeech'
            if not path.exists(deepspeech_path):
                deepspeech_path = None
            if kenlm_path and deepspeech_path and not args.stt_no_own_lm:
                tc = read_script(script_path)
                if not tc.clean_text.strip():
                    logging.error('Cleaned transcript is empty for {}'.format(
                        path.basename(script_path)))
                    continue
                clean_text_path = script_path + '.clean'
                with open(clean_text_path, 'w',
                          encoding='utf-8') as clean_text_file:
                    clean_text_file.write(tc.clean_text)

                scorer_path = script_path + '.scorer'
                if not path.exists(scorer_path):
                    # Generate LM
                    data_lower, vocab_str = convert_and_filter_topk(
                        scorer_path, clean_text_path, 500000)
                    build_lm(scorer_path, kenlm_path, 5, '85%', '0|0|1', True,
                             255, 8, 'trie', data_lower, vocab_str)
                    os.remove(scorer_path + '.lower.txt.gz')
                    os.remove(scorer_path + '.lm.arpa')
                    os.remove(scorer_path + '.lm_filtered.arpa')
                    os.remove(clean_text_path)

                    # Generate scorer
                    create_bundle(alphabet_path,
                                  scorer_path + '.lm.binary',
                                  scorer_path + '.vocab-500000.txt',
                                  scorer_path, False, 0.931289039105002,
                                  1.1834137581510284)
                    os.remove(scorer_path + '.lm.binary')
                    os.remove(scorer_path + '.vocab-500000.txt')

                    generated_scorer = True
            else:
                scorer_path = lang_scorer_path

            logging.debug(
                'Loading acoustic model from "{}", alphabet from "{}" and scorer from "{}"...'
                .format(output_graph_path, alphabet_path, scorer_path))

            # Run VAD on the input file
            logging.debug('Transcribing VAD segments...')
            frames = read_frames_from_file(audio_path, model_format,
                                           args.audio_vad_frame_length)
            segments = vad_split(frames,
                                 model_format,
                                 num_padding_frames=args.audio_vad_padding,
                                 threshold=args.audio_vad_threshold,
                                 aggressiveness=args.audio_vad_aggressiveness)

            def pre_filter():
                for i, segment in enumerate(segments):
                    segment_buffer, time_start, time_end = segment
                    time_length = time_end - time_start
                    if args.stt_min_duration and time_length < args.stt_min_duration:
                        logging.info(
                            'Fragment {}: Audio too short for STT'.format(i))
                        continue
                    if args.stt_max_duration and time_length > args.stt_max_duration:
                        logging.info(
                            'Fragment {}: Audio too long for STT'.format(i))
                        continue
                    yield (time_start, time_end,
                           np.frombuffer(segment_buffer, dtype=np.int16))

            samples = list(progress(pre_filter(), desc='VAD splitting'))

            pool = multiprocessing.Pool(initializer=init_stt,
                                        initargs=(output_graph_path,
                                                  scorer_path),
                                        processes=args.stt_workers)
            transcripts = list(
                progress(pool.imap(stt, samples),
                         desc='Transcribing',
                         total=len(samples)))

            fragments = []
            for time_start, time_end, segment_transcript in transcripts:
                if segment_transcript is None:
                    continue
                fragments.append({
                    'start': time_start,
                    'end': time_end,
                    'transcript': segment_transcript
                })
            logging.debug('Excluded {} empty transcripts'.format(
                len(transcripts) - len(fragments)))

            logging.debug(
                'Writing transcription log to file "{}"...'.format(tlog_path))
            with open(tlog_path, 'w', encoding='utf-8') as tlog_file:
                tlog_file.write(
                    json.dumps(fragments,
                               indent=4 if args.output_pretty else None,
                               ensure_ascii=False))

            # Remove scorer if generated
            if generated_scorer:
                os.remove(scorer_path)
        if not path.isfile(tlog_path):
            fail('Problem loading transcript from "{}"'.format(tlog_path))
        to_align.append((tlog_path, script_path, aligned_path))

    total_fragments = 0
    dropped_fragments = 0
    reasons = Counter()

    index = 0
    pool = multiprocessing.Pool(processes=args.align_workers)
    for aligned_file, file_total_fragments, file_dropped_fragments, file_reasons in \
            progress(pool.imap_unordered(align, to_align), desc='Aligning', total=len(to_align)):
        if args.no_progress:
            index += 1
            logging.info(
                'Aligned file {} of {} - wrote results to "{}"'.format(
                    index, len(to_align), aligned_file))
        total_fragments += file_total_fragments
        dropped_fragments += file_dropped_fragments
        reasons += file_reasons

    logging.info('Aligned {} fragments'.format(total_fragments))
    if total_fragments > 0 and dropped_fragments > 0:
        logging.info('Dropped {} fragments ({:0.2f}%):'.format(
            dropped_fragments, dropped_fragments * 100.0 / total_fragments))
        for key, number in reasons.most_common():
            logging.info(' - {}: {}'.format(key, number))
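
The --catalog branch above expects a JSON file holding a list of entries
with "audio", "tlog", "script" and "aligned" keys; relative paths are
resolved against the catalog's own directory. A sketch that writes such a
catalog (all file names are hypothetical):

import json

entries = [
    {
        # Paths are relative to the directory containing catalog.json.
        "audio": "audio/episode1.wav",
        "tlog": "transcripts/episode1.tlog",
        "script": "scripts/episode1.script",
        "aligned": "aligned/episode1.aligned",
    },
]
with open("catalog.json", "w", encoding="utf-8") as catalog_file:
    json.dump(entries, catalog_file, indent=4)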
Example #3
def main():
    # Debug helpers
    logging.basicConfig()
    logging.root.setLevel(args.loglevel if args.loglevel else logging.INFO)

    def progress(it=None, desc="Processing", total=None):
        logging.info(desc)
        return (it if args.no_progress else log_progress(
            it, interval=args.progress_interval, total=total))

    def resolve(base_path, spec_path):
        if spec_path is None:
            return None
        if not path.isabs(spec_path):
            spec_path = path.join(base_path, spec_path)
        return spec_path

    def exists(file_path):
        if file_path is None:
            return False
        return os.path.isfile(file_path)

    to_prepare = []

    def enqueue_or_fail(audio, tlog, script, aligned, prefix=""):
        if exists(aligned) and not args.force:
            fail(
                prefix +
                'Alignment file "{}" already exists - use --force to overwrite'
                .format(aligned))
        if tlog is None:
            if args.ignore_missing:
                return
            fail(prefix + "Missing transcription log path")
        if not exists(audio) and not exists(tlog):
            if args.ignore_missing:
                return
            fail(prefix +
                 'Both audio file "{}" and transcription log "{}" are missing'.
                 format(audio, tlog))
        if not exists(script):
            if args.ignore_missing:
                return
            fail(prefix + 'Missing script "{}"'.format(script))
        to_prepare.append((audio, tlog, script, aligned))

    if (args.audio or
            args.tlog) and args.script and args.aligned and not args.catalog:
        enqueue_or_fail(args.audio, args.tlog, args.script, args.aligned)
    elif args.catalog:
        if not exists(args.catalog):
            fail('Unable to load catalog file "{}"'.format(args.catalog))
        catalog = path.abspath(args.catalog)
        catalog_dir = path.dirname(catalog)
        with open(catalog, "r", encoding="utf-8") as catalog_file:
            catalog_entries = json.load(catalog_file)
        for entry in progress(catalog_entries, desc="Reading catalog"):
            enqueue_or_fail(
                resolve(catalog_dir, entry["audio"]),
                resolve(catalog_dir, entry["tlog"]),
                resolve(catalog_dir, entry["script"]),
                resolve(catalog_dir, entry["aligned"]),
                prefix='Problem loading catalog "{}" - '.format(catalog),
            )
    else:
        fail(
            'You have to specify either the combination "--audio/--tlog, --script, --aligned" or "--catalog"'
        )

    logging.debug("Start")

    to_align = []
    output_graph_path = None
    for audio_path, tlog_path, script_path, aligned_path in to_prepare:
        if not exists(tlog_path):  # or args.force:
            generated_scorer = False
            if output_graph_path is None:
                logging.debug(
                    'Looking for model files in "{}"...'.format(model_dir))
                output_graph_path = glob(model_dir + "/*.pbmm")[0]
                lang_scorer_path = glob(model_dir + "/*.scorer")[0]
            kenlm_path = "/install/kenlm/build/bin"
            deepspeech_path = "third_party/DeepSpeech"
            if args.per_document_lm:
                assert path.exists(kenlm_path)
                assert path.exists(deepspeech_path)

                # Read and clean the script first: the cleaned transcript is
                # the training text for the per-document language model.
                tc = read_script(script_path)
                if not tc.clean_text.strip():
                    logging.error("Cleaned transcript is empty for {}".format(
                        path.basename(script_path)))
                    continue
                clean_text_path = script_path + ".clean"
                with open(clean_text_path, "w",
                          encoding="utf-8") as clean_text_file:
                    clean_text_file.write(tc.clean_text)

                scorer_path = script_path + ".scorer"
                if not path.exists(scorer_path):
                    data_lower, vocab_str = convert_and_filter_topk(
                        scorer_path, clean_text_path, 500000)
                    build_lm(
                        scorer_path,
                        kenlm_path,
                        5,
                        "85%",
                        "0|0|1",
                        True,
                        255,
                        8,
                        "trie",
                        data_lower,
                        vocab_str,
                    )
                    os.remove(scorer_path + ".lower.txt.gz")
                    os.remove(scorer_path + ".lm.arpa")
                    os.remove(scorer_path + ".lm_filtered.arpa")
                    os.remove(clean_text_path)

                    create_bundle(
                        alphabet_path,
                        scorer_path + ".lm.binary",
                        scorer_path + ".vocab-500000.txt",
                        scorer_path,
                        False,
                        0.931289039105002,
                        1.1834137581510284,
                    )
                    os.remove(scorer_path + ".lm.binary")
                    os.remove(scorer_path + ".vocab-500000.txt")

                generated_scorer = True
            else:
                scorer_path = lang_scorer_path

            logging.debug(
                'Loading acoustic model from "{}", alphabet from "{}" and scorer from "{}"...'
                .format(output_graph_path, alphabet_path, scorer_path))

            # Run VAD on the input file
            logging.debug("Transcribing VAD segments...")
            frames = read_frames_from_file(audio_path, model_format,
                                           args.audio_vad_frame_length)
            frames = list(frames)
            # Debug dump of the raw voiced frame buffers for later inspection.
            with open("dsalign_voiced_buffers.npy", "wb") as fh:
                np.save(fh, frames)
            segments = vad_split(
                frames,
                model_format,
                num_padding_frames=args.audio_vad_padding,
                threshold=args.audio_vad_threshold,
                aggressiveness=args.audio_vad_aggressiveness,
            )

            def pre_filter():
                for i, segment in enumerate(segments):
                    segment_buffer, time_start, time_end = segment
                    time_length = time_end - time_start
                    if args.stt_min_duration and time_length < args.stt_min_duration:
                        logging.info(
                            "Fragment {}: Audio too short for STT".format(i))
                        continue
                    if args.stt_max_duration and time_length > args.stt_max_duration:
                        logging.info(
                            "Fragment {}: Audio too long for STT".format(i))
                        continue
                    yield (
                        time_start,
                        time_end,
                        np.frombuffer(segment_buffer, dtype=np.int16),
                    )

            samples = list(progress(pre_filter(), desc="VAD splitting"))

            # Multiprocessing here runs over the individual chunks of a
            # single document, which limits parallelism and would not map
            # well onto a TPU or GPU. The pool should eventually be replaced
            # with a queue of workers shared across documents (see the
            # sketch after this example).
            pool = multiprocessing.Pool(
                initializer=init_stt,
                initargs=(output_graph_path, scorer_path),
                processes=args.stt_workers,
            )
            transcripts = list(
                progress(pool.imap(stt, samples),
                         desc="Transcribing",
                         total=len(samples)))

            fragments = []
            for time_start, time_end, segment_transcript in transcripts:
                if segment_transcript is None:
                    continue
                fragments.append({
                    "start": time_start,
                    "end": time_end,
                    "transcript": segment_transcript,
                })
            logging.debug("Excluded {} empty transcripts".format(
                len(transcripts) - len(fragments)))

            logging.debug(
                'Writing transcription log to file "{}"...'.format(tlog_path))
            with open(tlog_path, "w", encoding="utf-8") as tlog_file:
                tlog_file.write(
                    json.dumps(
                        fragments,
                        indent=4 if args.output_pretty else None,
                        ensure_ascii=False,
                    ))

            # Remove scorer if generated
            if generated_scorer:
                os.remove(scorer_path)
        if not path.isfile(tlog_path):
            fail('Problem loading transcript from "{}"'.format(tlog_path))
        to_align.append((tlog_path, script_path, aligned_path))

    total_fragments = 0
    dropped_fragments = 0
    reasons = Counter()

    index = 0
    pool = multiprocessing.Pool(processes=args.align_workers)
    for (
            aligned_file,
            file_total_fragments,
            file_dropped_fragments,
            file_reasons,
    ) in progress(pool.imap_unordered(align, to_align),
                  desc="Aligning",
                  total=len(to_align)):
        if args.no_progress:
            index += 1
            logging.info(
                'Aligned file {} of {} - wrote results to "{}"'.format(
                    index, len(to_align), aligned_file))
        total_fragments += file_total_fragments
        dropped_fragments += file_dropped_fragments
        reasons += file_reasons

    logging.info("Aligned {} fragments".format(total_fragments))
    if total_fragments > 0 and dropped_fragments > 0:
        logging.info("Dropped {} fragments {:0.2f}%:".format(
            dropped_fragments, dropped_fragments * 100.0 / total_fragments))
        for key, number in reasons.most_common():
            logging.info(" - {}: {}".format(key, number))
Example #4
def main(audio_chunks_path, transcript_lst_path):
    # Debug helpers
    logging.basicConfig()
    logging.root.setLevel(args.loglevel if args.loglevel else logging.INFO)

    def progress(it=None, desc='Processing', total=None):
        logging.info(desc)
        return it if args.no_progress else log_progress(
            it, interval=args.progress_interval, total=total)

    def resolve(base_path, spec_path):
        if spec_path is None:
            return None
        if not path.isabs(spec_path):
            spec_path = path.join(base_path, spec_path)
        return spec_path

    def exists(file_path):
        if file_path is None:
            return False
        return os.path.isfile(file_path)

    to_prepare = []

    def enqueue_or_fail(audio, tlog, script, aligned, prefix=''):
        if exists(aligned) and not args.force:
            fail(
                prefix +
                'Alignment file "{}" already exists - use --force to overwrite'
                .format(aligned))
        if tlog is None:
            if args.ignore_missing:
                return
            fail(prefix + 'Missing transcription log path')
        if not exists(audio) and not exists(tlog):
            if args.ignore_missing:
                return
            fail(prefix +
                 'Both audio file "{}" and transcription log "{}" are missing'.
                 format(audio, tlog))
        if not exists(script):
            if args.ignore_missing:
                return
            fail(prefix + 'Missing script "{}"'.format(script))
        to_prepare.append((audio, tlog, script, aligned))

    if (args.audio or
            args.tlog) and args.script and args.aligned and not args.catalog:
        enqueue_or_fail(args.audio, args.tlog, args.script, args.aligned)
    elif args.catalog:
        if not exists(args.catalog):
            fail('Unable to load catalog file "{}"'.format(args.catalog))
        catalog = path.abspath(args.catalog)
        catalog_dir = path.dirname(catalog)
        with open(catalog, 'r', encoding='utf-8') as catalog_file:
            catalog_entries = json.load(catalog_file)
        for entry in progress(catalog_entries, desc='Reading catalog'):
            enqueue_or_fail(
                resolve(catalog_dir, entry['audio']),
                resolve(catalog_dir, entry['tlog']),
                resolve(catalog_dir, entry['script']),
                resolve(catalog_dir, entry['aligned']),
                prefix='Problem loading catalog "{}" - '.format(catalog))
    else:
        fail(
            'You have to specify either the combination "--audio/--tlog, --script, --aligned" or "--catalog"'
        )

    logging.debug('Start')

    to_align = []

    for audio_path, tlog_path, script_path, aligned_path in to_prepare:
        if not exists(tlog_path):

            if not args.stt_no_own_lm:
                tc = read_script(script_path)
                if not tc.clean_text.strip():
                    logging.error('Cleaned transcript is empty for {}'.format(
                        path.basename(script_path)))
                    continue
                clean_text_path = script_path + '.clean'
                with open(clean_text_path, 'w',
                          encoding='utf-8') as clean_text_file:
                    clean_text_file.write(tc.clean_text)

            # Run VAD on the input file
            logging.debug('Transcribing VAD segments...')
            frames = read_frames_from_file(audio_path, model_format,
                                           args.audio_vad_frame_length)
            segments = vad_split(frames,
                                 model_format,
                                 num_padding_frames=args.audio_vad_padding,
                                 threshold=args.audio_vad_threshold,
                                 aggressiveness=args.audio_vad_aggressiveness)

            def pre_filter():
                for i, segment in enumerate(segments):
                    segment_buffer, time_start, time_end = segment
                    time_length = time_end - time_start
                    if args.stt_min_duration and time_length < args.stt_min_duration:
                        logging.info(
                            'Fragment {}: Audio too short for STT'.format(i))
                        continue
                    if args.stt_max_duration and time_length > args.stt_max_duration:
                        logging.info(
                            'Fragment {}: Audio too long for STT'.format(i))
                        continue
                    yield (time_start, time_end,
                           np.frombuffer(segment_buffer, dtype=np.int16))

            samples = list(progress(pre_filter(), desc='VAD splitting'))

            # Write the audio chunks and a wav2letter-style list file: one
            # line per chunk with id, container-side audio path, a
            # placeholder duration and a placeholder transcript.
            with open(transcript_lst_path + '/transcript.txt', 'w') as f:
                for cnt, (time_start, time_end, audio) in enumerate(samples, 1):
                    wf.write(audio_chunks_path + '/test' + str(cnt) + '.wav',
                             16000, audio)
                    if cnt > 1:
                        f.write('\n')
                    f.write(str(cnt) + ' ' +
                            '/root/wav2letter/temp_audio/test' + str(cnt) +
                            '.wav' + ' ' +
                            str(np.random.randint(500, 1000, 1)[0]) +
                            ' Welcome to Glib')
            os.rename(transcript_lst_path + '/transcript.txt',
                      transcript_lst_path + '/transcript.lst')
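
            # A resulting transcript.lst line looks like this (placeholder
            # duration and transcript, as written above):
            #   1 /root/wav2letter/temp_audio/test1.wav 742 Welcome to Glib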

            decoder_path = "./root/wav2letter/build/Decoder"
            cfg_path = "/root/wav2letter/recipes/models/streaming_convnets/librispeech/decode_500ms_right_future_ngram_other.cfg"

            # Start the wav2letter container if it does not exist yet.
            os.system(
                '[ ! "$(docker ps -a | grep mycontainer)" ] && docker run -d --name mycontainer -i wav2letter-cpu-1'
            )
            # Copy the list file and audio chunks into the container, run the
            # decoder there, copy the hypotheses back out and stop the
            # container again.
            os.system(
                "docker start mycontainer && docker cp " +
                transcript_lst_path +
                "/transcript.lst mycontainer:/root/wav2letter/lists/transcript.lst && docker cp -a "
                + audio_chunks_path +
                " mycontainer:/root/wav2letter && docker exec -ti mycontainer /bin/bash -c 'export LD_LIBRARY_PATH=/opt/intel/compilers_and_libraries_2018.5.274/linux/mkl/lib/intel64_lin \n "
                + decoder_path + " --flagsfile=" + cfg_path +
                " \n rm -r /root/wav2letter/temp_audio \n exit' && docker cp mycontainer:/root/wav2letter/lists/transcript.lst.hyp "
                + transcript_lst_path +
                "/transcript.lst.hyp && docker stop mycontainer")

            decoder_trans = []

            # Read the decoder hypotheses back in; line[:-3] drops the last
            # three characters of each line and insert(0, ...) reverses the
            # file order.
            with open(transcript_lst_path + '/transcript.lst.hyp', 'r') as f:
                for line in f:
                    decoder_trans.insert(0, line[:-3])

            transcripts = []
            for (time_start, time_end, audio), transcript in zip(
                    samples, decoder_trans):
                transcripts.append((time_start, time_end, transcript))

            fragments = []
            for time_start, time_end, segment_transcript in transcripts:
                if segment_transcript is None:
                    continue
                fragments.append({
                    'start': time_start,
                    'end': time_end,
                    'transcript': segment_transcript
                })
            logging.debug('Excluded {} empty transcripts'.format(
                len(transcripts) - len(fragments)))

            logging.debug(
                'Writing transcription log to file "{}"...'.format(tlog_path))
            with open(tlog_path, 'w', encoding='utf-8') as tlog_file:
                tlog_file.write(
                    json.dumps(fragments,
                               indent=4 if args.output_pretty else None,
                               ensure_ascii=False))

        if not path.isfile(tlog_path):
            fail('Problem loading transcript from "{}"'.format(tlog_path))
        to_align.append((tlog_path, script_path, aligned_path))

    total_fragments = 0
    dropped_fragments = 0
    reasons = Counter()

    index = 0
    pool = multiprocessing.Pool(processes=args.align_workers)
    for aligned_file, file_total_fragments, file_dropped_fragments, file_reasons in \
            progress(pool.imap_unordered(align, to_align), desc='Aligning', total=len(to_align)):
        if args.no_progress:
            index += 1
            logging.info(
                'Aligned file {} of {} - wrote results to "{}"'.format(
                    index, len(to_align), aligned_file))
        total_fragments += file_total_fragments
        dropped_fragments += file_dropped_fragments
        reasons += file_reasons

    logging.info('Aligned {} fragments'.format(total_fragments))
    if total_fragments > 0 and dropped_fragments > 0:
        logging.info('Dropped {} fragments ({:0.2f}%):'.format(
            dropped_fragments, dropped_fragments * 100.0 / total_fragments))
        for key, number in reasons.most_common():
            logging.info(' - {}: {}'.format(key, number))

    logging.info('Cleaning up DSAlign temporary files')
    os.system("rm " + audio_chunks_path + "/* && rm " + transcript_lst_path +
              "/*")