def generate_subtitles(source_path, output, concurrency, subtitle_file_format,
                       api_key, secret_key):

    audio_filename, audio_rate = extract_audio(source_path)
    regions = find_speech_regions(audio_filename)
    pool = multiprocessing.Pool(concurrency)
    converter = WAVConverter(source_path=audio_filename)
    recognizer = SpeechRecognizer(api_key, secret_key, audio_rate,
                                  audio_filename[-3:])
    transcripts = []
    if regions:
        try:
            widgets = [
                "Converting speech regions to WAV files: ",
                Percentage(), ' ',
                Bar(), ' ',
                ETA()
            ]
            pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()
            extracted_regions = []
            for i, extracted_region in enumerate(pool.imap(converter,
                                                           regions)):
                extracted_regions.append(extracted_region)
                pbar.update(i)
            pbar.finish()

            widgets = [
                "Performing speech recognition: ",
                Percentage(), ' ',
                Bar(), ' ',
                ETA()
            ]
            pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()
            for i, transcript in enumerate(
                    pool.imap(recognizer, extracted_regions)):
                if transcript == 1:
                    return 0
                else:
                    transcripts.append(transcript)
                pbar.update(i)
            pbar.finish()
        except KeyboardInterrupt:
            pbar.finish()
            pool.terminate()
            pool.join()
            print("Cancelling transcription")
            return 0
    timed_subtitles = [(r, t) for r, t in zip(regions, transcripts) if t]
    formatter = FORMATTERS.get(subtitle_file_format)
    formatted_subtitles = formatter(timed_subtitles)
    dest = output
    if not dest:
        base, ext = os.path.splitext(source_path)
        dest = "{base}.{format}".format(base=base, format=subtitle_file_format)

    with open(dest, 'wb') as f:
        f.write(formatted_subtitles.encode("utf-8"))
    shutil.rmtree('temp')
    return dest
示例#2
0
def generate_subtitles(  # pylint: disable=too-many-locals,too-many-arguments
    source_path,
    output=None,
    concurrency=DEFAULT_CONCURRENCY,
    src_language=DEFAULT_SRC_LANGUAGE,
    dst_language=DEFAULT_DST_LANGUAGE,
    subtitle_file_format=DEFAULT_SUBTITLE_FORMAT,
    api_key=None,
):
    """
    Given an input audio/video file, generate subtitles in the specified language and format.
    """
    audio_filename, audio_rate = extract_audio(source_path)
    print("audio_filename: {}".format(audio_filename))
    regions = find_speech_regions(audio_filename)

    pool = multiprocessing.Pool(concurrency)
    converter = FLACConverter(source_path=audio_filename)
    recognizer = SpeechRecognizer(language=src_language,
                                  rate=audio_rate,
                                  api_key=GOOGLE_SPEECH_API_KEY)
    speaker_recognizer = SR(source_path=audio_filename)
    transcripts = []
    regions_update = []
    labels_update = []
    if regions:
        try:
            labels = []
            for region in regions:
                label = speaker_recognizer(region)
                labels.append(label)
            print("Labels {}:".format(labels))

            _label = []
            items = []
            labels_update = []
            start = None
            end = None
            i = 0
            for index, label in enumerate(labels):
                if len(_label) == 0:
                    _label.append(label)
                    start = index
                    labels_update.append(label)
                    i = i + 1
                elif label in _label and i < 10:
                    end = index
                    i = i + 1
                else:
                    item = (start, end)
                    items.append(item)
                    _label = []
                    _label.append(label)
                    start = index
                    end = None
                    labels_update.append(label)
                    i = 0
            print("items {}".format(items))
            print("labels {}".format(labels))
            print("labels_update {}".format(labels_update))
            for item in items:
                if item[1] == None:
                    start, end = regions[item[0]]
                    regions_update.append((start, end))
                else:
                    start1, end1 = regions[item[0]]
                    start2, end2 = regions[item[1]]
                    regions_update.append((start1, end2))
            print("regions_update {}".format(regions_update))
            widgets = [
                "Converting speech regions to FLAC files: ",
                Percentage(), ' ',
                Bar(), ' ',
                ETA()
            ]
            process_bar = ProgressBar(widgets=widgets,
                                      maxval=len(regions_update)).start()
            extracted_regions = []
            for i, extracted_region in enumerate(
                    pool.imap(converter, regions_update)):
                extracted_regions.append(extracted_region)
                process_bar.update(i)
            process_bar.finish()

            widgets = [
                "Performing speech recognition: ",
                Percentage(), ' ',
                Bar(), ' ',
                ETA()
            ]
            process_bar.widgets = widgets
            process_bar.maxval = len(regions_update)
            process_bar.start()
            for i, transcript in enumerate(
                    pool.imap(recognizer, extracted_regions)):
                transcripts.append(transcript)
                process_bar.update(i)
            print("transcripts {}".format(transcripts))
            process_bar.finish()
            if src_language.split("-")[0] != dst_language.split("-")[0]:
                if api_key:
                    google_translate_api_key = api_key
                    translator = Translator(dst_language,
                                            google_translate_api_key,
                                            dst=dst_language,
                                            src=src_language)
                    prompt = "Translating from {0} to {1}: ".format(
                        src_language, dst_language)
                    widgets = [prompt, Percentage(), ' ', Bar(), ' ', ETA()]
                    process_bar.widgets = widgets
                    process_bar.maxval = len(regions_update)
                    process_bar.start()
                    translated_transcripts = []
                    for i, transcript in enumerate(
                            pool.imap(translator, transcripts)):
                        translated_transcripts.append(transcript)
                        process_bar.update(i)
                    process_bar.finish()
                    transcripts = translated_transcripts
                else:
                    print(
                        "Error: Subtitle translation requires specified Google Translate API key. "
                        "See --help for further information.")
                    return 1

        except KeyboardInterrupt:
            process_bar.finish()
            pool.terminate()
            pool.join()
            print("Cancelling transcription")
            raise

    timed_subtitles = [
        (r, t, l)
        for r, t, l in zip(regions_update, labels_update, transcripts) if t
    ]
    formatter = FORMATTERS.get(subtitle_file_format)
    formatted_subtitles = formatter(timed_subtitles)
    print("formatted_subtitles {}: ".format(formatted_subtitles))
    # dest = output

    # if not dest:
    #     base = os.path.splitext(source_path)[0]
    #     dest = "{base}.{format}".format(base=base, format=subtitle_file_format)

    with open(output, 'wb') as output_file:
        output_file.write(formatted_subtitles.encode("utf-8"))

    os.remove(audio_filename)

    return output
示例#3
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('source_path',
                        default=".\CCTV_News.MP4",
                        help="Path to the video or audio file to subtitle",
                        nargs='?')
    parser.add_argument('-C',
                        '--concurrency',
                        help="Number of concurrent API requests to make",
                        type=int,
                        default=10)
    parser.add_argument(
        '-o',
        '--output',
        help="Output path for subtitles (by default, subtitles are saved in \
                        the same directory and name as the source path)")
    parser.add_argument('-F',
                        '--format',
                        help="Destination subtitle format",
                        default="srt")
    parser.add_argument('-S',
                        '--src-language',
                        help="Language spoken in source file",
                        default="zh-CN")
    parser.add_argument('-D',
                        '--dst-language',
                        help="Desired language for the subtitles",
                        default="zh-CN")
    parser.add_argument(
        '-K',
        '--api-key',
        help=
        "The Google Translate API key to be used. (Required for subtitle translation)"
    )
    parser.add_argument('--list-formats',
                        help="List all available subtitle formats",
                        action='store_true')
    parser.add_argument('--list-languages',
                        help="List all available source/destination languages",
                        action='store_true')

    args = parser.parse_args()

    if args.list_formats:
        print("List of formats:")
        for subtitle_format in list(FORMATTERS.keys()):
            print(("{format}".format(format=subtitle_format)))
        return 0

    if args.list_languages:
        print("List of all languages:")
        for code, language in sorted(LANGUAGE_CODES.items()):
            print(("{code}\t{language}".format(code=code, language=language)))
        return 0

    if args.format not in list(FORMATTERS.keys()):
        print(
            "Subtitle format not supported. Run with --list-formats to see all supported formats."
        )
        return 1

    if args.src_language not in list(LANGUAGE_CODES.keys()):
        print(
            "Source language not supported. Run with --list-languages to see all supported languages."
        )
        return 1

    if args.dst_language not in list(LANGUAGE_CODES.keys()):
        print(
            "Destination language not supported. Run with --list-languages to see all supported languages."
        )
        return 1

    if not args.source_path:
        print("Error: You need to specify a source path.")
        return 1

    audio_filename, audio_rate = extract_audio(args.source_path)

    regions = find_speech_regions(audio_filename)

    pool = multiprocessing.Pool(args.concurrency)
    converter = FLACConverter(source_path=audio_filename)
    recognizer = SpeechRecognizer(language=args.src_language,
                                  rate=audio_rate,
                                  api_key=GOOGLE_SPEECH_API_KEY)

    transcripts = []
    if regions:
        try:
            widgets = [
                "Converting speech regions to FLAC files: ",
                Percentage(), ' ',
                Bar(), ' ',
                ETA()
            ]
            pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()
            extracted_regions = []
            for i, extracted_region in enumerate(pool.imap(converter,
                                                           regions)):
                extracted_regions.append(extracted_region)
                pbar.update(i)
            pbar.finish()

            widgets = [
                "Performing speech recognition: ",
                Percentage(), ' ',
                Bar(), ' ',
                ETA()
            ]
            pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()

            for i, transcript in enumerate(
                    pool.imap(recognizer, extracted_regions)):
                transcripts.append(transcript)
                pbar.update(i)
            pbar.finish()

            if not is_same_language(args.src_language, args.dst_language):
                if args.api_key:
                    google_translate_api_key = args.api_key
                    translator = Translator(args.dst_language,
                                            google_translate_api_key,
                                            dst=args.dst_language,
                                            src=args.src_language)
                    prompt = "Translating from {0} to {1}: ".format(
                        args.src_language, args.dst_language)
                    widgets = [prompt, Percentage(), ' ', Bar(), ' ', ETA()]
                    pbar = ProgressBar(widgets=widgets,
                                       maxval=len(regions)).start()
                    translated_transcripts = []
                    for i, transcript in enumerate(
                            pool.imap(translator, transcripts)):
                        translated_transcripts.append(transcript)
                        pbar.update(i)
                    pbar.finish()
                    transcripts = translated_transcripts
                else:
                    print(
                        "Error: Subtitle translation requires specified Google Translate API key. \
                    See --help for further information.")
                    return 1

        except KeyboardInterrupt:
            pbar.finish()
            pool.terminate()
            pool.join()
            print("Cancelling transcription")
            return 1

    timed_subtitles = [(r, t) for r, t in zip(regions, transcripts) if t]
    formatter = FORMATTERS.get(args.format)
    formatted_subtitles = formatter(timed_subtitles)

    dest = args.output

    if not dest:
        base, ext = os.path.splitext(args.source_path)
        dest = "{base}.{format}".format(base=base, format=args.format)

    with open(dest, 'wb') as f:
        f.write(formatted_subtitles.encode("utf-8"))

    print("Subtitles file created at {}".format(dest))

    os.remove(audio_filename)

    return 0
def generate_subtitles(  # pylint: disable=too-many-locals,too-many-arguments
    source_path,
    output=None,
    concurrency=DEFAULT_CONCURRENCY,
    src_language=DEFAULT_SRC_LANGUAGE,
    dst_language=DEFAULT_DST_LANGUAGE,
    subtitle_file_format=DEFAULT_SUBTITLE_FORMAT,
    api_key=None,
):
    """
    Given an input audio/video file, generate subtitles in the specified language and format.
    """
    audio_filename, audio_rate = extract_audio(source_path)

    regions = find_speech_regions(audio_filename)

    pool = multiprocessing.Pool(concurrency)
    converter = FLACConverter(source_path=audio_filename)
    create()

    transcripts = []
    if regions:
        try:
            widgets = [
                "Converting speech regions to FLAC files: ",
                Percentage(), ' ',
                Bar(), ' ',
                ETA()
            ]
            pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()
            extracted_regions = []
            for i, extracted_region in enumerate(pool.imap(converter,
                                                           regions)):
                extracted_regions.append(extracted_region)
                pbar.update(i)
            with open("./data/cctv/cctv.txt", 'w+') as f:
                for j in extracted_regions:
                    f.write('{}\n'.format(j))
            f.close()
            pbar.finish()

            transcripts = SpeechRecognizer()

        except KeyboardInterrupt:
            pbar.finish()
            pool.terminate()
            pool.join()
            print("Cancelling transcription")
            raise

    bar = progressbar.ProgressBar(widgets=[
        progressbar.Percentage(),
        progressbar.Bar(),
        ' (',
        progressbar.SimpleProgress(),
        ') ',
        ' (',
        progressbar.ETA(),
        ') ',
    ])
    timed_subtitles = [(r, t) for r, t in bar(zip(regions, transcripts)) if t]
    formatter = FORMATTERS.get(subtitle_file_format)
    formatted_subtitles = formatter(timed_subtitles)
    dest = output

    if not dest:
        base = os.path.splitext(source_path)[0]
        dest = "{base}.{format}".format(base=base, format=subtitle_file_format)

    with open(dest, 'wb') as output_file:
        output_file.write(formatted_subtitles.encode("utf-8"))

    os.remove(audio_filename)

    return dest
示例#5
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('source_path',
                        default="./CCTV_News.mp4",
                        help="Path to the video or audio file to subtitle",
                        nargs='?')
    parser.add_argument('-C',
                        '--concurrency',
                        help="Number of concurrent API requests to make",
                        type=int,
                        default=10)
    parser.add_argument(
        '-o',
        '--output',
        help="Output path for subtitles (by default, subtitles are saved in \
                        the same directory and name as the source path)")
    parser.add_argument('-F',
                        '--format',
                        help="Destination subtitle format",
                        default="srt")
    parser.add_argument('-S',
                        '--src-language',
                        help="Language spoken in source file",
                        default="zh-CN")
    parser.add_argument('-D',
                        '--dst-language',
                        help="Desired language for the subtitles",
                        default="zh-CN")
    parser.add_argument(
        '-K',
        '--api-key',
        help=
        "The Google Translate API key to be used. (Required for subtitle translation)"
    )
    parser.add_argument('--list-formats',
                        help="List all available subtitle formats",
                        action='store_true')
    parser.add_argument('--list-languages',
                        help="List all available source/destination languages",
                        action='store_true')

    args = parser.parse_args()

    if args.list_formats:
        print("List of formats:")
        for subtitle_format in FORMATTERS.keys():
            print("{format}".format(format=subtitle_format))
        return 0

    if args.list_languages:
        print("List of all languages:")
        for code, language in sorted(LANGUAGE_CODES.items()):
            print("{code}\t{language}".format(code=code, language=language))
        return 0

    if args.format not in FORMATTERS.keys():
        print(
            "Subtitle format not supported. Run with --list-formats to see all supported formats."
        )
        return 1

    if args.src_language not in LANGUAGE_CODES.keys():
        print(
            "Source language not supported. Run with --list-languages to see all supported languages."
        )
        return 1

    if args.dst_language not in LANGUAGE_CODES.keys():
        print(
            "Destination language not supported. Run with --list-languages to see all supported languages."
        )
        return 1

    if not args.source_path:
        print("Error: You need to specify a source path.")
        return 1
    audio_filename, audio_rate = extract_audio(args.source_path)
    regions = find_speech_regions(audio_filename)
    pool = multiprocessing.Pool(args.concurrency)
    converter = WAVConverter(source_path=audio_filename, slicenum=len(regions))
    if regions:
        try:
            widgets = [
                "Converting speech regions to WAVC files: ",
                Percentage(), ' ',
                Bar(), ' ',
                ETA()
            ]
            pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()
            extracted_regions = []
            for i, extracted_region in enumerate(pool.imap(converter,
                                                           regions)):
                extracted_regions.append(extracted_region)
                pbar.update(i)
            pbar.finish()

            os.remove(audio_filename)
            wavlist = create_manifest(os.getcwd() + '/temp',
                                      os.getcwd() + '/temp' + '/wavlist.txt')
            transcripts = infer.infer_interface(wavlist,
                                                len(extracted_regions))
        except KeyboardInterrupt:
            pbar.finish()
            pool.terminate()
            pool.join()
            print("Cancelling transcription")
            return 1

    timed_subtitles = [(r, t) for r, t in zip(regions, transcripts) if t]
    formatter = FORMATTERS.get(args.format)
    formatted_subtitles = formatter(timed_subtitles)
    dest = args.output
    if not dest:
        base, ext = os.path.splitext(args.source_path)
        dest = "{base}.{format}".format(base=base, format=args.format)
    with open(dest, 'wb') as f:
        f.write(formatted_subtitles.encode("utf-8"))
    print("Subtitles file created at {}".format(dest))
    shutil.rmtree('temp')
    return 0
示例#6
0
def generate_subtitles(  # pylint: disable=too-many-locals,too-many-arguments
    source_path,
    output=None,
    concurrency=DEFAULT_CONCURRENCY,
    src_language=DEFAULT_SRC_LANGUAGE,
    dst_language=DEFAULT_DST_LANGUAGE,
    subtitle_file_format=DEFAULT_SUBTITLE_FORMAT,
    api_key=None,
):
    """
    Given an input audio/video file, generate subtitles in the specified language and format.
    """

    audio_filename, audio_rate = extract_audio(source_path)

    regions = find_speech_regions(audio_filename)

    pool = multiprocessing.Pool(concurrency)
    converter = FLACConverter(source_path=audio_filename)
    recognizer = SpeechRecognizer(language=src_language,
                                  rate=audio_rate,
                                  api_key=GOOGLE_SPEECH_API_KEY)

    transcripts = []

    if regions:
        try:
            widgets = [
                "Converting speech regions to FLAC files: ",
                Percentage(), ' ',
                Bar(), ' ',
                ETA()
            ]
            pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()
            extracted_regions = []

            for i, extracted_region in enumerate(pool.imap(converter,
                                                           regions)):
                extracted_regions.append(extracted_region)
                pbar.update(i)
            pbar.finish()

            widgets = [
                "Performing speech recognition: ",
                Percentage(), ' ',
                Bar(), ' ',
                ETA()
            ]
            pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()

            for i, transcript in enumerate(
                    pool.imap(recognizer, extracted_regions)):
                transcripts.append(transcript)
                pbar.update(i)
            pbar.finish()

            if src_language.split("-")[0] != dst_language.split("-")[0]:
                if api_key:
                    google_translate_api_key = api_key
                    translator = Translator(dst_language,
                                            google_translate_api_key,
                                            dst=dst_language,
                                            src=src_language)
                    prompt = "Translating from {0} to {1}: ".format(
                        src_language, dst_language)
                    widgets = [prompt, Percentage(), ' ', Bar(), ' ', ETA()]
                    pbar = ProgressBar(widgets=widgets,
                                       maxval=len(regions)).start()
                    translated_transcripts = []
                    for i, transcript in enumerate(
                            pool.imap(translator, transcripts)):
                        translated_transcripts.append(transcript)
                        pbar.update(i)
                    pbar.finish()
                    transcripts = translated_transcripts
                else:
                    print(
                        "Error: Subtitle translation requires specified Google Translate API key. "
                        "See --help for further information.")
                    return 1

        except KeyboardInterrupt:
            pbar.finish()
            pool.terminate()
            pool.join()
            print("Cancelling transcription")
            raise

    timed_subtitles = [(r, t) for r, t in zip(regions, transcripts) if t]
    formatter = FORMATTERS.get(subtitle_file_format)
    formatted_subtitles = formatter(timed_subtitles)

    dest = output

    if not dest:
        base = os.path.splitext(source_path)[0]
        dest = "{base}.{format}".format(base=base, format=subtitle_file_format)

    with open(dest, 'wb') as output_file:
        output_file.write(formatted_subtitles.encode("utf-8"))

    os.remove(audio_filename)

    return dest