示例#1
0
        if not os.path.exists(subtitle_file) or not os.path.exists(info_file):
            termcolor.cprint(
                "Subtitle file or Info files do not exist. {}".format(
                    video_file),
                color="red")
            raise Exception("Subtitle file or Info files do not exist.")

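        # Load the video's JSON metadata from the info file. (The Google-subtitle
        # download for cross-checking against the closed captions is not
        # performed in this snippet.)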
        with open(info_file) as f:
            print(info_file)
            print("opening json info file")
            metadata = json.load(f)
            print("opened json info file")
        #youtube_link = metadata['webpage_url']
        print("Parsing subtitle")
        subtitles = load_all_subtitles(subtitle_file)
        print(len(subtitles))
        input = {'subtitles': subtitles, 'video_file': video_file}
        overall_info["num_subtitles"] = len(subtitles)
        termcolor.cprint("Got {} candidates".format(len(subtitles)),
                         color="yellow")

        filtered_input = pipeline(input)
        filtered_subtitles = filtered_input["subtitles"]

        termcolor.cprint("Writing {} samples".format(len(filtered_subtitles)),
                         color="cyan")
        for t in tqdm(filtered_subtitles):
            hash = get_hash(subtitle_file + t["original_phrase"] +
                            str(t["ts_start"]))
            wav_file_dir = os.path.join(target_dir, "wav", hash[:2])
示例#2
0
            subtitles = []
        else:
            overlap_ratio = [ratio(t["phrase"].lower(), s.lower())
                             for (t, s) in transcripts]
            passed_threshold = sum(overlap_ratio) / \
                len(overlap_ratio) > self.mean_wer_threshold
            if not passed_threshold:
                # removing all subtitles, as potentially unreliable
                subtitles = []
        input["subtitles"] = subtitles
        return input


if __name__ == "__main__":
    from youtube_helpers import load_all_subtitles
    subtitles = load_all_subtitles(
        "../EWmCkVfPnJ8Different_Types_of_Learning.en.vtt")
    print(len(subtitles))
    input = {
        'subtitles': subtitles,
        'video_file': ''
    }
    good_chars_regexp = re.compile(
        r"^[A-Za-z0-9\,\.\-\?\"\'\’\!\“\s\;\:\“\”\–\‘\’\’\/\\]+$", re.IGNORECASE)
    pipeline = Pipeline([
        OverlappingSubtitlesRemover(),
        SubtitleCaptionTextFilter(),
        CaptionNormalizer(),
        CaptionRegexMatcher(good_chars_regexp),
        CaptionLengthFilter(min_length=5),
        CaptionLeaveOnlyAlphaNumCharacters(),
        SubtitleMerger(max_len_merged_sec=10),