# NOTE(review): whitespace-mangled fragment of a larger routine — the original
# newlines/indentation were lost and the enclosing function (plus the rest of
# the `for t in tqdm(...)` body) lies outside this view, so the code is kept
# byte-identical. What the visible tokens establish:
#   1. If `subtitle_file` or `info_file` is missing, print a red error via
#      termcolor.cprint and raise Exception("Subtitle file or Info files do not exist.").
#   2. json.load the info file into `metadata` (the `webpage_url` lookup is
#      commented out in the original and left that way here).
#   3. Parse subtitles with load_all_subtitles(subtitle_file), store the count
#      in overall_info["num_subtitles"], and announce the candidate count in yellow.
#   4. Run {'subtitles': ..., 'video_file': ...} through `pipeline`, then iterate
#      the filtered subtitles; each item is hashed from
#      subtitle_file + t["original_phrase"] + str(t["ts_start"]) and the wav output
#      directory is bucketed under target_dir/wav/<first two hash chars>.
# HACK (pre-existing): the local name `input` shadows the builtin — TODO rename
# once the full function is visible. Where the `with open(info_file)` block ends
# cannot be determined from this collapsed line — presumably after the json.load;
# verify against the original file before reformatting.
if not os.path.exists(subtitle_file) or not os.path.exists(info_file): termcolor.cprint( "Subtitle file or Info files do not exist. {}".format( video_file), color="red") raise Exception("Subtitle file or Info files do not exist.") # Download google subtitle to cross check with closed captions with open(info_file) as f: print(info_file) print("opening json info file") metadata = json.load(f) print("opened json info file") #youtube_link = metadata['webpage_url'] print("Parsing subtitle") subtitles = load_all_subtitles(subtitle_file) print(len(subtitles)) input = {'subtitles': subtitles, 'video_file': video_file} overall_info["num_subtitles"] = len(subtitles) termcolor.cprint("Got {} candidates".format(len(subtitles)), color="yellow") filtered_input = pipeline(input) filtered_subtitles = filtered_input["subtitles"] termcolor.cprint("Writing {} samples".format(len(filtered_subtitles)), color="cyan") for t in tqdm(filtered_subtitles): hash = get_hash(subtitle_file + t["original_phrase"] + str(t["ts_start"])) wav_file_dir = os.path.join(target_dir, "wav", hash[:2])
# NOTE(review): another whitespace-mangled, non-contiguous fragment. It opens
# with an orphaned `else:` whose matching `if` (and the enclosing method `def`)
# is outside this view, and it ends inside an unclosed `Pipeline([...]` call —
# left byte-identical because the lost indentation cannot be reconstructed
# safely. What the visible tokens establish:
#   * Method tail (some filter class): in the `else` branch, compute the
#     Levenshtein-style similarity `ratio(t["phrase"].lower(), s.lower())` for
#     each (subtitle, transcript) pair in `transcripts`; if the mean ratio does
#     not exceed `self.mean_wer_threshold`, drop ALL subtitles as unreliable
#     (subtitles = []). The method then writes input["subtitles"] and returns
#     the mutated `input` dict.
#   * `__main__` block: loads subtitles from a hard-coded .vtt path, builds an
#     input dict with an empty video_file, compiles `good_chars_regexp`
#     (ASCII letters/digits plus common punctuation and several curly-quote /
#     dash codepoints; note the pattern repeats \“ and \’ — pre-existing
#     redundancy, left untouched), and begins constructing a Pipeline of
#     filters (OverlappingSubtitlesRemover, SubtitleCaptionTextFilter,
#     CaptionNormalizer, CaptionRegexMatcher, CaptionLengthFilter(min_length=5),
#     CaptionLeaveOnlyAlphaNumCharacters, SubtitleMerger(max_len_merged_sec=10),
#     ...) that is cut off beyond this view.
# NOTE(review): despite the attribute name `mean_wer_threshold`, the quantity
# compared is a mean similarity ratio, not a word-error rate — confirm intent
# against the class definition before renaming anything.
subtitles = [] else: overlap_ratio = [ratio(t["phrase"].lower(), s.lower()) for (t, s) in transcripts] passed_threshold = sum(overlap_ratio) / \ len(overlap_ratio) > self.mean_wer_threshold if not passed_threshold: # removing all subtitles, as potentially unreliable subtitles = [] input["subtitles"] = subtitles return input if __name__ == "__main__": from youtube_helpers import load_all_subtitles subtitles = load_all_subtitles( "../EWmCkVfPnJ8Different_Types_of_Learning.en.vtt") print(len(subtitles)) input = { 'subtitles': subtitles, 'video_file': '' } good_chars_regexp = re.compile( r"^[A-Za-z0-9\,\.\-\?\"\'\’\!\“\s\;\:\“\”\–\‘\’\’\/\\]+$", re.IGNORECASE) pipeline = Pipeline([ OverlappingSubtitlesRemover(), SubtitleCaptionTextFilter(), CaptionNormalizer(), CaptionRegexMatcher(good_chars_regexp), CaptionLengthFilter(min_length=5), CaptionLeaveOnlyAlphaNumCharacters(), SubtitleMerger(max_len_merged_sec=10),