def test_save_specific_filename_no_extension(self):
    """Saving to a filename lacking an extension must append '.vtt'."""
    folder = os.path.join(OUTPUT_DIR, 'test_folder')
    os.makedirs(folder)
    webvtt.read(self._get_file('one_caption.vtt')).save(
        os.path.join(folder, 'custom_name'))
    self.assertTrue(
        os.path.exists(os.path.join(folder, 'custom_name.vtt')))
def test_save_specific_filename(self):
    """Saving to an explicit '.vtt' filename writes exactly that file."""
    folder = os.path.join(OUTPUT_DIR, 'test_folder')
    os.makedirs(folder)
    destination = os.path.join(folder, 'custom_name.vtt')
    webvtt.read(self._get_file('one_caption.vtt')).save(destination)
    self.assertTrue(os.path.exists(destination))
def test_save_to_other_location(self):
    """Saving with a directory path keeps the original file name."""
    destination = os.path.join(OUTPUT_DIR, 'test_folder')
    os.makedirs(destination)
    webvtt.read(self._get_file('one_caption.vtt')).save(destination)
    saved_file = os.path.join(destination, 'one_caption.vtt')
    self.assertTrue(os.path.exists(saved_file))
def convert_file(file_name, main_lang, sub_lang):
    """Interleave a main-language VTT with its sub-language counterpart.

    Prints main-language captions as <h3> headers and, before each one,
    every sub-language caption that starts earlier, as <p> paragraphs.
    The sub-language file name is derived by replacing the last occurrence
    of main_lang with sub_lang in file_name.
    """
    file_name_sub = rreplace(file_name, main_lang, sub_lang)
    vtt_main = webvtt.read(file_name)
    vtt_sub = webvtt.read(file_name_sub)

    index_main = 0
    index_sub = 0
    while index_main < len(vtt_main):
        caption_main = vtt_main[index_main]
        while index_sub < len(vtt_sub):
            caption_sub = vtt_sub[index_sub]
            if caption_main.start <= caption_sub.start:
                # Strip left-to-right marks before emitting.
                print("<h3>" + caption_main.text.replace("‎", "") + "</h3>")
                break
            print("<p>" + caption_sub.text.replace("‎", "") + "</p>")
            print("")
            index_sub += 1
        index_main += 1

    # Flush any sub-language captions left after the last main caption.
    # Bug fix: the original printed the stale `caption_sub` from the loop
    # above on every iteration instead of advancing through the list.
    while index_sub < len(vtt_sub):
        print(vtt_sub[index_sub].text)
        index_sub += 1
def _parse_vtt_file(file_path) -> List[Caption]:
    """Parse a VTT file into Caption objects of timed Words.

    First tries to read word-aligned timings from cue lines containing
    ``<c>`` styling tags (YouTube-style auto captions). If none are found,
    falls back to plain captions whose words share the cue's start/end.
    Word start/end times that the file does not provide are estimated.
    """
    captions: List[Caption] = []
    for file_caption in webvtt.read(file_path):
        for line in file_caption.lines:
            # Only lines with <c> tags carry per-word timing information.
            if '<c>' not in line:
                continue
            caption = Caption(is_word_aligned=True,
                              start=file_caption.start_in_seconds,
                              end=file_caption.end_in_seconds)
            first = re.match(pattern_first, line, re.M | re.I)
            first_word, start = Video._remove_tags(first[0])
            # Estimate the time here because it is not given in the vtt file
            if captions:
                # Midpoint between this word's start and the previous
                # word's start; capped at (start - 1) when the midpoint
                # exceeds 1 — NOTE(review): presumably a guard against
                # overly long gaps; confirm the intended threshold.
                junction = round((start + captions[-1][-1].start) / 2, 3)
                if junction > 1:
                    junction = start - 1
                # Close out the previous caption's last word at the junction.
                captions[-1][-1].end = junction
                first_word = Word(text=first_word, start=junction)
            else:
                # Very first word: back-date its start by up to one second.
                first_word = Word(
                    text=first_word, start=max(round(start - 1, 3), 0))
            caption.append(first_word)
            rest = re.findall(pattern_rest, line, re.M | re.I)
            for match in rest:
                next_word, start = Video._remove_tags(match)
                # Drop a leading non-word token (e.g. punctuation-only);
                # otherwise the previous word ends where this one starts.
                if len(caption) == 1 and not re.search(r'\w', caption[0].text):
                    del caption[0]
                else:
                    caption[-1].end = start
                next_word = Word(text=next_word, start=start)
                caption.append(next_word)
            captions.append(caption)
            # One word-aligned line per cue is enough.
            break
    # If file is not word aligned with video
    if not captions:
        for file_caption in webvtt.read(file_path):
            caption = Caption(is_word_aligned=False,
                              start=file_caption.start_in_seconds,
                              end=file_caption.end_in_seconds)
            for word in file_caption.raw_text.split():
                caption.append(Word(word.strip()))
            if caption:
                # Only the boundary words get explicit times here.
                caption[0].start = file_caption.start_in_seconds
                caption[-1].end = file_caption.end_in_seconds
            captions.append(caption)
    else:
        # Also here, estimate the time which is not given in the vtt file:
        # the very last word is assumed to last one second.
        captions[-1][-1].end = round(captions[-1][-1].start + 1, 3)
    return captions
def test_save_identifiers(self):
    """Cue identifiers in the source must survive a save round-trip."""
    os.makedirs(OUTPUT_DIR)
    copy(self._get_file('using_identifiers.vtt'), OUTPUT_DIR)
    source = os.path.join(OUTPUT_DIR, 'using_identifiers.vtt')
    destination = os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt')
    webvtt.read(source).save(destination)
    with open(destination, 'r', encoding='utf-8') as f:
        lines = [line.rstrip() for line in f]
    self.assertListEqual(lines, [
        'WEBVTT',
        '',
        '00:00:00.500 --> 00:00:07.000',
        'Caption text #1',
        '',
        'second caption',
        '00:00:07.000 --> 00:00:11.890',
        'Caption text #2',
        '',
        '00:00:11.890 --> 00:00:16.320',
        'Caption text #3',
        '',
        '4',
        '00:00:16.320 --> 00:00:21.580',
        'Caption text #4',
        '',
        '00:00:21.580 --> 00:00:23.880',
        'Caption text #5',
        '',
        '00:00:23.880 --> 00:00:27.280',
        'Caption text #6',
    ])
def test_write_captions(self):
    """write() must emit appended captions to the given file object."""
    os.makedirs(OUTPUT_DIR)
    copy(self._get_file('one_caption.vtt'), OUTPUT_DIR)
    vtt = webvtt.read(os.path.join(OUTPUT_DIR, 'one_caption.vtt'))
    vtt.captions.append(Caption(
        '00:00:07.000',
        '00:00:11.890',
        ['New caption text line1', 'New caption text line2']))
    out = io.StringIO()
    vtt.write(out)
    out.seek(0)
    written = [line.rstrip() for line in out.readlines()]
    self.assertListEqual(written, [
        'WEBVTT',
        '',
        '00:00:00.500 --> 00:00:07.000',
        'Caption text #1',
        '',
        '00:00:07.000 --> 00:00:11.890',
        'New caption text line1',
        'New caption text line2',
    ])
def getBreakdown(vtt):
    """Aggregate per-speaker talk time, text, and sentiment from a .vtt file.

    Returns a list of [name, seconds, sentiment/text] entries, one per
    speaker found in the transcript, and prints a summary of each.
    """
    breakdowns = []
    for caption in webvtt.read(vtt):
        name = getName(caption.text)
        if name == "NO NAME FOUND":
            continue
        if searchName(name, breakdowns):
            # Known speaker: accumulate duration and text on their entry.
            index = findName(name, breakdowns)
            entry = breakdowns[index]
            entry[1] = entry[1] + timeDiff(caption.start, caption.end)
            entry[2] = entry[2] + " " + getText(caption.text)
        else:
            breakdowns.append([
                name,
                timeDiff(caption.start, caption.end),
                getText(caption.text)
            ])
    breakdowns = roundit(breakdowns)
    breakdowns = getSentiment(breakdowns)
    for entry in breakdowns:
        print(entry[0], " talked for ", entry[1], " seconds")
        print("Sentiment: ")
        print(entry[2])
    return breakdowns
def parse_subtitles():
    """Build an HTML listing of subtitle lines from 'reference.vtt'.

    Each subtitle line becomes an anchor whose id is a timestamp fragment.
    Supports .vtt (via webvtt) and .srt (via srt) inputs; returns the HTML
    string, or an empty string for unsupported extensions.
    """
    filename = 'reference.vtt'
    html = ""
    if filename[-4:] == '.vtt':
        for caption in webvtt.read(filename):
            # Minute portion of the 'HH:MM:SS.mmm' start timestamp.
            time = caption.start[-6:-4]
            text = caption.text
            html += '<div class="subtitle-line">\n'
            html += '<a id="' + str(time) + '" href="#">'
            html += text
            html += '</a>\n</div>\n\n<br>\n\n'
    elif filename[-4:] == '.srt':
        print("reading SRT file")
        # Bug fix: the original opened the file unconditionally and never
        # closed it; open only when needed, inside a context manager.
        with open(filename) as f:
            subtitles = list(srt.parse(f.read()))
        for line in subtitles:
            time = line.start.seconds
            text = line.content
            html += '<div class="subtitle-line">\n'
            html += '<a id="' + str(time) + '" href="#">'
            html += text
            html += '</a>\n</div>\n\n<br>\n\n'
    else:
        print("this script only accepts vtt or srt files")
    return html
def get_caption(url):
    """Download the English subtitles of a YouTube video and return them
    as one whitespace-joined string. Also sets the global `video_title`.

    The subtitle file is saved as 'test.en.vtt' by youtube-dl.
    """
    global video_title
    ydl_opts = {
        'skip_download': True,       # Skipping the download of actual file
        'writesubtitles': True,      # Uploaded Subtitles
        "writeautomaticsub": True,   # Auto generated Subtitles
        "subtitleslangs": ['en'],    # Language Needed "en"-->English
        'outtmpl': 'test.%(ext)s',   # Saving downloaded file as 'test.en.vtt'
        'nooverwrites': False,       # Overwrite if the file exists
        'quiet': True                # Printing progress
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([url])
            info_dict = ydl.extract_info(url, download=False)
            video_title = info_dict.get('title', None)
        except Exception:
            # Bug fix: a bare `except:` also swallowed KeyboardInterrupt
            # and SystemExit; catch only ordinary exceptions.
            print("Try with a YouTube URL")
    corpus = []
    for caption in webvtt.read('test.en.vtt'):
        corpus.append(caption.text)
    corpus = "".join(corpus)
    corpus = corpus.replace('\n', ' ')
    return corpus
def to_vtt_format(self, attr: str = 'raw_text') -> None:
    """Print the chosen attribute of every caption in the file.

    attr: {start, end, text, raw_text, identifier}
    """
    # Fix: the enumerate() index was never used; iterate directly.
    for caption in webvtt.read(self.file_path):
        print('------')
        print(attr.upper())
        print(getattr(caption, attr))
def main(file_loc):
    """Merge every .vtt transcript in *file_loc* into one deduplicated
    text file named after the folder.

    Consecutive duplicate lines (auto-captions repeat lines across cues)
    are collapsed.
    """
    transcript = ""
    lines = []
    files = [os.path.join(file_loc, f)
             for f in os.listdir(file_loc) if f.endswith(".vtt")]
    for f in files:
        vtt = webvtt.read(f)
        for line in vtt:
            # Strip the newlines from the end of the text.
            # Split the string if it has a newline in the middle
            # Add the lines to an array
            lines.extend(line.text.strip().splitlines())
    # Remove repeated (consecutive) lines.
    # NOTE(review): the original called line.replace("&", "&") here — a
    # no-op, probably a mangled "&amp;" unescape; removed since it had
    # no effect.
    previous = None
    for line in lines:
        if line == previous:
            continue
        if transcript == "":
            transcript = line
        else:
            transcript += "\n" + line
        previous = line
    filename = os.path.basename(os.path.normpath(file_loc))
    # Fix: `filename` was computed but never used; the output name was a
    # hard-coded placeholder.
    out_name = f"cleaned_{filename}.txt"
    with open(out_name, "w", encoding='utf8', errors="ignore") as f:
        f.write(transcript)
    print(f"Saved to {out_name}")
def test_parse_with_comments(self):
    """NOTE/comment blocks must be skipped while captions parse normally."""
    vtt = webvtt.read(self._get_file('comments.vtt'))
    self.assertEqual(len(vtt.captions), 3)
    expected_first = ['- Ta en kopp varmt te.', '- Det är inte varmt.']
    self.assertListEqual(vtt.captions[0].lines, expected_first)
    self.assertEqual(vtt.captions[2].text, '- Ta en kopp')
def test_parse_identifiers(self):
    """Cue identifiers are kept when present and None otherwise."""
    captions = webvtt.read(self._get_file('using_identifiers.vtt')).captions
    self.assertEqual(len(captions), 6)
    self.assertEqual(captions[1].identifier, 'second caption')
    self.assertIsNone(captions[2].identifier)
    self.assertEqual(captions[3].identifier, '4')
def generate_transcript(file):
    """Generates the transcript of a given .vtt file.

    Descriptions in [brackets] or (parentheses) and speaker names are
    stripped; consecutive duplicate captions are collapsed. Returns the
    transcript as one space-separated string ('' on a non-.vtt input).
    """
    # endswith handles the short-name case too ('abc' can't end in '.vtt').
    if not file.endswith(".vtt"):
        print("Error: non .vtt file passed in.")
        return ""
    vtt = webvtt.read(file)
    # Flatten caption lines linearly (sum(..., []) was quadratic).
    captions = [line for c in vtt for line in c.text.strip().splitlines()]
    captions = [ignore_descriptions(text, ("[", "]")) for text in captions]
    captions = [ignore_descriptions(text, ("(", ")")) for text in captions]
    captions = [ignore_names(text) for text in captions]
    # Skip consecutive duplicates while joining.
    transcript = ""
    prev_cap = ""
    for curr_cap in captions:
        if prev_cap != curr_cap:
            transcript += curr_cap + " "
        prev_cap = curr_cap
    return transcript
def identify_file_format_data(filename, file_format, choose_index):
    """Extract encounter transcripts from a CSV or VTT file.

    Returns (encounter_transcripts, data) where `data` is the DataFrame
    for CSV input and None for VTT input. Returns None implicitly for any
    other format.
    """
    if file_format == 'text/csv':
        data = pd.read_csv(filename, encoding="ISO-8859-1")
        # Result was never used; call kept in case it has side effects —
        # TODO confirm whether manage_multiple_session mutates `data`.
        manage_multiple_session(data)
        if choose_index is not None:
            encounter_transcripts = [
                data['Encounter - Transcript'][choose_index]
            ]
        else:
            encounter_transcripts = data['Encounter - Transcript']
        return encounter_transcripts, data
    if file_format == 'text/vtt':
        transcript = []
        data = None
        for caption in webvtt.read(filename):
            # Captions shaped 'Speaker: text' keep only the text part;
            # captions with several colons are kept whole.
            splited_text = caption.text.split(':')
            len_splited_text = len(splited_text)
            if len_splited_text > 2:
                transcript.append(caption.text)
            elif len_splited_text == 2:
                transcript.append(splited_text[1])
            else:
                transcript.append(splited_text[0])
        encounter_transcripts = [transcript]
        return encounter_transcripts, data
def test_save_identifiers(self):
    """Saving a parsed file must preserve its cue identifiers verbatim."""
    os.makedirs(OUTPUT_DIR)
    copy(self._get_file('using_identifiers.vtt'), OUTPUT_DIR)
    vtt = webvtt.read(os.path.join(OUTPUT_DIR, 'using_identifiers.vtt'))
    output_path = os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt')
    vtt.save(output_path)
    expected_lines = [
        'WEBVTT',
        '',
        '00:00:00.500 --> 00:00:07.000',
        'Caption text #1',
        '',
        'second caption',
        '00:00:07.000 --> 00:00:11.890',
        'Caption text #2',
        '',
        '00:00:11.890 --> 00:00:16.320',
        'Caption text #3',
        '',
        '4',
        '00:00:16.320 --> 00:00:21.580',
        'Caption text #4',
        '',
        '00:00:21.580 --> 00:00:23.880',
        'Caption text #5',
        '',
        '00:00:23.880 --> 00:00:27.280',
        'Caption text #6',
    ]
    with open(output_path, 'r', encoding='utf-8') as f:
        actual_lines = [line.rstrip() for line in f.readlines()]
    self.assertListEqual(actual_lines, expected_lines)
def test_parse_styles(self):
    """A STYLE block must be captured without affecting caption count."""
    vtt = webvtt.read(self._get_file('styles.vtt'))
    self.assertEqual(len(vtt.captions), 1)
    expected_style = (
        '::cue {background-image: linear-gradient(to bottom, dimgray, lightgray);color: papayawhip;}'
    )
    self.assertEqual(vtt.styles[0].text, expected_style)
def get_words_from_subtitles(file):
    "Get the words from the whole serie"
    tknzr = TweetTokenizer()
    # Tokenize each caption and flatten into one word list.
    return [word
            for caption in webvtt.read(file)
            for word in tknzr.tokenize(caption.text)]
def make_caption_data(video_element_name, caption_path, timecodes, duration, fps, punct, mode, segmenter):
    """Build a per-video annotation dict from a caption file.

    Returns {video_element_name: {'duration', 'timestamps', 'sentences'}}
    or {} when no caption data falls inside the clip window.
    """
    # Convert the clip's (start, end) timecode strings into Timecodes.
    start, end = Timecode(fps, timecodes[0]), Timecode(fps, timecodes[1])
    captions = webvtt.read(caption_path)
    # The first cue is dropped after copying its start onto the second —
    # presumably the first cue is a duplicate/header; TODO confirm
    # against the caption source.
    captions[1].start = captions[0].start
    captions = captions[1:]
    caption_dict_list, joined_sentence = make_caption_dict_list(
        captions, fps, start, end, mode)
    sentences = segement_sentences(joined_sentence, segmenter, punct)
    if len(caption_dict_list) > 0:
        timestamps = make_timestamps(caption_dict_list, sentences, mode)
        try:
            # Sanity check only: a mismatch is reported but not fatal.
            assert len(timestamps) == len(
                sentences
            ), f'timestamps:{len(timestamps)} sentences:{len(sentences)}'
        except AssertionError as err:
            print('AssertionError:', err)
        annotation = {
            video_element_name: {
                'duration': duration,
                'timestamps': timestamps,
                'sentences': sentences
            }
        }
        return annotation
    else:
        return {}
def generate_captions_vtt(caption_path, time_per_segment, cutoff):
    """
    Function that returns a lists of captions from a given caption '.vtt' file
    Each entry in the list is the captions that were stated in the specified time_per_segment
    """
    vtt = webvtt.read(caption_path)
    caption_dict = {
        'time': [cap.start for cap in vtt.captions],
        'caption': [cap.text.split('\n') for cap in vtt.captions]
    }
    cap_df = pd.DataFrame(caption_dict)
    cap_df['time'] = cap_df['time'].apply(lambda x: pd.to_datetime(x))
    # Seconds elapsed since the first caption, shifted back by `cutoff`.
    cap_df['agg_time'] = cap_df['time'].apply(
        lambda x: (x - cap_df['time'].iloc[0]).seconds - cutoff)
    caption_segments = []
    # Segment boundaries every `time_per_segment` seconds up to the last
    # caption (the end is trimmed by `cutoff` a second time — NOTE(review):
    # confirm the double subtraction of cutoff is intentional).
    time_steps = [t for t in range(0, cap_df['agg_time'].iloc[-1] - cutoff,
                                   time_per_segment)]
    for idx in range(len(time_steps) - 1):
        acc = []
        # All caption lines whose time falls inside this segment window.
        for line in cap_df[(time_steps[idx] <= cap_df['agg_time'])
                           & (cap_df['agg_time'] < time_steps[idx + 1])
                           ]['caption'].values:
            acc += line
        seen = set()
        # Order-preserving de-duplication (set.add returns None, i.e. falsy).
        ordered_uniques = [line for line in acc
                           if not (line in seen or seen.add(line))]
        caption_segments.append(' '.join(ordered_uniques))
    return caption_segments
def extract_plaintext_from_webvtt(path_to_file: str):
    """Extract plaintext from a webvtt file.

    Keyword arguments:
    path_to_file -- string -- full path to webvtt file

    Return value:
    transcript_text -- string -- plaintext of transcript
    """
    logger.debug("generate_plaintext_transcript: " + path_to_file)
    vtt = webvtt.read(path_to_file)
    # Join once instead of quadratic += concatenation; the final strip()
    # drops the trailing separator, matching the original output exactly.
    transcript_text = " ".join(line.text.strip() for line in vtt).strip()
    logger.debug("Text extracted from: " + path_to_file)
    return transcript_text
def convert_intermedate_form(sub_path, files, id, save_location):
    """Cut the audio track into per-caption clips with matching transcripts.

    For each usable caption, writes '<save_location>/<id>/<i>.txt' (cleaned
    text) and '<save_location>/<id>/<i>.wav' (the audio slice). Malformed
    subtitle files are skipped silently.
    """
    segments = []  # NOTE(review): never used
    text = []      # NOTE(review): never used
    i = 0
    track = convert_to_spec(files)
    try:
        sub = webvtt.read(sub_path)
        # Drop the first and last 10 captions — presumably intro/credits;
        # TODO confirm.
        sub = sub[10:-10]
        for caption in sub:
            # Normalize separators, then keep only letters and spaces.
            clean_text = caption.text.replace(
                '\n', ' ').replace(',', ' ').replace('-', ' ')
            clean_text = ''.join([character for character in clean_text if (
                character.isalpha() or character == ' ')])
            # 'HH:MM:SS.mmm' -> seconds.
            start = (caption.start.split(":"))
            s = float(start[0])*3600+float(start[1])*60+float(start[2])
            end = (caption.end.split(":"))
            e = float(end[0])*3600+float(end[1])*60+float(end[2])
            # Millisecond slice of the audio for this caption.
            temp = track[s*1000:e*1000]
            # Skip clips with too little text or outside the 3–20 s range.
            if len(clean_text.split(' ')) <= 3 or (e-s) < 3 or (e-s) > 20:
                continue
            if not path.exists(save_location+'/'+id+'/'):
                os.makedirs(save_location+'/'+id+'/')
            with open(save_location+'/'+id+'/'+str(i)+".txt", "w") as text_file:
                text_file.write(clean_text.replace('\n', ' '))
            temp.export(save_location + '/' + id+'/'+str(i)+".wav",
                        format="wav")
            i += 1
    except MalformedCaptionError as e:
        # Best-effort: a malformed subtitle file is silently skipped.
        pass
def parse_subs_into_word_tockens_list(source_path, subs_file):
    """Extract per-word timing tokens from a YouTube-style VTT file.

    Returns a list of dicts {'filename', 'start', 'end', 'text'} — one
    per word, with start/end timestamps taken from the inline <hh:mm:ss>
    word timings (cue start/end used for the boundary words).
    """
    result = []
    for caption in webvtt.read(source_path):
        arr = caption.raw_text.split('\n')
        # Only the second raw line carries the word-timing markup.
        if(len(arr) > 1):
            line = arr[1]
            # Strip <c>/</c> styling tags but keep <timestamp> tags.
            line = re.sub('[<][\/]?[c][^<]*[>]', "", line)
            line = line.replace(" ", "").lower()
            if line:
                # Wrap with the cue's own start/end so every word has
                # both a preceding and a following timestamp.
                line = "<" + caption.start + ">" + line + "<" + caption.end + ">"
                # tockens alternates: time, word, time, word, ..., time
                tockens = list(filter(None, re.split('[<>]', line)))
                count = 0
                # Step by 2: each word is flanked by two timestamps.
                while count < len(tockens) - 2:
                    start = tockens[count]
                    text = tockens[count + 1]
                    end = tockens[count + 2]
                    toAppend = {
                        'filename': subs_file,
                        'start': start,
                        'end': end,
                        'text': text
                    }
                    result.append(toAppend)
                    count += 2
    return result
def file_writing(path):
    """Annotate each caption of 'static/subtitle.vtt' with its predicted
    emotion and a color tag, writing the result to 'static/my_captions.vtt'.

    joy -> green, fear/anger/sadness -> red, anything else -> blue.
    """
    vtt = WebVTT()
    for line in webvtt.read('static/subtitle.vtt'):
        emotion = predict(str(line.text))
        # Bug fix: the original compared strings with `is`, which tests
        # object identity, not equality — every caption fell through to
        # the final (blue) branch. Use equality/membership instead.
        if emotion == "joy":
            color = "green"
        elif emotion in ("fear", "anger", "sadness"):
            color = "red"
        else:
            # 'neutral' and any unknown emotion both render blue.
            color = "blue"
        caption = Caption(
            line.start, line.end,
            "<c." + color + "> " + emotion + ": " + line.text + "</c>")
        vtt.captions.append(caption)
    vtt.save('static/my_captions.vtt')
def test_save_updated_identifiers(self):
    """Edited, cleared, and newly added identifiers must all be saved."""
    os.makedirs(OUTPUT_DIR)
    copy(self._get_file('using_identifiers.vtt'), OUTPUT_DIR)
    vtt = webvtt.read(os.path.join(OUTPUT_DIR, 'using_identifiers.vtt'))
    vtt.captions[0].identifier = 'first caption'
    vtt.captions[1].identifier = None
    vtt.captions[3].identifier = '44'
    last_caption = Caption('00:00:27.280', '00:00:29.200', 'Caption text #7')
    last_caption.identifier = 'last caption'
    vtt.captions.append(last_caption)
    destination = os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt')
    vtt.save(destination)
    with open(destination, 'r', encoding='utf-8') as f:
        lines = [line.rstrip() for line in f]
    self.assertListEqual(lines, [
        'WEBVTT',
        '',
        'first caption',
        '00:00:00.500 --> 00:00:07.000',
        'Caption text #1',
        '',
        '00:00:07.000 --> 00:00:11.890',
        'Caption text #2',
        '',
        '00:00:11.890 --> 00:00:16.320',
        'Caption text #3',
        '',
        '44',
        '00:00:16.320 --> 00:00:21.580',
        'Caption text #4',
        '',
        '00:00:21.580 --> 00:00:23.880',
        'Caption text #5',
        '',
        '00:00:23.880 --> 00:00:27.280',
        'Caption text #6',
        '',
        'last caption',
        '00:00:27.280 --> 00:00:29.200',
        'Caption text #7',
    ])
def Subtitles_Processor(subtitle, video_title, video_title_summary, video_categories_list):
    """Open a .VTT file, normalize its text, and insert every single word
    into the video's database table via InsertData."""
    print(("INSERTING DATA INTO {} TABLE. THIS PROCESS CAN TAKE A WHILE.").format(video_title))
    # Ordered normalization table: expand contractions, then strip
    # punctuation (order matters — apostrophe removal must come last).
    replacements = [
        ("\\", " "), ("\n", " "),
        ("'d", " would"), ("'t", " not"), ("'s", " is"), ("'m", " am"),
        ("'ll", " will"), ("'re", " are"),
        ('?', " "), ('"', " "), (".", " "), (',', " "), ('-', " "),
        (':', " "), (';', ""), ("'", ""),
    ]
    for caption in webvtt.read(PATH + subtitle):
        minute = (caption.start)[3:5]
        seconds = (caption.start)[6:8]
        cleaned = caption.text
        for old, new in replacements:
            cleaned = cleaned.replace(old, new)
        # Insert word by word into the video's table.
        for word in cleaned.split(' '):
            InsertData(minute, seconds, word.lower(), video_title,
                       video_title_summary, video_categories_list)
    print("COMPLETED\n----------------\n")
def estimate_timing(filename):
    """Estimate a speech tempo (beats per second) from a caption file.

    Each caption's words are converted to a stress pattern; commas and
    periods add pause markers ('3'/'4'). The per-caption beat rate feeds
    an outlier-filtered distribution.
    """
    pieces = []
    for caption in webvtt.read(filename):
        start, end = caption.start_in_seconds, caption.end_in_seconds
        speaker, text = split_speaker(caption.text)
        all_stresses = []
        for word in text.split():
            stress_string = stresses(word)
            # Punctuation appends pause markers weighted via PAUSE_PROPORTION.
            if word.endswith(","):
                stress_string += "3"
            elif word.endswith("."):
                stress_string += "4"
            all_stresses.append(stress_string)
        stress_pattern = "".join(all_stresses)
        # A sentence-final pause shouldn't count against the cue length.
        stress_pattern = stress_pattern.rstrip("4")
        dt = end - start
        n_beats = 0
        for stress_type in stress_pattern:
            # Unknown stress characters count as one full beat.
            n_beats += PAUSE_PROPORTION.get(stress_type, 1)
        pieces.append(n_beats / dt)
    tempo_distribution = remove_outliers(pieces)
    beats_per_second = max(tempo_distribution)
    # NOTE(review): the computed `beats_per_second` is discarded and a
    # constant is returned — looks like a stub/override; confirm intent
    # before relying on this function's estimate.
    return 6
def test_write_captions(self):
    """Captions appended before write() must appear in the output stream."""
    os.makedirs(OUTPUT_DIR)
    copy(self._get_file('one_caption.vtt'), OUTPUT_DIR)
    buffer = io.StringIO()
    vtt = webvtt.read(os.path.join(OUTPUT_DIR, 'one_caption.vtt'))
    extra = Caption('00:00:07.000', '00:00:11.890',
                    ['New caption text line1', 'New caption text line2'])
    vtt.captions.append(extra)
    vtt.write(buffer)
    buffer.seek(0)
    expected_lines = [
        'WEBVTT',
        '',
        '00:00:00.500 --> 00:00:07.000',
        'Caption text #1',
        '',
        '00:00:07.000 --> 00:00:11.890',
        'New caption text line1',
        'New caption text line2',
    ]
    self.assertListEqual(
        [line.rstrip() for line in buffer.readlines()], expected_lines)
def test_parse_styles(self):
    """Parsing a file with a STYLE block exposes the style text."""
    parsed = webvtt.read(self._get_file('styles.vtt'))
    self.assertEqual(len(parsed.captions), 1)
    style_text = parsed.styles[0].text
    self.assertEqual(
        style_text,
        '::cue {background-image: linear-gradient(to bottom, dimgray, lightgray);color: papayawhip;}')
def cutYoutube(self, src="audio"):
    """Cut the downloaded audio into per-caption clips and record each
    clip's metadata (text, start/end, output path) via saveCSV."""
    # Row template; per-caption fields are overwritten in the loop below.
    outputList = {
        "vttPath": self.vttPath,
        "wavPath": self.wavPath,
        "name": self.v,
        "sub_lang": self.sub_lang,
        "text": "",
        "start": "",
        "end": "",
        "key": "",
        "cutFile": ""
    }
    i = 0
    sound = AudioSegment.from_file(self.wavPath)
    for caption in webvtt.read(self.vttPath):
        outputList['text'] = (caption.text.replace("\n", ""))  # caption text, newlines stripped
        outputList['start'] = (caption.start)  # cue start timestamp
        outputList['end'] = (caption.end)  # cue end timestamp
        outputList['cutFile'] = src + "/" + self.v + \
            "_" + str(i) + "."+self.audioType  # recorded clip path
        # Slice the audio between the cue's bounds; timemath presumably
        # converts a timestamp string to milliseconds — TODO confirm.
        first_half = sound[timemath(caption.start):timemath(caption.end)]
        first_half.export(src + "/" + self.v + "_" + str(i) + ".wav",
                          format="wav")
        i = i + 1
        # NOTE(review): one CSV row per caption; the dict is mutated and
        # reused, so saveCSV must copy or write immediately — confirm.
        self.saveCSV(outputList)
def video_indices(subtitle, text_summary, tolerance=0):
    """Find [start, end] caption time ranges covering each summary sentence.

    For each sentence of the summary, the first five words anchor the
    starting caption and words -6..-2 anchor the ending caption.
    NOTE(review): `tolerance` is accepted but never used.
    """
    text_summary = open(text_summary, "r").read()
    sents = [word_tokenize(s.lower()) for s in sent_tokenize(text_summary)]
    captions = webvtt.read(subtitle)
    cuttimes = []
    for sent in sents:
        beg = ' '.join(sent[:5])
        last = ' '.join(sent[-6:-1])
        for i, caption in enumerate(captions):
            captext = caption.text.replace("\n", " ")
            captext = filter_captext(captext)
            if beg in captext:
                start = caption.start
                # Search forward from caption i for the sentence's tail.
                for j, cap in enumerate(captions):
                    if i > j:
                        continue
                    captext = cap.text.replace("\n", " ")
                    captext = filter_captext(captext)
                    if last in captext:
                        # NOTE(review): takes `caption.end` (the outer
                        # loop's caption), not `cap.end` — confirm this
                        # isn't meant to be the matching inner caption.
                        end = caption.end
                        break
                # NOTE(review): if `last` never matches, `end` is unbound
                # (or stale from a previous sentence) — verify inputs.
                timestamp = [start, end]
                cuttimes.append([
                    min(timestamp, key=time_for_one),
                    max(timestamp, key=time_for_one)
                ])
                break
    # NOTE(review): `cutttimes_filter` has a triple 't' — confirm it
    # matches the helper's actual name elsewhere in the project.
    return cutttimes_filter(cuttimes, 10, 1000)
def test_parse_identifiers(self):
    """Identifiers parse as strings where present, None where absent."""
    vtt = webvtt.read(self._get_file('using_identifiers.vtt'))
    self.assertEqual(len(vtt.captions), 6)
    second, third, fourth = vtt.captions[1], vtt.captions[2], vtt.captions[3]
    self.assertEqual(second.identifier, 'second caption')
    self.assertEqual(third.identifier, None)
    self.assertEqual(fourth.identifier, '4')
def vtt_to_df(vtt_filename):
    """Flatten a .vtt file into a DataFrame with columns start/end/text,
    dropping consecutive duplicate text lines."""
    parsed = webvtt.read(vtt_filename)
    lines, starts, ends = [], [], []
    for caption in parsed:
        pieces = caption.text.strip().splitlines()
        lines.extend(pieces)
        # Every split line inherits its caption's timestamps.
        starts.extend([caption.start] * len(pieces))
        ends.extend([caption.end] * len(pieces))
    new_lines, new_starts, new_ends = [], [], []
    previous = None
    for text, start, end in zip(lines, starts, ends):
        if text == previous:
            continue
        new_lines.append(text)
        new_starts.append(start)
        new_ends.append(end)
        previous = text
    return pd.DataFrame(
        {"start": new_starts, "end": new_ends, "text": new_lines})
def from_vtt_file(self, filepath):
    """Load captions from a .vtt file and append one Subtitle annotation
    per caption to self.annotations, spanning the caption's time range."""
    for caption in webvtt.read(filepath):
        # Timestamps are 'HH:MM:SS.mmm'; float() absorbs the fractional part.
        h, m, s = caption.start.split(':')
        start_time = datetime.timedelta(hours=float(h),
                                        minutes=float(m),
                                        seconds=float(s)).total_seconds()
        h, m, s = caption.end.split(':')
        end_time = datetime.timedelta(hours=float(h),
                                      minutes=float(m),
                                      seconds=float(s)).total_seconds()
        # Frame index corresponding to the caption's start.
        start_frame = round(start_time * self.item.fps)
        annotation_definition = entities.Subtitle(text=caption.text,
                                                  label='Text')
        annotation = entities.Annotation.new(
            annotation_definition=annotation_definition,
            frame_num=start_frame,
            item=self.item,
            start_time=start_time)
        # Extend the annotation through to the caption's end time.
        annotation.add_frames(annotation_definition=annotation_definition,
                              frame_num=start_frame,
                              end_time=end_time)
        self.annotations.append(annotation)
def test_webvtt_parse_get_caption_data(self):
    """A caption exposes start/end in both string and seconds form."""
    caption = webvtt.read(self._get_file('one_caption.vtt')).captions[0]
    self.assertEqual(caption.start_in_seconds, 0.5)
    self.assertEqual(caption.start, '00:00:00.500')
    self.assertEqual(caption.end_in_seconds, 7)
    self.assertEqual(caption.end, '00:00:07.000')
    self.assertEqual(len(caption.lines), 1)
    self.assertEqual(caption.lines[0], 'Caption text #1')
def test_captions_prevent_write(self):
    """The captions attribute must be read-only."""
    vtt = webvtt.read(self._get_file('sample.vtt'))
    with self.assertRaises(AttributeError):
        setattr(vtt, 'captions', [])
def test_clean_cue_tags(self):
    """Cue markup tags must be stripped from the plain .text view."""
    captions = webvtt.read(self._get_file('cue_tags.vtt')).captions
    self.assertEqual(captions[1].text, 'Like a big-a pizza pie')
    self.assertEqual(captions[2].text, 'That\'s amore')
def test_parse_with_comments(self):
    """Comments in the file are ignored; captions parse unchanged."""
    parsed = webvtt.read(self._get_file('comments.vtt'))
    self.assertEqual(len(parsed.captions), 3)
    self.assertListEqual(
        parsed.captions[0].lines,
        ['- Ta en kopp varmt te.', '- Det är inte varmt.'])
    self.assertEqual(parsed.captions[2].text, '- Ta en kopp')
def test_save_updated_identifiers(self):
    """Identifier edits (set, clear, add) must round-trip through save()."""
    os.makedirs(OUTPUT_DIR)
    copy(self._get_file('using_identifiers.vtt'), OUTPUT_DIR)
    vtt = webvtt.read(os.path.join(OUTPUT_DIR, 'using_identifiers.vtt'))
    # Rename, clear, and change existing identifiers.
    vtt.captions[0].identifier = 'first caption'
    vtt.captions[1].identifier = None
    vtt.captions[3].identifier = '44'
    # Append a brand-new identified caption.
    appended = Caption('00:00:27.280', '00:00:29.200', 'Caption text #7')
    appended.identifier = 'last caption'
    vtt.captions.append(appended)
    output_path = os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt')
    vtt.save(output_path)
    expected_lines = [
        'WEBVTT',
        '',
        'first caption',
        '00:00:00.500 --> 00:00:07.000',
        'Caption text #1',
        '',
        '00:00:07.000 --> 00:00:11.890',
        'Caption text #2',
        '',
        '00:00:11.890 --> 00:00:16.320',
        'Caption text #3',
        '',
        '44',
        '00:00:16.320 --> 00:00:21.580',
        'Caption text #4',
        '',
        '00:00:21.580 --> 00:00:23.880',
        'Caption text #5',
        '',
        '00:00:23.880 --> 00:00:27.280',
        'Caption text #6',
        '',
        'last caption',
        '00:00:27.280 --> 00:00:29.200',
        'Caption text #7',
    ]
    with open(output_path, 'r', encoding='utf-8') as f:
        self.assertListEqual(
            [line.rstrip() for line in f.readlines()], expected_lines)
def test_save_captions(self):
    """save() without a path overwrites the file the captions came from."""
    os.makedirs(OUTPUT_DIR)
    copy(self._get_file('one_caption.vtt'), OUTPUT_DIR)
    source = os.path.join(OUTPUT_DIR, 'one_caption.vtt')
    vtt = webvtt.read(source)
    vtt.captions.append(Caption(
        '00:00:07.000', '00:00:11.890',
        ['New caption text line1', 'New caption text line2']))
    vtt.save()
    with open(source, 'r', encoding='utf-8') as f:
        saved = [line.rstrip() for line in f]
    self.assertListEqual(saved, [
        'WEBVTT',
        '',
        '00:00:00.500 --> 00:00:07.000',
        'Caption text #1',
        '',
        '00:00:07.000 --> 00:00:11.890',
        'New caption text line1',
        'New caption text line2',
    ])
def test_parse_captions_with_bom(self):
    """Files starting with a BOM must parse normally."""
    parsed = webvtt.read(self._get_file('captions_with_bom.vtt'))
    self.assertEqual(len(parsed.captions), 4)
def test_captions(self):
    """The captions attribute is a plain list."""
    captions = webvtt.read(self._get_file('sample.vtt')).captions
    self.assertIsInstance(captions, list)
def test_metadata_headers_multiline(self):
    """Multi-line metadata headers must not break caption parsing."""
    parsed = webvtt.read(self._get_file('metadata_headers_multiline.vtt'))
    self.assertEqual(len(parsed.captions), 2)
def test_webvtt__parse_captions(self):
    """Parsing a valid file yields a non-empty captions list."""
    parsed = webvtt.read(self._get_file('sample.vtt'))
    self.assertTrue(parsed.captions)
def test_webvtt_parse_get_captions(self):
    """sample.vtt contains exactly 16 captions."""
    captions = webvtt.read(self._get_file('sample.vtt')).captions
    self.assertEqual(len(captions), 16)
def test_webvtt_timestamps_format(self):
    """Timestamps round-trip in 'HH:MM:SS.mmm' string form."""
    third = webvtt.read(self._get_file('sample.vtt')).captions[2]
    self.assertEqual(third.start, '00:00:11.890')
    self.assertEqual(third.end, '00:00:16.320')
def test_webvtt_caption_without_cue_text(self):
    """Cues lacking text still count as captions."""
    parsed = webvtt.read(self._get_file('missing_caption_text.vtt'))
    self.assertEqual(len(parsed.captions), 5)
def test_save_to_other_location(self):
    """save(<dir>) writes into that directory under the original name."""
    folder = os.path.join(OUTPUT_DIR, 'test_folder')
    os.makedirs(folder)
    vtt = webvtt.read(self._get_file('one_caption.vtt'))
    vtt.save(folder)
    self.assertTrue(
        os.path.exists(os.path.join(folder, 'one_caption.vtt')))
def test_webvtt_total_length(self):
    """total_length reports 64 seconds for sample.vtt."""
    vtt = webvtt.read(self._get_file('sample.vtt'))
    self.assertEqual(vtt.total_length, 64)
import webvtt

# Strip duplicate/blank lines from every file in ./subs/, writing one
# cleaned .txt per input file.
prevline = "HI I AM SHOUTING."  # sentinel unlikely to equal a real first line
captions = []

from os import listdir

files = listdir("./subs/")
for vttFile in files:
    # NOTE(review): listdir() also returns non-.vtt entries (including the
    # .txt files this very loop creates); consider filtering on '.vtt'.
    # Fix: use a context manager so the output file is closed even if
    # webvtt.read raises mid-loop (original used open()/close()).
    with open("./subs/" + vttFile + ".txt", 'w') as output:
        for caption in webvtt.read("./subs/" + vttFile):
            for line in caption.text.split("\n"):
                if line == " ":
                    continue
                elif line != prevline:
                    output.write(line)
                    output.write('\n')
                    prevline = line

# Dead debug code (was a dangling triple-quoted string in the original;
# kept here as comments since a bare string expression never executes):
# print(captions[0])
# print("----")
# print(captions[1].split('\n'))
# print("----")
# print(captions[2])
def test_sequence_iteration(self):
    """The parsed object supports indexing and len() like a sequence."""
    vtt = webvtt.read(self._get_file('sample.vtt'))
    first = vtt[0]
    self.assertIsInstance(first, Caption)
    self.assertEqual(len(vtt), len(vtt.captions))