def test_sbv_get_caption_text_multiline(self):
    # The third caption of sample.sbv has two lines: check the joined text
    # first, then the per-line list.
    third_caption = webvtt.from_sbv(self._get_file('sample.sbv')).captions[2]
    self.assertEqual(
        third_caption.text,
        'Caption text #3 (line 1)\nCaption text #3 (line 2)',
    )
    self.assertListEqual(
        third_caption.lines,
        ['Caption text #3 (line 1)', 'Caption text #3 (line 2)'],
    )
def parse_sbv(sbv_path):
    """
    Parse an SBV file after dropping caption groups that are too short.

    Works around the webvtt library not supporting empty subtitles: the
    file is split into groups separated by blank lines, every group with
    fewer than two lines (presumably a timing line with no text) is
    discarded, and what remains is written to "<basename>.tmp" under
    TMP_DIR. That temporary file is what webvtt.from_sbv parses.
    """
    with open(sbv_path, "r", encoding="utf-8") as src:
        raw_lines = src.readlines()

    kept = []
    group = []
    for raw in raw_lines:
        if raw == "\n":
            # Blank separator: keep the finished group (and the separator)
            # only when the group has at least two lines.
            if len(group) > 1:
                kept += group
                kept.append(raw)
            group = []
        else:
            group.append(raw)
    # The last group is not necessarily followed by a blank line.
    if len(group) > 1:
        kept += group

    tmp_name = path.basename(sbv_path) + ".tmp"
    out_path = path.join(TMP_DIR, tmp_name)
    with open(out_path, "w", encoding="utf-8") as dst:
        dst.writelines(kept)
    return webvtt.from_sbv(out_path)
def test_sbv_conversion(self):
    # Convert a copy of two_captions.sbv placed in OUTPUT_DIR and compare
    # the saved .vtt file, line by line, with the expected output.
    os.makedirs(OUTPUT_DIR)
    copy(self._get_file('two_captions.sbv'), OUTPUT_DIR)
    sbv_path = os.path.join(OUTPUT_DIR, 'two_captions.sbv')
    vtt_path = os.path.join(OUTPUT_DIR, 'two_captions.vtt')

    webvtt.from_sbv(sbv_path).save()
    self.assertTrue(os.path.exists(vtt_path))

    with open(vtt_path, 'r', encoding='utf-8') as vtt_file:
        actual = [raw.rstrip() for raw in vtt_file]

    self.assertListEqual(actual, [
        'WEBVTT',
        '',
        '00:00:00.378 --> 00:00:11.378',
        'Caption text #1',
        '',
        '00:00:11.378 --> 00:00:12.305',
        'Caption text #2 (line 1)',
        'Caption text #2 (line 2)',
    ])
def get_subtitle_file(filename: str) -> WebVTT:
    """Load a subtitle file with the webvtt reader matching its extension.

    ".srt" files go through webvtt.from_srt, ".sbv" through
    webvtt.from_sbv and ".vtt" through webvtt.read. The extension is
    compared exactly as written (case-sensitive).

    Raises:
        ValueError: carrying the file name, for any other extension.
    """
    extension = os.path.splitext(filename)[1]
    # Reject unknown formats up front so the happy path stays flat.
    if extension not in (".srt", ".sbv", ".vtt"):
        raise ValueError(filename)
    if extension == ".vtt":
        return webvtt.read(filename)
    parser = webvtt.from_srt if extension == ".srt" else webvtt.from_sbv
    return parser(filename)
def test_sbv_get_caption_text_multiline(self):
    # A two-line caption must expose both the newline-joined text and the
    # individual lines.
    parsed = webvtt.from_sbv(self._get_file('sample.sbv'))
    multiline_caption = parsed.captions[2]

    joined_text = 'Caption text #3 (line 1)\nCaption text #3 (line 2)'
    self.assertEqual(multiline_caption.text, joined_text)

    separate_lines = ['Caption text #3 (line 1)', 'Caption text #3 (line 2)']
    self.assertListEqual(multiline_caption.lines, separate_lines)
def sbv2df(sbv, textCol):
    """
    Store (start, end, and text) of each time segment in the sbv file in a row
    of a pandas dataframe.

    Input args
        sbv (string): the file path of an sbv file
        textCol (string): the name of the text column

    Returns
        A dataframe with the columns 'start', 'end' and textCol, in that
        order. 'start' and 'end' hold datetime.time values parsed from the
        caption timestamps; newlines in the data are replaced by spaces.
        An sbv file without captions yields an empty dataframe that still
        has those three columns.
    """
    # Keep the parsed captions in a local variable. Declaring `webvtt`
    # global and assigning to it would replace the imported library with
    # the parse result, breaking later uses of `webvtt` in the module.
    captions = webvtt.from_sbv(sbv)

    time_format = '%H:%M:%S.%f'
    data = [
        {
            'start': datetime.strptime(caption.start, time_format).time(),
            'end': datetime.strptime(caption.end, time_format).time(),
            textCol: caption.text,
        }
        for caption in captions
    ]

    # Passing the columns explicitly fixes their order and keeps them
    # present even when there are no rows.
    df = pd.DataFrame(data, columns=['start', 'end', textCol])
    df = df.replace('\n', ' ', regex=True)
    return df
def fix_subtitle_sequencing(filename):
    """
    Remove overlaps between consecutive subtitles and save them as .srt.

    The file is skipped (with a printed notice) when "<filename>.bk"
    already exists, so an earlier backup is never overwritten. Otherwise
    the subtitles are loaded, any subtitle ending after the next one starts
    is cut to end exactly at that start, the original is copied to
    "<filename>.bk", and the result is written next to it with an ".srt"
    extension.

    Command-line flags read from sys.argv:
        --fix-live  first shift every subtitle 8 seconds earlier (clamped
                    at zero) and give it a fixed duration of 4 seconds
        --dry       do all the processing but create no backup and write
                    no output

    Args:
        filename: path of a ".srt" or ".sbv" subtitle file.

    Raises:
        ValueError: if the extension is neither ".srt" nor ".sbv".
    """
    if os.path.isfile(filename + ".bk"):
        print("Not overwriting original backup for {}, skipping.".format(
            filename))
        return

    base, ext = os.path.splitext(filename)
    if ext == ".srt":
        subs = webvtt.from_srt(filename)
    elif ext == ".sbv":
        subs = webvtt.from_sbv(filename)
    else:
        # Fail clearly here rather than with a TypeError on len(None) below.
        raise ValueError(
            "Unsupported subtitle format {!r} for {}; expected .srt or "
            ".sbv".format(ext, filename))

    # Adjust timing and stretch subtitles for fixing the live ones which
    # get messed up by Youtube
    if "--fix-live" in sys.argv:
        zero = timedelta()
        for i in range(len(subs)):
            start = parse_time_stamp(subs[i].start) - timedelta(seconds=8)
            if start < zero:
                start = zero
            end = start + timedelta(seconds=4)
            subs[i].start = format_time_stamp(start)
            subs[i].end = format_time_stamp(end)

    # Trim each subtitle that runs past the start of the following one.
    for i in range(len(subs) - 1):
        end = parse_time_stamp(subs[i].end)
        next_start = parse_time_stamp(subs[i + 1].start)
        if end > next_start:
            subs[i].end = subs[i + 1].start

    if "--dry" in sys.argv:
        return

    # Back up first: for .srt input the output path is the input path.
    shutil.copy(filename, filename + ".bk")
    out_srt = base + ".srt"
    with open(out_srt, "w", encoding="utf8") as f:
        subs.write(f, format="srt")
def test_sbv_conversion(self):
    # Saving a parsed SBV file must create a sibling .vtt file holding the
    # same captions in WebVTT syntax.
    expected_lines = [
        'WEBVTT',
        '',
        '00:00:00.378 --> 00:00:11.378',
        'Caption text #1',
        '',
        '00:00:11.378 --> 00:00:12.305',
        'Caption text #2 (line 1)',
        'Caption text #2 (line 2)',
    ]

    os.makedirs(OUTPUT_DIR)
    copy(self._get_file('two_captions.sbv'), OUTPUT_DIR)

    converted = webvtt.from_sbv(os.path.join(OUTPUT_DIR, 'two_captions.sbv'))
    converted.save()

    output_path = os.path.join(OUTPUT_DIR, 'two_captions.vtt')
    self.assertTrue(os.path.exists(output_path))

    with open(output_path, 'r', encoding='utf-8') as handle:
        written_lines = list(map(str.rstrip, handle))

    self.assertListEqual(written_lines, expected_lines)
def test_sbv_timestamps_in_seconds(self):
    # The second caption of sample.sbv runs from 11.378 s to 12.305 s.
    second_caption = webvtt.from_sbv(self._get_file('sample.sbv')).captions[1]
    self.assertEqual(second_caption.start_in_seconds, 11.378)
    self.assertEqual(second_caption.end_in_seconds, 12.305)
def test_sbv_missing_caption_text(self):
    # Parsing missing_caption_text.sbv must still produce a non-empty
    # caption list.
    sbv_path = self._get_file('missing_caption_text.sbv')
    parsed = webvtt.from_sbv(sbv_path)
    self.assertTrue(parsed.captions)
def test_sbv_get_caption_text(self):
    # The second caption of sample.sbv is a single line of text.
    second_caption = webvtt.from_sbv(self._get_file('sample.sbv')).captions[1]
    self.assertEqual(second_caption.text, 'Caption text #2')
def test_sbv_timestamps_format(self):
    # Timestamps are exposed as HH:MM:SS.mmm strings.
    second_caption = webvtt.from_sbv(self._get_file('sample.sbv')).captions[1]
    self.assertEqual(second_caption.start, '00:00:11.378')
    self.assertEqual(second_caption.end, '00:00:12.305')
def test_sbv_total_length(self):
    # sample.sbv is expected to report a total length of 16.
    parsed = webvtt.from_sbv(self._get_file('sample.sbv'))
    self.assertEqual(parsed.total_length, 16)
def test_sbv_total_length(self):
    # Check the total length reported for the parsed sample file.
    sample_path = self._get_file('sample.sbv')
    total_length = webvtt.from_sbv(sample_path).total_length
    self.assertEqual(total_length, 16)