def vtt_to_df(fn):
    """Convert a WebVTT subtitle file into a pandas DataFrame.

    Args:
        fn: Path to the ``.vtt`` file.

    Returns:
        DataFrame with one row per ``en-US`` caption and the columns
        ``time`` (label of the form ``<H>h<MM>m<SS>s``), ``start`` (whole
        seconds from the beginning of the file), ``duration`` and ``text``.
    """
    with open(fn) as f:
        text = f.read()

    vtt = WebVTTReader().read(text)

    # strptime() without a date component yields 1900-01-01, so subtracting
    # this reference leaves the offset from the start of the file.
    reference = dt.datetime(1900, 1, 1)

    subtitles = []
    for caption in vtt.get_captions('en-US'):
        # Parse the start stamp once and reuse it for both derived columns.
        started = dt.datetime.strptime(caption.format_start(), '%H:%M:%S.%f')
        subtitles.append({
            # Minutes come from .minute; the former '%m' directive printed
            # the month.  The label is built by hand because '%-H' is a
            # platform-specific strftime extension.
            'time': '{}h{:02d}m{:02d}s'.format(
                started.hour, started.minute, started.second),
            'start': int((started - reference).total_seconds()),
            # NOTE(review): if caption.start/end are microseconds, this
            # divisor yields tenths of a second rather than seconds --
            # confirm the intended unit before changing it.
            'duration': (caption.end - caption.start) / 100000,
            'text': caption.get_text(),
        })

    return pd.DataFrame(subtitles)
def test_invalid_files(self):
    """Cues without text and badly ordered timings must be rejected."""
    cue_without_text = (
        u"\nNOTE Cues without text are invalid.\n"
        u"00:00:20,000 --> 00:00:30,000\n"
        u"\n"
        u"00:00:40,000 --> 00:00:50,000\n"
        u"foo bar baz\n"
    )
    end_before_start = (
        u"00:00:20,000 --> 00:00:10,000\n"
        u"Start time is greater than end time."
    )
    out_of_order = (
        u"00:00:20,000 --> 00:00:30,000\n"
        u"Start times should be consecutive.\n"
        u"\n"
        u"00:00:10,000 --> 00:00:20,000\n"
        u"This cue starts before the previous one.\n"
    )

    default_read = WebVTTReader().read
    with self.assertRaises(CaptionReadSyntaxError):
        default_read(cue_without_text)

    # Each timing check gets its own reader with timing errors enabled.
    for broken in (end_before_start, out_of_order):
        strict_read = WebVTTReader(ignore_timing_errors=False).read
        with self.assertRaises(CaptionReadError):
            strict_read(broken)
def test_ignoring_timing_errors(self):
    """Default reader: syntax errors raise, timing problems do not."""
    # Even if timing errors are ignored, these have to raise an exception.
    malformed_inputs = (
        "\nNOTE invalid cue stamp\n00:00:20.000 --> \nfoo bar baz\n",
        "\n00:00:20,000 --> 00:00:22,000\n"
        "Note the comma instead of point.\n",
    )
    for text in malformed_inputs:
        with pytest.raises(CaptionReadSyntaxError):
            WebVTTReader().read(text)

    # TODO: at this point it can be split into 2 separate tests
    timing_only_inputs = (
        "\n"
        "00:00:20.000 --> 00:00:10.000\n"
        "Start time is greater than end time.\n",
        "\n"
        "00:00:20.000 --> 00:00:30.000\n"
        "Start times should be consecutive.\n"
        "\n"
        "00:00:10.000 --> 00:00:20.000\n"
        "This cue starts before the previous one.\n",
    )
    for text in timing_only_inputs:
        try:
            WebVTTReader().read(text)
        except CaptionReadError:
            pytest.fail("Shouldn't raise CaptionReadError")
def test_not_ignoring_timing_errors(self):
    """With ignore_timing_errors=False every sample below must raise."""
    expectations = (
        (CaptionReadSyntaxError,
         u"\n"
         u"00:00:20,000 --> 00:00:10,000\n"
         u"foo bar baz"),
        (CaptionReadError,
         u"00:00:20,000 --> 00:00:10,000\n"
         u"Start time is greater than end time.\n"),
        (CaptionReadError,
         u"00:00:20,000 --> 00:00:30,000\n"
         u"Start times should be consecutive.\n"
         u"\n"
         u"00:00:10,000 --> 00:00:20,000\n"
         u"This cue starts before the previous one.\n"),
    )
    for expected_error, text in expectations:
        strict_read = WebVTTReader(ignore_timing_errors=False).read
        with self.assertRaises(expected_error):
            strict_read(text)
def get_captions_from_output(self, output: str, language: str = 'en') -> "pd.DataFrame":
    """Parse WebVTT text into a DataFrame of start, end and content.

    Args:
        output: WebVTT document as text.
        language: Language code passed to the reader and used to select
            the captions.

    Returns:
        DataFrame with the columns ``start``, ``end`` and ``content``, one
        row per caption.
    """
    reader = WebVTTReader()
    starts, ends, contents = [], [], []
    for caption in reader.read(output, language).get_captions(language):
        rendered = str(caption)
        # Keep only what follows the last literal backslash-n in the
        # rendered caption and drop single quotes.
        # NOTE(review): presumably str(caption) is a quoted repr whose last
        # segment is the cue text -- confirm against the caption class.
        stripped = rendered.split("\\n")[-1].replace("'", '')
        timestamp = self.get_time_from_caption(rendered)
        # Fill the columns directly.  Joining the fields with "," and
        # splitting them again broke on any caption text with a comma.
        starts.append(timestamp[0])
        ends.append(timestamp[1])
        contents.append(stripped)

    return pd.DataFrame({'start': starts, 'end': ends, 'content': contents})
def test_ignoring_timing_errors(self):
    """Default reader: a broken cue stamp raises, timing problems do not."""
    # Even if timing errors are ignored, this is worse
    invalid_stamp = (
        u"\nNOTE invalid cue stamp\n"
        u"00:00:20,000 --> \n"
        u"foo bar baz\n"
    )
    default_read = WebVTTReader().read
    with self.assertRaises(CaptionReadSyntaxError):
        default_read(invalid_stamp)

    timing_only_inputs = (
        u"\n"
        u"00:00:20,000 --> 00:00:10,000\n"
        u"Start time is greater than end time.\n",
        u"\n"
        u"00:00:20,000 --> 00:00:30,000\n"
        u"Start times should be consecutive.\n"
        u"\n"
        u"00:00:10,000 --> 00:00:20,000\n"
        u"This cue starts before the previous one.\n",
    )
    for text in timing_only_inputs:
        try:
            WebVTTReader().read(text)
        except CaptionReadError:
            self.fail(u"Shouldn't raise CaptionReadError")
def test_ignoring_timing_errors(self):
    """Default reader: syntax errors raise, timing problems do not."""
    # Even if timing errors are ignored, these have to raise an exception.
    malformed_inputs = (
        "\nNOTE invalid cue stamp\n"
        "00:00:20.000 --> \n"
        "foo bar baz\n",
        "\n00:00:20,000 --> 00:00:22,000\n"
        "Note the comma instead of point.\n",
    )
    for text in malformed_inputs:
        default_read = WebVTTReader().read
        with self.assertRaises(CaptionReadSyntaxError):
            default_read(text)

    timing_only_inputs = (
        "\n"
        "00:00:20.000 --> 00:00:10.000\n"
        "Start time is greater than end time.\n",
        "\n"
        "00:00:20.000 --> 00:00:30.000\n"
        "Start times should be consecutive.\n"
        "\n"
        "00:00:10.000 --> 00:00:20.000\n"
        "This cue starts before the previous one.\n",
    )
    for text in timing_only_inputs:
        try:
            WebVTTReader().read(text)
        except CaptionReadError:
            self.fail("Shouldn't raise CaptionReadError")
def get_subs(vtt_subs_path):
    """Load a WebVTT file and return its cues as a list of dicts.

    The file content is decoded with the encoding reported by
    ``utils.get_file_encoding`` and the captions of the first language in
    the parsed set are used.  A short summary of the first cue is printed.

    Returns a list of ``{"text", "start", "end"}`` dicts where start and
    end are the cue values divided by 1000000 as floats.
    """
    with open(vtt_subs_path, 'r') as handle:
        text = handle.read().decode(utils.get_file_encoding(vtt_subs_path))

    caption_set = WebVTTReader().read(text)
    first_language = caption_set.get_languages()[0]
    vttsubs = caption_set.get_captions(first_language)

    # Single-argument print calls behave the same with and without
    # print_function.
    first_cue = vttsubs[0]
    print("vttsubs total: %i " % len(vttsubs))
    print(first_cue.start)
    print(first_cue.end)
    print(first_cue.get_text())

    return [
        {
            "text": cue.get_text(),
            "start": float(cue.start) / 1000000,
            "end": float(cue.end) / 1000000,
        }
        for cue in vttsubs
    ]
def test_invalid_files(self):
    """Badly ordered timings raise when timing errors are not ignored."""
    broken_inputs = (
        u"00:00:20.000 --> 00:00:10.000\n"
        u"Start time is greater than end time.",
        u"00:00:20.000 --> 00:00:30.000\n"
        u"Start times should be consecutive.\n"
        u"\n"
        u"00:00:10.000 --> 00:00:20.000\n"
        u"This cue starts before the previous one.\n",
    )
    for text in broken_inputs:
        strict_read = WebVTTReader(ignore_timing_errors=False).read
        with self.assertRaises(CaptionReadError):
            strict_read(text)
def test_invalid_files(self):
    """Badly ordered timings raise when timing errors are not ignored."""
    broken_inputs = (
        "00:00:20.000 --> 00:00:10.000\n"
        "Start time is greater than end time.",
        "00:00:20.000 --> 00:00:30.000\n"
        "Start times should be consecutive.\n"
        "\n"
        "00:00:10.000 --> 00:00:20.000\n"
        "This cue starts before the previous one.\n",
    )
    for text in broken_inputs:
        with pytest.raises(CaptionReadError):
            WebVTTReader(ignore_timing_errors=False).read(text)
def get_captions_from_output(self, output: str, url: str) -> str:
    """Turn WebVTT text into caption lines with the timing removed.

    Each ``en-US`` caption is rendered with ``str()``, literal backslash-n
    sequences are replaced by spaces, and the result is passed through
    ``self.remove_time_from_caption`` before a newline is appended.

    When ``self.search_query`` is empty the lines are returned concatenated.
    Otherwise they are handed to ``self.process_captions`` with ``url``.
    """
    caption_set = WebVTTReader().read(output)
    lines = [
        self.remove_time_from_caption(
            url, str(cue).replace(r'\n', " ")) + "\n"
        for cue in caption_set.get_captions('en-US')
    ]
    if self.search_query != '':
        return self.process_captions(lines, url)
    return ''.join(lines)
def test_webvtt_to_microdvd_conversion(self, sample_microdvd, sample_webvtt):
    """WebVTT read and written as MicroDVD matches the MicroDVD sample."""
    parsed = WebVTTReader().read(sample_webvtt)
    microdvd = MicroDVDWriter().write(parsed)

    assert isinstance(microdvd, str)
    self.assert_microdvd_equals(sample_microdvd, microdvd)
def test_webvtt_to_webvtt_conversion(self, sample_webvtt_from_webvtt,
                                     sample_webvtt):
    """WebVTT read and written back matches the round-trip sample."""
    parsed = WebVTTReader().read(sample_webvtt)
    rewritten = WebVTTWriter().write(parsed)

    assert isinstance(rewritten, str)
    self.assert_webvtt_equals(sample_webvtt_from_webvtt, rewritten)
def test_positioning_is_kept(self, sample_webvtt_from_dfxp_with_positioning):
    """A read/write round trip reproduces the positioned sample exactly."""
    source = sample_webvtt_from_dfxp_with_positioning
    parsed = WebVTTReader().read(source)
    rewritten = WebVTTWriter().write(parsed)

    assert source == rewritten
def test_webvtt_to_dfxp_conversion(self):
    """Decoded WebVTT sample written as DFXP matches the DFXP sample."""
    webvtt_text = SAMPLE_WEBVTT.decode(u'utf-8')
    parsed = WebVTTReader().read(webvtt_text)
    dfxp = DFXPWriter().write(parsed)

    self.assertTrue(isinstance(dfxp, unicode))
    # Styling and spans are excluded from the comparison.
    self.assertDFXPEquals(
        SAMPLE_DFXP_UNICODE,
        dfxp,
        ignore_styling=True,
        ignore_spans=True,
    )
def get_captions_from_output(self, output: str) -> str:
    """Flatten WebVTT text into one space-separated string of caption lines.

    Each ``en-US`` caption is rendered with ``str()``, literal backslash-n
    sequences become real newlines, and ``self.remove_time_from_caption``
    is applied.  The concatenated text is split into lines, lines equal to
    the line before them are dropped, and the rest are joined with spaces.
    """
    cues = WebVTTReader().read(output).get_captions('en-US')
    merged = ''.join(
        self.remove_time_from_caption(str(cue).replace(r'\n', "\n"))
        for cue in cues
    )

    kept = []
    last_seen = ''  # starts empty, so a leading empty line is dropped too
    for line in merged.split("\n"):
        if line != last_seen:
            kept.append(line)
            last_seen = line

    return ' '.join(kept)
def test_webvtt_to_dfxp_conversion(self):
    """WebVTT sample written as DFXP matches the DFXP sample."""
    parsed = WebVTTReader().read(SAMPLE_WEBVTT)
    dfxp = DFXPWriter().write(parsed)

    self.assertTrue(isinstance(dfxp, six.text_type))
    # Styling and spans are excluded from the comparison.
    self.assertDFXPEquals(
        SAMPLE_DFXP,
        dfxp,
        ignore_styling=True,
        ignore_spans=True,
    )
def convert_vtt_to_srt(dir):
    """Convert every ``*.vtt`` file in a directory to ``.srt`` in place.

    For each WebVTT file a sibling file with the ``.srt`` extension is
    written and the ``.vtt`` source is then deleted.

    Args:
        dir: Directory to scan (non-recursive) for ``*.vtt`` files.

    NOTE(review): calling ``.decode()`` / ``.encode()`` on text-mode file
    data only works on Python 2 -- confirm the target interpreter.
    """
    for vtt_file in glob.glob(os.path.join(dir, "*.vtt")):
        # Read via a context manager so the handle is released even when
        # decoding fails.
        with open(vtt_file, 'r') as vtt:
            vttsub = vtt.read().decode('UTF-8')

        # Convert before creating the output file, so a failed conversion
        # no longer leaves an empty .srt behind.
        srtsub = SRTWriter().write(WebVTTReader().read(vttsub))

        with open(os.path.splitext(vtt_file)[0] + '.srt', 'w') as srt:
            srt.write(srtsub.encode('UTF-8'))

        # Only reached once the .srt has been written and closed.
        os.remove(vtt_file)
def test_webvtt_to_dfxp_conversion(self, sample_dfxp, sample_webvtt):
    """WebVTT read and written as DFXP matches the DFXP sample."""
    parsed = WebVTTReader().read(sample_webvtt)
    dfxp = DFXPWriter().write(parsed)

    assert isinstance(dfxp, str)
    # Styling and spans are excluded from the comparison.
    self.assert_dfxp_equals(
        sample_dfxp,
        dfxp,
        ignore_styling=True,
        ignore_spans=True,
    )
def test_not_ignoring_timing_errors(self):
    """With ignore_timing_errors=False every sample below must raise."""
    # TODO: same assert w/ different arguments -> this can be parametrized
    broken_inputs = (
        "\n"
        "00:00:20.000 --> 00:00:10.000\n"
        "foo bar baz",
        "00:00:20.000 --> 00:00:10.000\n"
        "Start time is greater than end time.\n",
        "00:00:20.000 --> 00:00:30.000\n"
        "Start times should be consecutive.\n"
        "\n"
        "00:00:10.000 --> 00:00:20.000\n"
        "This cue starts before the previous one.\n",
    )
    for text in broken_inputs:
        with pytest.raises(CaptionReadError):
            WebVTTReader(ignore_timing_errors=False).read(text)
def get(content_id, lang):
    """Download the WebVTT transcript of a content item and save it as SRT.

    The transcript list is requested from ``api_url`` and the link found
    under ``//transcripts/<lang>`` is rewritten to its WebVTT variant.  The
    captions are converted to SRT and written with CRLF line endings to
    ``<content_id>.<lang>.srt`` in the current directory.

    Args:
        content_id: Identifier sent as the ``content_id`` query parameter
            and used in the output file name.
        lang: Element name looked up under ``transcripts`` and used in the
            output file name.

    Returns:
        0 after the file has been written.

    Raises:
        IndexError: If the response holds no transcript link for ``lang``.
    """
    args = {'content_id': content_id}
    CClist = requests.get(api_url, params=args)

    transcript_links = etree.HTML(CClist.content).xpath(
        '//transcripts/' + lang + '/text()')
    # Rewrite the listed link so it points at the WebVTT rendition.
    CClink = (transcript_links[0]
              .replace('captions', 'captions_webvtt')
              .replace('smi', 'vtt'))

    origCC = requests.get(CClink)
    srtCC = SRTWriter().write(WebVTTReader().read(origCC.text))

    # The context manager closes the file even if the write fails.
    with open(content_id + '.' + lang + '.srt', 'w') as srt_file:
        srt_file.write(srtCC.replace('\n', '\r\n').encode('utf-8'))
    return 0
def getCaptions(url, progress_cb, so_far, task_weight):
    """Fetch the English captions of a video as parsed caption objects.

    Args:
        url: Video URL handed to youtube_dl.
        progress_cb: Callable invoked as ``progress_cb(done, done)`` with
            ``done = so_far + task_weight`` once the captions are parsed.
        so_far: Progress accumulated before this task.
        task_weight: Progress contributed by this task.

    Returns:
        The ``en-US`` captions of the downloaded WebVTT track, or an empty
        list when the video has no English subtitle track.
    """
    ydl = youtube_dl.YoutubeDL({
        'writesubtitles': True,
        'allsubtitles': True,
        'writeautomaticsub': True,
    })

    with ydl:
        res = ydl.extract_info(url, download=False)

        # .get() throughout: a missing 'requested_subtitles' or 'en' key
        # means "no English captions" rather than a KeyError.
        english = (res.get('requested_subtitles') or {}).get('en')
        if not english:
            # This message used to sit after the return and never ran.
            print('Youtube Video does not have any english captions')
            return []

        print('Grabbing vtt file from ' + english['url'])
        response = requests.get(english['url'], stream=True)
        payload = b"".join(response.iter_content(1024))

        # WebVTT files are UTF-8; decoding as ASCII failed on any
        # non-ASCII caption text.
        arr = WebVTTReader().read(payload.decode('utf-8'))
        progress_cb(so_far + task_weight, so_far + task_weight)
        return arr.get_captions('en-US')
class WebVTTReaderTestCase(unittest.TestCase):
    """Detection, parsing and clean-up checks for WebVTTReader."""

    def setUp(self):
        """Give every test a fresh reader."""
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        """The WebVTT sample is detected."""
        detected = self.reader.detect(SAMPLE_WEBVTT)
        self.assertTrue(detected)

    def test_negative_answer_for_detection(self):
        """The SRT sample is not detected."""
        detected = self.reader.detect(SAMPLE_SRT)
        self.assertFalse(detected)

    def test_caption_length(self):
        """The WebVTT sample yields seven en-US captions."""
        caption_set = self.reader.read(SAMPLE_WEBVTT)
        cues = caption_set.get_captions('en-US')
        self.assertEqual(len(cues), 7)

    def test_read_supports_multiple_languages(self):
        """Captions read with lang='es' are available under 'es'."""
        caption_set = self.reader.read(SAMPLE_WEBVTT, lang='es')
        self.assertIsNotNone(caption_set.get_captions('es'))

    def test_proper_timestamps(self):
        """The third cue has the expected start and end values."""
        third_cue = self.reader.read(SAMPLE_WEBVTT).get_captions('en-US')[2]
        self.assertEqual(third_cue.start, 17000000)
        self.assertEqual(third_cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        """_clean() strips cue markup and renders the voice tag as a name."""
        marked_up = (
            "\n"  # the first line is skipped by the cleaner
            "<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            "its shortcomings</i>, but it is<u> the largest</u> collective "
            "knowledge construction endevour</c> <ruby>base text <rt>"
            "annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        plain = (
            "Wikipedia is a great adventure. It may have "
            "its shortcomings, but it is the largest collective "
            "knowledge construction endevour base text annotation"
            " Audry: Yes, indeed!"
        )
        self.assertEqual(self.reader._clean(marked_up), plain)

    def test_empty_file(self):
        """Reading the empty sample raises CaptionReadNoCaptions."""
        read = WebVTTReader().read
        with self.assertRaises(CaptionReadNoCaptions):
            read(SAMPLE_WEBVTT_EMPTY)
def test_empty_file(self):
    """Reading the decoded empty sample raises CaptionReadNoCaptions."""
    read = WebVTTReader().read
    empty_document = SAMPLE_WEBVTT_EMPTY.decode(u'utf-8')
    with self.assertRaises(CaptionReadNoCaptions):
        read(empty_document)
def setUp(self):
    """Give every test a fresh WebVTTReader as ``self.reader``."""
    fresh_reader = WebVTTReader()
    self.reader = fresh_reader
def test_positioning_is_kept(self):
    """A read/write round trip reproduces the positioned sample exactly."""
    source = SAMPLE_WEBVTT_FROM_DFXP_WITH_POSITIONING
    parsed = WebVTTReader().read(source)
    rewritten = WebVTTWriter().write(parsed)

    self.assertEqual(source, rewritten)
class WebVTTReaderTestCase(unittest.TestCase):
    """Detection, parsing and style-stripping checks for WebVTTReader."""

    def setUp(self):
        """Give every test a fresh reader."""
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        """The decoded WebVTT sample is detected."""
        detected = self.reader.detect(SAMPLE_WEBVTT.decode(u'utf-8'))
        self.assertTrue(detected)

    def test_negative_answer_for_detection(self):
        """The decoded SRT sample is not detected."""
        detected = self.reader.detect(SAMPLE_SRT.decode(u'utf-8'))
        self.assertFalse(detected)

    def test_caption_length(self):
        """The second WebVTT sample yields seven en-US captions."""
        caption_set = self.reader.read(SAMPLE_WEBVTT_2.decode(u'utf-8'))
        cues = caption_set.get_captions(u'en-US')
        self.assertEqual(len(cues), 7)

    def test_read_supports_multiple_languages(self):
        """Captions read with lang='es' are available under 'es'."""
        caption_set = self.reader.read(
            SAMPLE_WEBVTT.decode(u'utf-8'), lang=u'es')
        self.assertIsNotNone(caption_set.get_captions(u'es'))

    def test_proper_timestamps(self):
        """The third cue has the expected start and end values."""
        caption_set = self.reader.read(SAMPLE_WEBVTT.decode(u'utf-8'))
        third_cue = caption_set.get_captions(u'en-US')[2]
        self.assertEqual(third_cue.start, 17000000)
        self.assertEqual(third_cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        """_remove_styles() strips markup and renders the voice tag as a name."""
        marked_up = (
            u"<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            u"its shortcomings</i>, but it is<u> the largest</u> collective "
            u"knowledge construction endevour</c> <ruby>base text <rt>"
            u"annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        plain = (
            u"Wikipedia is a great adventure. It may have "
            u"its shortcomings, but it is the largest collective "
            u"knowledge construction endevour base text annotation"
            u" Audry: Yes, indeed!"
        )
        self.assertEqual(self.reader._remove_styles(marked_up), plain)

    def test_empty_file(self):
        """Reading the decoded empty sample raises CaptionReadNoCaptions."""
        read = WebVTTReader().read
        empty_document = SAMPLE_WEBVTT_EMPTY.decode(u'utf-8')
        with self.assertRaises(CaptionReadNoCaptions):
            read(empty_document)

    def test_invalid_files(self):
        """A cue without text and badly ordered timings must be rejected."""
        # NOTE(review): the line breaks and indentation inside these
        # multi-line literals are reconstructed -- confirm them against the
        # upstream fixture before relying on exact whitespace.
        cue_without_text = u"""
            NOTE Cues without text are invalid.
            00:00:20,000 --> 00:00:10,000
        """
        end_before_start = u"""
            00:00:20,000 --> 00:00:10,000
            Start time is greater than end time.
        """
        out_of_order = u"""
            00:00:20,000 --> 00:00:30,000
            Start times should be consecutive.

            00:00:10,000 --> 00:00:20,000
            This cue starts before the previous one.
        """

        read = WebVTTReader().read
        with self.assertRaises(CaptionReadSyntaxError):
            read(cue_without_text)

        for broken in (end_before_start, out_of_order):
            read = WebVTTReader().read
            with self.assertRaises(CaptionReadError):
                read(broken)
class WebVTTReaderTestCase(unittest.TestCase):
    """Detection, parsing, style-stripping and error checks for WebVTTReader."""

    def setUp(self):
        """Give every test a fresh reader."""
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        """The WebVTT sample is detected."""
        self.assertTrue(self.reader.detect(SAMPLE_WEBVTT))

    def test_negative_answer_for_detection(self):
        """The SRT sample is not detected."""
        self.assertFalse(self.reader.detect(SAMPLE_SRT))

    def test_caption_length(self):
        """The second WebVTT sample yields seven en-US captions."""
        captions = self.reader.read(SAMPLE_WEBVTT_2)
        self.assertEqual(len(captions.get_captions(u'en-US')), 7)

    def test_read_supports_multiple_languages(self):
        """Captions read with lang='es' are available under 'es'."""
        captions = self.reader.read(SAMPLE_WEBVTT, lang=u'es')
        self.assertIsNotNone(captions.get_captions(u'es'))

    def test_proper_timestamps(self):
        """The third cue has the expected start and end values."""
        captions = self.reader.read(SAMPLE_WEBVTT)
        cue = captions.get_captions(u'en-US')[2]
        self.assertEqual(cue.start, 17000000)
        self.assertEqual(cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        """_remove_styles() strips markup and renders the voice tag as a name."""
        result = self.reader._remove_styles(
            u"<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            u"its shortcomings</i>, but it is<u> the largest</u> collective "
            u"knowledge construction endevour</c> <ruby>base text <rt>"
            u"annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        expected = (
            u"Wikipedia is a great adventure. It may have "
            u"its shortcomings, but it is the largest collective "
            u"knowledge construction endevour base text annotation"
            u" Audry: Yes, indeed!"
        )
        self.assertEqual(result, expected)

    def test_empty_file(self):
        """Reading the empty sample raises CaptionReadNoCaptions."""
        self.assertRaises(
            CaptionReadNoCaptions,
            WebVTTReader().read, SAMPLE_WEBVTT_EMPTY)

    def test_not_ignoring_timing_errors(self):
        """With ignore_timing_errors=False every sample below must raise."""
        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"\n"
             u"00:00:20.000 --> 00:00:10.000\n"
             u"foo bar baz")
        )
        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"00:00:20.000 --> 00:00:10.000\n"
             u"Start time is greater than end time.\n")
        )
        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"00:00:20.000 --> 00:00:30.000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10.000 --> 00:00:20.000\n"
             u"This cue starts before the previous one.\n")
        )

    def test_ignoring_timing_errors(self):
        """Default reader: syntax errors raise, timing problems do not."""
        # Even if timing errors are ignored, this has to raise an exception
        self.assertRaises(
            CaptionReadSyntaxError,
            WebVTTReader().read,
            (u"\nNOTE invalid cue stamp\n"
             u"00:00:20.000 --> \n"
             u"foo bar baz\n")
        )
        # And this too
        self.assertRaises(
            CaptionReadSyntaxError,
            WebVTTReader().read,
            (u"\n00:00:20,000 --> 00:00:22,000\n"
             u"Note the comma instead of point.\n")
        )

        try:
            WebVTTReader().read(
                (u"\n"
                 u"00:00:20.000 --> 00:00:10.000\n"
                 u"Start time is greater than end time.\n")
            )
        except CaptionReadError:
            self.fail(u"Shouldn't raise CaptionReadError")

        try:
            WebVTTReader().read(
                (u"\n"
                 u"00:00:20.000 --> 00:00:30.000\n"
                 u"Start times should be consecutive.\n"
                 u"\n"
                 u"00:00:10.000 --> 00:00:20.000\n"
                 u"This cue starts before the previous one.\n")
            )
        except CaptionReadError:
            self.fail(u"Shouldn't raise CaptionReadError")

    def test_invalid_files(self):
        """Cues without text and badly ordered timings must be rejected."""
        self.assertRaises(
            CaptionReadSyntaxError,
            WebVTTReader().read,
            (u"\nNOTE Cues without text are invalid.\n"
             u"00:00:20.000 --> 00:00:30.000\n"
             u"\n"
             u"00:00:40.000 --> 00:00:50.000\n"
             u"foo bar baz\n")
        )
        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"00:00:20.000 --> 00:00:10.000\n"
             u"Start time is greater than end time.")
        )
        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"00:00:20.000 --> 00:00:30.000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10.000 --> 00:00:20.000\n"
             u"This cue starts before the previous one.\n")
        )

    def test_zero_start(self):
        """The first cue of the zero-start sample starts at 0."""
        captions = self.reader.read(SAMPLE_WEBVTT_LAST_CUE_ZERO_START)
        cue = captions.get_captions(u'en-US')[0]
        # assertEqual, not the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(cue.start, 0)
def test_cue_settings_are_kept(self):
    """A read/write round trip reproduces the cue-settings sample exactly."""
    source = SAMPLE_WEBVTT_WITH_CUE_SETTINGS
    parsed = WebVTTReader().read(source)
    rewritten = WebVTTWriter().write(parsed)

    self.assertEqual(source, rewritten)
def test_empty_cues_are_deleted(self):
    """Writing the empty-cue sample back yields the expected cleaned sample."""
    parsed = WebVTTReader().read(SAMPLE_WEBVTT_EMPTY_CUE)
    rewritten = WebVTTWriter().write(parsed)

    self.assertEqual(SAMPLE_WEBVTT_FROM_EMPTY_CUE, rewritten)
def setUpClass(cls):
    """Parse the decoded WebVTT sample once and keep it on the class."""
    webvtt_text = SAMPLE_WEBVTT.decode(u'utf-8')
    cls.captions = WebVTTReader().read(webvtt_text)
def test_webvtt_to_srt_conversion(self):
    """WebVTT sample written as SRT matches the SRT sample."""
    parsed = WebVTTReader().read(SAMPLE_WEBVTT)
    srt = SRTWriter().write(parsed)

    self.assertTrue(isinstance(srt, six.text_type))
    self.assertSRTEquals(SAMPLE_SRT, srt)
class WebVTTReaderTestCase(unittest.TestCase):
    """Detection, parsing, style-stripping and error checks for WebVTTReader."""

    def setUp(self):
        """Give every test a fresh reader."""
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        """The decoded WebVTT sample is detected."""
        detected = self.reader.detect(SAMPLE_WEBVTT.decode(u'utf-8'))
        self.assertTrue(detected)

    def test_negative_answer_for_detection(self):
        """The decoded SRT sample is not detected."""
        detected = self.reader.detect(SAMPLE_SRT.decode(u'utf-8'))
        self.assertFalse(detected)

    def test_caption_length(self):
        """The second WebVTT sample yields seven en-US captions."""
        caption_set = self.reader.read(SAMPLE_WEBVTT_2.decode(u'utf-8'))
        cues = caption_set.get_captions(u'en-US')
        self.assertEqual(len(cues), 7)

    def test_read_supports_multiple_languages(self):
        """Captions read with lang='es' are available under 'es'."""
        caption_set = self.reader.read(
            SAMPLE_WEBVTT.decode(u'utf-8'), lang=u'es')
        self.assertIsNotNone(caption_set.get_captions(u'es'))

    def test_proper_timestamps(self):
        """The third cue has the expected start and end values."""
        caption_set = self.reader.read(SAMPLE_WEBVTT.decode(u'utf-8'))
        third_cue = caption_set.get_captions(u'en-US')[2]
        self.assertEqual(third_cue.start, 17000000)
        self.assertEqual(third_cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        """_remove_styles() strips markup and renders the voice tag as a name."""
        marked_up = (
            u"<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            u"its shortcomings</i>, but it is<u> the largest</u> collective "
            u"knowledge construction endevour</c> <ruby>base text <rt>"
            u"annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        plain = (
            u"Wikipedia is a great adventure. It may have "
            u"its shortcomings, but it is the largest collective "
            u"knowledge construction endevour base text annotation"
            u" Audry: Yes, indeed!"
        )
        self.assertEqual(self.reader._remove_styles(marked_up), plain)

    def test_empty_file(self):
        """Reading the decoded empty sample raises CaptionReadNoCaptions."""
        read = WebVTTReader().read
        empty_document = SAMPLE_WEBVTT_EMPTY.decode(u'utf-8')
        with self.assertRaises(CaptionReadNoCaptions):
            read(empty_document)

    def test_not_ignoring_timing_errors(self):
        """With ignore_timing_errors=False every sample below must raise."""
        expectations = (
            (CaptionReadSyntaxError,
             u"\n"
             u"00:00:20,000 --> 00:00:10,000\n"
             u"foo bar baz"),
            (CaptionReadError,
             u"00:00:20,000 --> 00:00:10,000\n"
             u"Start time is greater than end time.\n"),
            (CaptionReadError,
             u"00:00:20,000 --> 00:00:30,000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10,000 --> 00:00:20,000\n"
             u"This cue starts before the previous one.\n"),
        )
        for expected_error, text in expectations:
            strict_read = WebVTTReader(ignore_timing_errors=False).read
            with self.assertRaises(expected_error):
                strict_read(text)

    def test_ignoring_timing_errors(self):
        """Default reader: a broken cue stamp raises, timing problems do not."""
        # Even if timing errors are ignored, this is worse
        invalid_stamp = (
            u"\nNOTE invalid cue stamp\n"
            u"00:00:20,000 --> \n"
            u"foo bar baz\n"
        )
        default_read = WebVTTReader().read
        with self.assertRaises(CaptionReadSyntaxError):
            default_read(invalid_stamp)

        timing_only_inputs = (
            u"\n"
            u"00:00:20,000 --> 00:00:10,000\n"
            u"Start time is greater than end time.\n",
            u"\n"
            u"00:00:20,000 --> 00:00:30,000\n"
            u"Start times should be consecutive.\n"
            u"\n"
            u"00:00:10,000 --> 00:00:20,000\n"
            u"This cue starts before the previous one.\n",
        )
        for text in timing_only_inputs:
            try:
                WebVTTReader().read(text)
            except CaptionReadError:
                self.fail(u"Shouldn't raise CaptionReadError")

    def test_invalid_files(self):
        """Cues without text and badly ordered timings must be rejected."""
        cue_without_text = (
            u"\nNOTE Cues without text are invalid.\n"
            u"00:00:20,000 --> 00:00:30,000\n"
            u"\n"
            u"00:00:40,000 --> 00:00:50,000\n"
            u"foo bar baz\n"
        )
        end_before_start = (
            u"00:00:20,000 --> 00:00:10,000\n"
            u"Start time is greater than end time."
        )
        out_of_order = (
            u"00:00:20,000 --> 00:00:30,000\n"
            u"Start times should be consecutive.\n"
            u"\n"
            u"00:00:10,000 --> 00:00:20,000\n"
            u"This cue starts before the previous one.\n"
        )

        default_read = WebVTTReader().read
        with self.assertRaises(CaptionReadSyntaxError):
            default_read(cue_without_text)

        for broken in (end_before_start, out_of_order):
            strict_read = WebVTTReader(ignore_timing_errors=False).read
            with self.assertRaises(CaptionReadError):
                strict_read(broken)