def middleware_convert_sub(response, **kwargs):
    """Rewrite the body of a subtitle response as WebVTT.

    The body is decoded as UTF-8 and handed to ``detect_format``. When that
    returns a reader, the text is parsed with it and re-serialised through
    ``WebVTTWriter``; otherwise the text is left untouched. The resulting
    text is encoded back to UTF-8, stored on ``response.stream.content`` and
    the ``content-type`` header is set to ``text/vtt``.

    NOTE(review): the original was collapsed onto a single line; the final
    two assignments are assumed to run unconditionally -- confirm.

    :param response: object exposing ``stream.content`` and ``headers``.
    :param kwargs: accepted but not used.
    """
    stream = response.stream
    text = stream.content.decode('utf8')

    reader_cls = detect_format(text)
    if reader_cls:
        writer = WebVTTWriter()
        text = writer.write(reader_cls().read(text))

    stream.content = text.encode('utf8')
    response.headers['content-type'] = 'text/vtt'
def test_lang_option(self, sample_webvtt_multi_lang_en,
                     sample_webvtt_multi_lang_de,
                     sample_sami_with_multi_lang):
    """The language argument of ``write`` selects which track is rendered."""
    caption_set = SAMIReader().read(sample_sami_with_multi_lang)

    # German is checked first, then English, each with a fresh writer.
    expectations = (
        ('de-DE', sample_webvtt_multi_lang_de),
        ('en-US', sample_webvtt_multi_lang_en),
    )
    for lang, expected in expectations:
        results = WebVTTWriter().write(caption_set, lang)
        assert expected == results
def _webvtt(url, _data_path, _headers, **kwargs):
    """Download a subtitle file, convert it to WebVTT and save it.

    The response body is decoded as UTF-8, parsed with the reader returned
    by ``detect_format`` and written to ``_data_path`` as UTF-8 WebVTT.

    NOTE(review): the result of ``detect_format`` is called without a check,
    so an unrecognised format fails at that call -- confirm this is intended.

    :param url: address fetched with a new ``Session``.
    :param _data_path: path the converted subtitles are written to.
    :param _headers: headers sent with the request.
    :param kwargs: accepted but not used.
    :returns: ``_data_path`` with ``|content-type=text/vtt`` appended.
    """
    response = Session().get(url, headers=_headers)
    source_text = response.content.decode('utf8')

    reader_cls = detect_format(source_text)
    writer = WebVTTWriter()
    vtt_text = writer.write(reader_cls().read(source_text))

    with open(_data_path, 'wb') as out_file:
        out_file.write(vtt_text.encode('utf8'))

    return _data_path + '|content-type=text/vtt'
def __init__(self, readers, caption_str):
    """
    Store the inputs and prepare a ``WebVTTWriter`` with a nominal video size.

    :param readers: An array of `SubtitleReader` instances
    :param caption_str: A string with the captions content
    """
    self.readers = readers
    self.caption_str = caption_str
    self.caption_set = None

    # Other caption types may carry layout information; a nominal width of
    # 100 should be enough for the writer to produce percentage values.
    writer = WebVTTWriter()
    writer.video_width = 100
    writer.video_height = writer.video_width * 6 / 19
    self.writer = writer
def middleware_convert_sub(response, **kwargs):
    """Rewrite the body of a subtitle response as WebVTT.

    The body is decoded as UTF-8 and handed to ``detect_format``. When that
    returns a reader, the text is converted with ``WebVTTWriter`` and, if
    ``ADDON_DEV`` is truthy, a copy of the converted text is written to
    ``special://temp/convert_sub.middleware``. The resulting text is stored
    back on ``response.stream.content`` and the ``content-type`` header is
    set to ``text/vtt``.

    NOTE(review): the original was collapsed onto a single line; the debug
    dump is assumed to sit inside the ``if reader`` branch and the final two
    assignments are assumed to run unconditionally -- confirm.

    :param response: object exposing ``stream.content`` and ``headers``.
    :param kwargs: accepted but not used.
    """
    stream = response.stream
    text = stream.content.decode('utf8')

    reader_cls = detect_format(text)
    if reader_cls:
        writer = WebVTTWriter()
        text = writer.write(reader_cls().read(text))

        if ADDON_DEV:
            # Development aid: keep the converted subtitles for inspection.
            dump_path = xbmc.translatePath(
                'special://temp/convert_sub.middleware')
            with open(dump_path, 'wb') as dump_file:
                dump_file.write(text.encode('utf8'))

    stream.content = text.encode('utf8')
    response.headers['content-type'] = 'text/vtt'
def test_break_node_positioning_is_ignored(
        self,
        webvtt_from_dfxp_with_conflicting_align,
        dfxp_style_region_align_conflict):
    """DFXP with conflicting style/region alignment renders the expected WebVTT."""
    reader = DFXPReader()
    parsed = reader.read(dfxp_style_region_align_conflict)

    writer = WebVTTWriter()
    results = writer.write(parsed)

    assert webvtt_from_dfxp_with_conflicting_align == results
def test_dfxp_with_positioning_to_webvtt_conversion(self):
    # Convert positioned DFXP with an explicit video size and compare the
    # output with the stored WebVTT sample.
    parsed = DFXPReader().read(SAMPLE_DFXP_WITH_POSITIONING)
    writer = WebVTTWriter(
        video_width=VIDEO_WIDTH,
        video_height=VIDEO_HEIGHT,
    )
    results = writer.write(parsed)

    self.assertTrue(isinstance(results, str))
    self.assertWebVTTEquals(
        SAMPLE_WEBVTT_FROM_DFXP_WITH_POSITIONING_AND_STYLE, results)
def test_positioning_is_kept(self, sample_webvtt_from_dfxp_with_positioning):
    """Reading WebVTT and writing it again must give back the same text."""
    source = sample_webvtt_from_dfxp_with_positioning

    parsed = WebVTTReader().read(source)
    writer = WebVTTWriter()
    results = writer.write(parsed)

    assert source == results
def test_dfxp_to_webvtt_conversion(self, sample_webvtt_from_dfxp,
                                   sample_dfxp):
    """DFXP input converts to a ``str`` matching the WebVTT sample."""
    reader = DFXPReader()
    parsed = reader.read(sample_dfxp)

    writer = WebVTTWriter()
    results = writer.write(parsed)

    assert isinstance(results, str)
    self.assert_webvtt_equals(sample_webvtt_from_dfxp, results)
def load_subtitles(self, video_id, langs=('ru',)):
    """Download automatic YouTube subtitles and convert them to WebVTT.

    Languages for which ``subs_exists`` reports existing subtitles are
    skipped. For every other language the subtitles are requested in TTML
    format through ``youtube_dl`` and, when the expected ``.ttml`` file is
    present afterwards, converted to a ``.vtt`` file next to it.

    :param video_id: YouTube video identifier.
    :param langs: language codes to process.
    """
    video_url_template = 'https://www.youtube.com/watch?v={}'

    for lang in langs:
        if subs_exists(video_id, lang):
            continue

        print('Loading {} subtitles for {}'.format(lang, video_id))
        target_dir = get_dir(video_id)
        opts = {
            'writeautomaticsub': True,
            'subtitleslangs': langs,
            'subtitlesformat': 'ttml',
            'nooverwrites': True,
            'skip_download': True,
            'outtmpl': join(target_dir, video_id + '.ttml'),
        }
        with youtube_dl.YoutubeDL(opts) as ytdl:
            ytdl.download([video_url_template.format(video_id)])

        # WebVTT captions from YouTube contain duplicate phrases with
        # overlapping time segments, which is uncomfortable to read. That is
        # why the subtitles are downloaded as TTML first and converted to
        # WebVTT afterwards.
        stem = video_id + '.' + lang
        subs_path_ttml = join(get_dir(video_id), stem + '.ttml')
        subs_path_vtt = join(get_dir(video_id), stem + '.vtt')
        if not exists(subs_path_ttml):
            continue

        print('converting subtitles')
        with open(subs_path_ttml, encoding='utf-8') as ttml_file:
            caption_set = DFXPReader().read(ttml_file.read())
        with open(subs_path_vtt, 'w', encoding='utf-8') as vtt_file:
            vtt_file.write(WebVTTWriter().write(caption_set))
def test_webvtt_newlines_are_properly_rendered(self):
    # SCC input must produce WebVTT whose line breaks match the sample.
    reader = SCCReader()
    parsed = reader.read(SCC_THAT_GENERATES_WEBVTT_WITH_PROPER_NEWLINES)

    writer = WebVTTWriter()
    webvtt = writer.write(parsed)

    self.assertEqual(
        webvtt, SAMPLE_WEBVTT_FROM_SCC_PROPERLY_WRITES_NEWLINES_OUTPUT)
def test_dfxp_to_webvtt_preserves_proper_alignment(self):
    # This failed at one point when the CaptionSet had node breaks with
    # different positioning. It was fixed both at the DFXPReader AND the
    # WebVTTWriter.
    caption_set = DFXPReader().read(DFXP_STYLE_REGION_ALIGN_CONFLICT)
    results = WebVTTWriter().write(caption_set)
    # assertEqual, not assertEquals: the latter is a deprecated alias that
    # no longer exists in Python 3.12+.
    self.assertEqual(WEBVTT_FROM_DFXP_WITH_CONFLICTING_ALIGN, results)
def test_srt_to_webvtt_conversion(self, sample_webvtt_from_srt, sample_srt):
    """SRT input converts to a ``str`` matching the WebVTT sample."""
    reader = SRTReader()
    parsed = reader.read(sample_srt)

    writer = WebVTTWriter()
    results = writer.write(parsed)

    assert isinstance(results, str)
    self.assert_webvtt_equals(sample_webvtt_from_srt, results)
def test_dfxp_to_webvtt_adds_explicit_size(
        self,
        sample_webvtt_output_long_cue,
        sample_dfxp_long_cue):
    """The long-cue DFXP sample converts to exactly the expected WebVTT text."""
    reader = DFXPReader()
    parsed = reader.read(sample_dfxp_long_cue)

    writer = WebVTTWriter()
    results = writer.write(parsed)

    assert isinstance(results, str)
    assert sample_webvtt_output_long_cue == results
class WebVTTWriterTestCase(unittest.TestCase):
    # Tests for WebVTTWriter output using the SAMI "double <br>" samples.

    def setUp(self):
        # The writer under test is stored on the test case instance.
        writer = WebVTTWriter()
        self.writer = writer

    def test_double_br(self):
        # The samples are decoded from UTF-8 before use.
        sami_text = SAMPLE_SAMI_DOUBLE_BR.decode(u'utf-8')
        captions = SAMIReader().read(sami_text)

        expected = SAMPLE_WEBVTT_DOUBLE_BR.decode(u'utf-8')
        actual = self.writer.write(captions)
        self.assertEqual(expected, actual)
def test_sami_to_webvtt_conversion(self, sample_webvtt_from_sami,
                                   sample_sami):
    """SAMI input written for a 640x360 video matches the WebVTT sample."""
    reader = SAMIReader()
    parsed = reader.read(sample_sami)

    writer = WebVTTWriter(video_width=640, video_height=360)
    results = writer.write(parsed)

    assert isinstance(results, str)
    self.assert_webvtt_equals(sample_webvtt_from_sami, results)
def test_webvtt_newlines_are_properly_rendered(
        self,
        sample_webvtt_from_scc_properly_writes_newlines_output,
        scc_that_generates_webvtt_with_proper_newlines):
    """SCC input must produce WebVTT whose line breaks match the sample."""
    reader = SCCReader()
    parsed = reader.read(scc_that_generates_webvtt_with_proper_newlines)

    writer = WebVTTWriter()
    webvtt = writer.write(parsed)

    assert webvtt == sample_webvtt_from_scc_properly_writes_newlines_output
def convert_subs_to_vtt(input_subs_path, output_vtt_path):
    """Convert a subtitle file in any detectable format to a WebVTT file.

    The input is read as raw bytes and decoded with the encoding reported by
    ``utils.get_file_encoding``. The reader returned by ``detect_format``
    parses the text and ``WebVTTWriter`` renders it. The output file is
    written as UTF-8.

    NOTE(review): the result of ``detect_format`` is called without a check,
    so an unrecognised format fails at that call -- confirm this is intended.

    :param input_subs_path: path of the subtitle file to convert.
    :param output_vtt_path: path the WebVTT text is written to.
    """
    # Local import keeps this block self-contained; io.open behaves the same
    # on Python 2 and Python 3.
    import io

    # Read bytes: calling .decode() on text-mode data fails on Python 3.
    with open(input_subs_path, 'rb') as f:
        raw = f.read()
    text = raw.decode(utils.get_file_encoding(input_subs_path))

    reader = detect_format(text)
    subs = reader().read(text)
    output_text = WebVTTWriter().write(subs)

    # Write with an explicit UTF-8 encoding instead of the implicit default
    # (ASCII on Python 2, locale-dependent on Python 3), which fails or
    # corrupts output for non-ASCII captions.
    with io.open(output_vtt_path, 'w', encoding='utf-8') as w:
        w.write(output_text)
def test_dfxp_to_webvtt_preserves_proper_alignment(
        self,
        webvtt_from_dfxp_with_conflicting_align,
        dfxp_style_region_align_conflict):
    """Regression test for DFXP with conflicting alignment information.

    This used to fail when the CaptionSet had break nodes with different
    positioning; it was fixed in both the DFXPReader and the WebVTTWriter.
    """
    reader = DFXPReader()
    parsed = reader.read(dfxp_style_region_align_conflict)

    writer = WebVTTWriter()
    results = writer.write(parsed)

    assert webvtt_from_dfxp_with_conflicting_align == results
def fetch_subtitles(entry, lang='ru'):
    """Download the automatic captions of a video entry as WebVTT.

    The first caption track listed for ``lang`` is downloaded, parsed with
    ``DFXPReader`` and rendered with ``WebVTTWriter``.

    :param entry: mapping with ``automatic_captions``, ``title`` and ``id``.
    :param lang: key looked up in ``entry['automatic_captions']``.
    :returns: ``(video_id, title, vtt)``, or ``None`` when the entry has no
        automatic captions.
    """
    captions_by_lang = entry['automatic_captions']
    if not captions_by_lang:
        return None

    title = entry['title']
    video_id = entry['id']
    caption_url = captions_by_lang[lang][0]['url']

    raw_text = requests.get(caption_url).content.decode()
    parsed = DFXPReader().read(raw_text)
    vtt = WebVTTWriter().write(parsed)
    return video_id, title, vtt
def test_dfxp_with_positioning_to_webvtt_conversion(
        self,
        sample_webvtt_from_dfxp_with_positioning_and_style,
        sample_dfxp_with_positioning):
    """Positioned DFXP written with an explicit video size matches the sample."""
    parsed = DFXPReader().read(sample_dfxp_with_positioning)

    writer = WebVTTWriter(video_width=VIDEO_WIDTH, video_height=VIDEO_HEIGHT)
    results = writer.write(parsed)

    assert isinstance(results, str)
    self.assert_webvtt_equals(
        sample_webvtt_from_dfxp_with_positioning_and_style, results)
def route_subtitles(course_id, lecture_id):
    """Serve the English subtitles of a Coursera lecture as WebVTT.

    The subtitles are fetched from class.coursera.org, read as SRT and
    converted to WebVTT. When the reader reports that there are no captions,
    an empty body is returned instead.

    :param course_id: course identifier inserted into the URL.
    :param lecture_id: lecture number (formatted with ``%d``).
    :returns: a ``Response`` with content type ``text/vtt``.
    """
    url_template = (
        'https://class.coursera.org/%s-001/lecture/subtitles?q=%d_en')
    subtitles_url = url_template % (course_id, lecture_id)
    upstream = requests.get(subtitles_url)

    try:
        caption_converter = CaptionConverter()
        caption_converter.read(upstream.text, SRTReader())
        body = caption_converter.write(WebVTTWriter())
    except CaptionReadNoCaptions:
        body = ''

    return Response(body, content_type='text/vtt')
def convert_caps_to_vtt(caps):
    """
    Utility method to convert any supported transcripts into WebVTT format.

    Supported input formats: DFXP/TTML - SAMI - SCC - SRT - WebVTT.

    Arguments:
        caps (unicode): Raw transcripts.

    Returns:
        unicode: Transcripts converted into WebVTT format, or an empty string
        when ``caps`` is empty or its format is not detected.
    """
    if not caps:
        return u''

    reader_cls = detect_format(caps)
    if not reader_cls:
        return u''

    writer = WebVTTWriter()
    return writer.write(reader_cls().read(caps))
def subtitle(request, title, no):
    """Serve one English subscene subtitle for ``title`` as WebVTT.

    Parenthesised parts are removed from ``title`` and the last remaining
    character is dropped before searching subscene. The ``no``-th subtitle
    archive is downloaded, its first member is decoded with the encoding
    BeautifulSoup detects, converted from SRT to WebVTT and returned.

    NOTE(review): this block relies on Python 2 names (``StringIO`` over
    bytes, ``HTMLParser.HTMLParser``); it has not been ported.

    :param request: the incoming request (not used).
    :param title: film title, possibly containing parenthesised parts.
    :param no: index into the search result's subtitle list; passed through
        ``int()``.
    :returns: ``HttpResponse`` with content type ``text/vtt``.
    """
    # Raw string: '\(' and '\)' are invalid escapes in a plain literal.
    t = re.sub(r'\(.*?\)', '', title)[:-1]
    film = subscene.search(t, "English")

    # Named zip_response so the builtin zip() is not shadowed.
    zip_response = requests.get(
        subscene.zipped_url(film.subtitles[int(no)]))
    fp = StringIO(zip_response.content)
    # The context manager closes the archive once the member has been read.
    with zipfile.ZipFile(fp, 'r') as archive:
        srt = archive.read(archive.namelist()[0])

    # BeautifulSoup is used here only to detect the text encoding.
    soup = BeautifulSoup(srt)
    converter = CaptionConverter()
    unistring = srt.decode(soup.originalEncoding)
    # Strip a leading byte-order mark only when one is really present. The
    # previous code dropped the first character of every utf-8 subtitle,
    # which removes real content from files that have no BOM.
    if unistring.startswith(u'\ufeff'):
        unistring = unistring[1:]
    converter.read(unistring, SRTReader())

    html_parser = HTMLParser.HTMLParser()
    # Non-ASCII characters are dropped by the 'ignore' handler before HTML
    # entities are unescaped; kept as in the original behaviour.
    vtt_ascii = converter.write(WebVTTWriter()).encode('ascii', 'ignore')
    return HttpResponse(html_parser.unescape(vtt_ascii),
                        content_type="text/vtt")
def convert_subtitles_to_vtt(input_file: str, output_file: str):
    """Convert .srt subtitles to .vtt for web playback.

    The input encoding is detected with ``chardet`` from the raw bytes, the
    file is then read as text with that encoding and converted from SRT to
    WebVTT. The output is written with the ``utf-8-sig`` codec.

    :param input_file: path of the .srt file.
    :param output_file: path the .vtt file is written to.
    :returns: ``True`` on success, ``False`` when the SRT reader finds no
        captions.
    """
    logger.info(f'Converting {input_file} to {output_file}')

    # First pass over the raw bytes, only to detect the encoding.
    with open(input_file, mode='rb') as binary_handle:
        detection = chardet.detect(binary_handle.read())
    source_encoding = detection['encoding']

    # Second pass: read the text using the detected encoding.
    with open(input_file, mode='r', encoding=source_encoding) as text_handle:
        srt_text = text_handle.read()

    caption_converter = CaptionConverter()
    try:
        caption_converter.read(srt_text, SRTReader())
    except CaptionReadNoCaptions:
        logger.exception(f'Failed to convert {input_file} to {output_file}')
        return False  # Likely UTF-16 subtitles

    vtt_text = caption_converter.write(WebVTTWriter())
    with open(output_file, mode='w', encoding='utf-8-sig') as vtt_handle:
        vtt_handle.write(vtt_text)
    return True
def from_srt(input_f, output_f):
    """
    Takes an input SRT file or filename and writes out VTT contents to the
    given output file or filename.

    The raw input is passed through ``chardet``. When the detection
    confidence is below 0.9, the SubRip default ``cp1252`` is used instead of
    the detected encoding. If decoding fails, ``cp1252`` and then ``utf8``
    are tried in turn; once those are exhausted the ``UnicodeDecodeError`` is
    re-raised.

    NOTE(review): the input read through ``vtt_open`` is assumed to be a byte
    string, since it is passed to ``chardet.detect`` and ``.decode()``.
    """
    with vtt_open(input_f, 'r') as f:
        orig = f.read()

    detect = chardet.detect(orig)
    encoding = detect['encoding']
    confidence = detect['confidence']
    default_subrip_encoding = 'cp1252'  # standard for SubRip files
    if confidence < 0.9:
        encoding = default_subrip_encoding

    # caption converter seems to have a tough time with the BOM on
    # Python < 2.7.8, so ditch it if it exists. This is done on the raw bytes:
    # the previous check compared decoded text with the BOM byte string and
    # therefore never matched.
    if orig.startswith(codecs.BOM_UTF8):
        orig = orig[len(codecs.BOM_UTF8):]

    backups = [default_subrip_encoding, 'utf8']
    while True:
        try:
            # Parenthesised form prints the same on Python 2 and Python 3.
            print("ENCODING: " + encoding)
            contents = orig.decode(encoding)
            break
        except UnicodeDecodeError:
            # No fallback left: propagate the decoding error.
            if not backups:
                raise
            encoding = backups.pop(0)

    converter = CaptionConverter()
    converter.read(contents, SRTReader())
    contents = converter.write(WebVTTWriter())

    with vtt_open(output_f, 'w') as o:
        # The final byte of the encoded output is dropped, as before
        # (presumably a trailing newline -- TODO confirm).
        o.write(contents.encode('utf-8')[:-1])
def run_pipeline(url=None, hmm=None, lm=None, dict=None,
                 caption_format='webvtt', out_file=None):
    """Transcribe the audio at ``url`` with pocketsphinx and emit captions.

    Builds the GStreamer pipeline ``uridecodebin ! audioconvert !
    audioresample ! pocketsphinx ! fakesink``, turns every final
    pocketsphinx hypothesis into a caption and serialises the result.

    :param url: URI set on the ``uridecodebin`` element; required.
    :param hmm: optional value for the pocketsphinx ``hmm`` property.
    :param lm: optional value for the pocketsphinx ``lm`` property.
    :param dict: optional value for the pocketsphinx ``dict`` property (the
        name shadows the builtin but is part of the public signature).
    :param caption_format: ``'srt'`` selects ``SRTWriter``; any other value
        selects ``WebVTTWriter``.
    :param out_file: path the captions are written to as UTF-8; when
        ``None`` the captions are printed instead.
    :raises Exception: if ``url`` is ``None``.
    :raises RuntimeError: if the pipeline posts an error message on its bus.
    """
    if url is None:
        raise Exception('No URL specified!')

    pipeline = Gst.parse_launch('uridecodebin name=source ! audioconvert !' +
                                ' audioresample ! pocketsphinx name=asr !' +
                                ' fakesink')

    source = pipeline.get_by_name('source')
    source.set_property('uri', url)

    pocketsphinx = pipeline.get_by_name('asr')
    if hmm:
        pocketsphinx.set_property('hmm', hmm)
    if lm:
        pocketsphinx.set_property('lm', lm)
    if dict:
        pocketsphinx.set_property('dict', dict)

    bus = pipeline.get_bus()

    # Start playing
    pipeline.set_state(Gst.State.PLAYING)

    cap_set = CaptionSet()
    captions = []
    pipeline_error = None

    # Wait until error or EOS
    while True:
        try:
            msg = bus.timed_pop(Gst.CLOCK_TIME_NONE)
            if not msg:
                continue
            if msg.type == Gst.MessageType.EOS:
                break
            if msg.type == Gst.MessageType.ERROR:
                # Without this branch a failed pipeline never reaches EOS and
                # the loop would block forever.
                pipeline_error = msg.parse_error()
                break
            struct = msg.get_structure()
            if (struct and struct.get_name() == 'pocketsphinx'
                    and struct['final']):
                c = Caption()
                c.start = struct['start_time'] / Gst.USECOND
                c.end = struct['end_time'] / Gst.USECOND
                c.nodes.append(
                    CaptionNode.create_text(struct['hypothesis']))
                captions.append(c)
        except KeyboardInterrupt:
            # Ask the pipeline to finish; the loop then ends on the EOS
            # message.
            pipeline.send_event(Gst.Event.new_eos())

    # Free resources
    pipeline.set_state(Gst.State.NULL)

    if pipeline_error is not None:
        err, debug = pipeline_error
        raise RuntimeError(
            'GStreamer pipeline error: %s (%s)' % (err, debug))

    cap_set.set_captions('en-US', captions)

    writer = SRTWriter() if caption_format == 'srt' else WebVTTWriter()
    caption_data = writer.write(cap_set)

    if out_file is not None:
        # Context manager so the handle is flushed and closed
        # deterministically.
        with codecs.open(out_file, 'w', 'utf-8') as out:
            out.write(caption_data)
    else:
        print(caption_data)
def test_empty_cues_are_deleted(self):
    # Reading the empty-cue sample and writing it back must give the
    # expected WebVTT sample.
    reader = WebVTTReader()
    parsed = reader.read(SAMPLE_WEBVTT_EMPTY_CUE)

    writer = WebVTTWriter()
    results = writer.write(parsed)

    self.assertEqual(SAMPLE_WEBVTT_FROM_EMPTY_CUE, results)
def test_positioning_is_kept(self):
    # Reading WebVTT and writing it again must give back the same text.
    reader = WebVTTReader()
    parsed = reader.read(SAMPLE_WEBVTT_FROM_DFXP_WITH_POSITIONING)

    writer = WebVTTWriter()
    results = writer.write(parsed)

    self.assertEqual(SAMPLE_WEBVTT_FROM_DFXP_WITH_POSITIONING, results)
def test_cue_settings_are_kept(self):
    # WebVTT with cue settings must be reproduced unchanged after a
    # read/write round trip.
    reader = WebVTTReader()
    parsed = reader.read(SAMPLE_WEBVTT_WITH_CUE_SETTINGS)

    writer = WebVTTWriter()
    webvtt = writer.write(parsed)

    self.assertEqual(SAMPLE_WEBVTT_WITH_CUE_SETTINGS, webvtt)
def setUp(self):
    # Create the WebVTTWriter that is stored on ``self.writer``.
    writer = WebVTTWriter()
    self.writer = writer