def read_captions(captions, options):
    """Detect the caption format of ``captions`` and parse it with pycaption.

    Formats are probed in the order SCC, SRT, SAMI, DFXP, WebVTT and the
    first reader whose ``detect`` succeeds is used.

    Args:
        captions: Raw caption content handed to the readers.
        options: Object exposing ``read_invalid_positioning``, ``lang`` and
            ``offset``; the latter two are only used for SCC input.

    Returns:
        Whatever the matching reader's ``read`` call returns.

    Raises:
        Exception: If no reader recognises the content.
    """
    shared_kwargs = {
        'read_invalid_positioning': options.read_invalid_positioning
    }

    # Every reader is built up front, in the same order they are probed.
    scc = pycaption.SCCReader(**shared_kwargs)
    srt = pycaption.SRTReader(**shared_kwargs)
    sami = pycaption.SAMIReader(**shared_kwargs)
    dfxp = pycaption.DFXPReader(**shared_kwargs)
    vtt = pycaption.WebVTTReader(**shared_kwargs)

    if scc.detect(captions):
        # SCC is the only format that takes extra read arguments; the
        # language is forwarded only when one was supplied.
        scc_kwargs = {}
        if options.lang:
            scc_kwargs['lang'] = options.lang
        scc_kwargs['offset'] = int(options.offset)
        return scc.read(captions, **scc_kwargs)

    for reader in (srt, sami, dfxp, vtt):
        if reader.detect(captions):
            return reader.read(captions)

    raise Exception('No caption format detected :(')
def main(argv):
    """Convert a caption file between formats and print the result to stdout.

    Args:
        argv: Command-line arguments without the program name. Recognised
            options are ``-i``/``--ifile`` (input path), ``-f``/``--sfile``
            (input format), ``-t``/``--tfile`` (output format) and
            ``-h``/``--help``. Formats are matched case-insensitively
            against ``scc``, ``srt``, ``dfxp`` and ``webvtt``.

    The process exits with status 2 on malformed options, with status 1 when
    the formats are identical or unsupported, and with the default status
    after printing the usage text for ``-h``.
    """
    usage = 'test.py -i <inputfile> -f <intputType> -t <outputType>'
    supported = ('scc', 'srt', 'dfxp', 'webvtt')

    inputfile = ''
    inputType = ''
    outputType = ''
    try:
        # ``-h`` is a plain flag (no trailing colon), and the long forms
        # checked below have to be registered or getopt rejects them.
        opts, _ = getopt.getopt(
            argv, "hi:f:t:", ["help", "ifile=", "sfile=", "tfile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-f", "--sfile"):
            inputType = arg
        elif opt in ("-t", "--tfile"):
            outputType = arg

    # Normalise before comparing so that e.g. ``SRT`` and ``srt`` count as
    # the same format, matching the case-insensitive dispatch below.
    inputValue = inputType.lower()
    outputValue = outputType.lower()

    if inputValue == outputValue:
        print('Error: input type and output type are same format')
        sys.exit(1)
    # Validate both formats before touching the file system.
    if inputValue not in supported:
        print('Error: invalid input type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)
    if outputValue not in supported:
        print('Error: invalid output type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)

    readers = {
        'scc': pycaption.SCCReader,
        'srt': pycaption.SRTReader,
        'dfxp': pycaption.DFXPReader,
        'webvtt': pycaption.WebVTTReader,
    }
    writers = {
        'scc': pycaption.SCCWriter,
        'srt': pycaption.SRTWriter,
        'dfxp': pycaption.DFXPWriter,
        'webvtt': pycaption.WebVTTWriter,
    }

    with io.open(inputfile) as f:
        str1 = f.read()

    c = readers[inputValue]().read(str1)
    print(writers[outputValue]().write(c))
def download_subs(link, output_folder, name):
    """Download a DFXP subtitle document and save it as ``<name>.srt``.

    Args:
        link: URL of the subtitle document.
        output_folder: Directory the ``.srt`` file is written into.
        name: File name without extension.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server stops responding.
    """
    response = requests.get(
        link,
        headers={
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:70.0) Gecko/20100101 Firefox/70.0'
        },
        # Without a timeout a stalled server would block the caller forever.
        timeout=30,
    )
    # Fail on HTTP errors instead of feeding an error page to the DFXP parser.
    response.raise_for_status()

    caption_set = pycaption.DFXPReader().read(response.text)
    srt_text = pycaption.SRTWriter().write(caption_set)

    target = os.path.join(output_folder, name + '.srt')
    with io.open(target, 'w', encoding='utf-8') as f:
        f.write(srt_text)
def read_captions(captions, options):
    """Detect the caption format of ``captions`` and parse it with pycaption.

    Formats are probed in the order SCC, SRT, SAMI, DFXP and the first
    reader whose ``detect`` succeeds is used.

    Args:
        captions: Raw caption content handed to the readers.
        options: Object exposing ``lang`` and ``offset``; both are only used
            for SCC input.

    Returns:
        Whatever the matching reader's ``read`` call returns.

    Raises:
        Exception: If no reader recognises the content.
    """
    scc_reader = pycaption.SCCReader()
    srt_reader = pycaption.SRTReader()
    sami_reader = pycaption.SAMIReader()
    dfxp_reader = pycaption.DFXPReader()

    if scc_reader.detect(captions):
        # Convert the offset the same way on both paths; previously a
        # fractional offset was accepted without a language but raised
        # ValueError as soon as a language was given.
        offset = float(options.offset)
        if options.lang:
            return scc_reader.read(captions, lang=options.lang, offset=offset)
        return scc_reader.read(captions, offset=offset)

    for reader in (srt_reader, sami_reader, dfxp_reader):
        if reader.detect(captions):
            return reader.read(captions)

    raise Exception('No caption format detected :(')
def download_from_ism(self, url, output_name, output_format):
    """Download the TTML text tracks described by an ISM manifest.

    Fetches ``{url}/manifest`` and, for every ``StreamIndex`` whose type is
    ``text``, downloads each segment, converts it with ``self.ismt_to_ttml``,
    merges the ``<p>`` cues of later segments into the first segment's
    document and writes the result below ``self.output_dir``.

    Args:
        url: Base URL of the asset; ``/manifest`` and the segment paths are
            appended to it.
        output_name: Base name of the output file; spaces become dots.
        output_format: ``'ttml'`` writes the merged XML, ``'srt'`` converts
            it with pycaption first. For any other value the output file is
            created but left empty.

    Returns:
        None. The process exits with status 1 when a text stream is not
        TTML, and the method returns early (after logging an error) when a
        cue ends before it starts.
    """
    r = self.session.get(f'{url}/manifest')
    manifest = xmltodict.parse(r.content, force_list={'StreamIndex', 'c'})
    self.logger.debug(json.dumps(manifest, indent=4))

    for (index, stream) in enumerate(
            manifest['SmoothStreamingMedia']['StreamIndex']):
        if stream['@Type'] != 'text':
            continue

        lang = stream['@Language'].lower()
        fmt = stream['QualityLevel']['@FourCC'].upper()
        if fmt != 'TTML':
            self.logger.error(
                f'Stream has unsupported subtitle format: {fmt!r}')
            sys.exit(1)

        # NOTE(review): presumably the text tracks follow two non-text
        # streams, so this makes the track number start at 0 - confirm.
        index -= 2

        output = f'{output_name.replace(" ", ".")}.{lang}.{index}.srt'
        output = pathvalidate.sanitize_filename(output)
        output = os.path.join(self.output_dir, output)
        self.logger.info(f'Saving subtitle track #{index} to {output}')

        path = stream['@Url'].replace('{bitrate}',
                                      stream['QualityLevel']['@Bitrate'])

        # Build the list of segment start times from the <c> entries.
        t = 0
        ts = []
        for c in stream['c']:
            if c.get('@t'):
                t = int(c['@t'])
                ts.append(t)
            if not c.get('@d'):
                # Stream only has a single segment
                break
            # xmltodict yields attribute values as strings, so the repeat
            # count must be converted before it can drive range().
            for _ in range(int(c.get('@r', 1))):
                t += int(c['@d'])
                ts.append(t)
        ts = ts[:-1]  # Remove nonexistent last segment

        xml = None
        for (i, t) in enumerate(ts):
            self.logger.debug(f'Downloading segment {i + 1} of {len(ts)}')
            seg_url = f'{url}/{path.replace("{start time}", str(t))}'
            seg = self.session.get(seg_url).content
            if not seg:
                # Empty segment
                continue

            data = self.ismt_to_ttml(seg).decode('utf-8')
            # Line breaks are swapped for a placeholder so they survive the
            # xmltodict round trip; they are restored after unparsing.
            assert '{{BR}}' not in data, 'input data contains br placeholder'
            data = re.sub(r'<br ?/>', '{{BR}}', data)

            xml_seg = xmltodict.parse(
                data,
                force_list={'p'},
                process_namespaces=True,
                namespaces={
                    'http://www.w3.org/XML/1998/namespace': None,
                    'http://www.w3.org/2006/10/ttaf1': None,
                    'http://www.w3.org/2006/10/ttaf1#metadata': None,
                    'http://www.w3.org/2006/10/ttaf1#styling': None,
                },
            )

            if i == 0:
                # The first segment becomes the document that all later
                # cues are appended to.
                # NOTE(review): if segment 0 is empty, ``xml`` and ``fps``
                # are never initialised and later segments will fail -
                # confirm whether that can happen in practice.
                xml = xml_seg

                # NOTE(review): with process_namespaces=True the ``ttp:``
                # prefix may be expanded to its namespace URI, in which case
                # these lookups return None and the 30 fps default applies -
                # confirm against xmltodict's namespace handling.
                fps_base = xml['tt'].get('@ttp:frameRate')
                fps_mult = xml['tt'].get('@ttp:frameRateMultiplier')

                if xml['tt']['body']['div'] is None:
                    xml['tt']['body']['div'] = {'p': []}

                if fps_base:
                    if fps_mult:
                        # Multiplier is "<numerator> <denominator>".
                        mult = truediv(
                            *[int(x) for x in fps_mult.split(' ')])
                    else:
                        mult = 1
                    # Both attributes arrive as strings; multiply the
                    # numeric frame rate by the numeric multiplier.
                    fps = float(fps_base) * mult
                else:
                    fps = 30  # Per TTML spec
            else:
                div = xml_seg['tt']['body']['div']
                if div is None:
                    # Empty subtitle file
                    continue

                subs = div['p']
                scale = int(stream['@TimeScale'])
                offset = t / scale

                for p in subs:
                    # Rewrite "h:m:s:frames" timestamps as absolute seconds
                    # shifted by the segment's start time.
                    for a in ('@begin', '@end'):
                        tc = p[a]
                        (h, m, s, frames) = [int(x) for x in tc.split(':')]
                        total = round(
                            h * 3600 + m * 60 + s + frames / fps + offset, 3)
                        p[a] = f'{total}s'

                    begin = float(p['@begin'][:-1])
                    end = float(p['@end'][:-1])
                    if end < begin:
                        self.logger.error(
                            f'End time is earlier than start time ({end} < {begin})',
                        )
                        return

                xml['tt']['body']['div']['p'].extend(subs)

        xml_data = xmltodict.unparse(xml)
        xml_data = xml_data.replace('{{BR}}', '<br />')

        os.makedirs(self.output_dir, exist_ok=True)
        with open(output, 'wb') as fd:
            if output_format == 'ttml':
                fd.write(xml_data.encode('utf-8-sig'))
            elif output_format == 'srt':
                self.logger.debug('Converting to SRT')
                captions = pycaption.DFXPReader().read(xml_data)
                srt_text = pycaption.SRTWriter().write(captions)
                fd.write(srt_text.encode('utf-8-sig'))