def write_captions(content, options):
    """Print ``content`` in every caption format enabled on ``options``.

    Args:
        content: Caption data passed unchanged to each writer's ``write``.
        options: Object exposing ``video_width`` and ``video_height`` plus
            the flags ``sami``, ``dfxp``, ``srt``, ``transcript``, ``vtt``
            and ``unpositioned_dfxp``.

    Every enabled format is rendered, encoded as UTF-8 and printed, in the
    order the flags are listed above.
    """
    # A falsy dimension (None, '', 0) is forwarded to the writers as None.
    raw_width = options.video_width
    width = int(raw_width) if raw_width else None
    raw_height = options.video_height
    height = int(raw_height) if raw_height else None
    dimensions = {'video_width': width, 'video_height': height}

    def emit(writer_cls):
        # NOTE(review): on Python 3, print() of the encoded bytes shows a
        # b'...' repr -- confirm which interpreter this script targets.
        rendered = writer_cls(**dimensions).write(content)
        print(rendered.encode("utf-8"))

    if options.sami:
        emit(pycaption.SAMIWriter)
    if options.dfxp:
        emit(pycaption.DFXPWriter)
    if options.srt:
        emit(pycaption.SRTWriter)
    if options.transcript:
        emit(pycaption.TranscriptWriter)
    if options.vtt:
        emit(pycaption.WebVTTWriter)
    if options.unpositioned_dfxp:
        emit(pycaption.dfxp.SinglePositioningDFXPWriter)
def mk_subs(self, transcriptions, sub_pathname):
    """
    Create a subtitle file for this video.
    It is currently a huge hack, but it works good enough.

    transcriptions: list of start/end 'pointers' into the source; each
        item is read as ``['start']['timestamp' | 'video_time' | 'text']``
        and ``['end']['timestamp' | 'text']``
    sub_pathname: full path to output file (written as SRT)
    """
    # HACK: the source transcript is a single hard-coded SCC file.
    transcript_filename = '12022017 NBPY SCC.scc'
    transcript_pathname = os.path.join(
        self.show_dir, "assets", "transcripts", transcript_filename)

    # Context managers close the handles promptly instead of leaving that
    # to the garbage collector.
    with open(transcript_pathname, encoding='iso-8859-1') as transcript_file:
        caps = transcript_file.read()

    transcript = pycaption.SCCReader().read(caps)
    language = transcript.get_languages()[0]  # ['en-US']
    captions = transcript.get_captions(language)

    out_captions = pycaption.CaptionList()
    for transcription in transcriptions:
        # state 0: outside the wanted range, state 1: inside it.
        state = 0
        for c in captions:
            if c.format_start() == transcription['start']['timestamp']:
                state = 1
                # Shift that moves this caption to the cut's video time.
                # presumably video_time is in seconds and caption times in
                # microseconds -- TODO confirm
                offset = c.start - \
                    transcription['start']['video_time'] * 1000000
                c.nodes[0].content = transcription['start']['text']

            if state == 1:
                if c.format_start() == transcription['end']['timestamp']:
                    c.nodes[0].content = transcription['end']['text']
                    state = 0

                # The caption objects are modified in place and shared with
                # the list read from the transcript.
                c.start -= offset
                c.end -= offset
                out_captions.append(c)

    transcript.set_captions(language, out_captions)

    writer = pycaption.SRTWriter()
    with open(sub_pathname, 'wt') as sub_file:
        sub_file.write(writer.write(transcript))
def write_captions(content, options):
    """Print ``content`` in every caption format enabled on ``options``.

    Args:
        content: Caption data passed unchanged to each writer's ``write``.
        options: Object exposing the flags ``scc``, ``sami``, ``dfxp``,
            ``srt`` and ``transcript``.

    Each enabled format is rendered, encoded as UTF-8 and printed.
    """
    # print(expr) with one parenthesised argument behaves exactly like the
    # Python 2 statement ``print expr`` and also parses on Python 3.
    # NOTE(review): on Python 3 the encoded bytes are printed as a b'...'
    # repr; drop the .encode() calls if this script moves to Python 3.
    if options.scc:
        print(pycaption.SCCWriter().write(content).encode("utf-8"))
    if options.sami:
        print(pycaption.SAMIWriter().write(content).encode("utf-8"))
    if options.dfxp:
        print(pycaption.DFXPWriter().write(content).encode("utf-8"))
    if options.srt:
        print(pycaption.SRTWriter().write(content).encode("utf-8"))
    if options.transcript:
        print(pycaption.TranscriptWriter().write(content).encode("utf-8"))
def main(argv):
    """Convert a caption file between formats and print the result.

    Args:
        argv: Command-line arguments without the program name. Recognised
            options: ``-i``/``--ifile`` input path, ``-f``/``--sfile``
            input format, ``-t``/``--tfile`` output format, ``-h`` usage.

    Exits with status 2 on malformed options and status 1 when the two
    formats are the same or either one is unsupported.
    """
    usage = 'test.py -i <inputfile> -f <inputType> -t <outputType>'
    # Attribute names are looked up on pycaption only for the format that
    # is actually requested.
    reader_names = {
        'scc': 'SCCReader',
        'srt': 'SRTReader',
        'dfxp': 'DFXPReader',
        'webvtt': 'WebVTTReader',
    }
    writer_names = {
        'scc': 'SCCWriter',
        'srt': 'SRTWriter',
        'dfxp': 'DFXPWriter',
        'webvtt': 'WebVTTWriter',
    }

    inputfile = ''
    inputType = ''
    outputType = ''
    try:
        # "h" takes no argument, so a bare -h reaches the usage branch; the
        # long options tested below are declared so getopt accepts them.
        opts, _ = getopt.getopt(
            argv, "hi:f:t:", ["ifile=", "sfile=", "tfile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-f", "--sfile"):
            inputType = arg
        elif opt in ("-t", "--tfile"):
            outputType = arg

    inputValue = inputType.lower()
    outputValue = outputType.lower()
    # Formats are matched case-insensitively below, so compare them the
    # same way here ("srt" and "SRT" are the same format).
    if inputValue == outputValue:
        print('Error: input type and output type are same format')
        sys.exit(1)

    with io.open(inputfile) as f:
        str1 = f.read()

    reader_name = reader_names.get(inputValue)
    if reader_name is None:
        print('Error: invalid input type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)
    c = getattr(pycaption, reader_name)().read(str1)

    writer_name = writer_names.get(outputValue)
    if writer_name is None:
        print('Error: invalid output type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)
    print(getattr(pycaption, writer_name)().write(c))
def download_subs(link, output_folder, name):
    """Fetch DFXP subtitles from ``link`` and save them as ``<name>.srt``.

    Args:
        link: URL requested with a browser-like user-agent header.
        output_folder: Directory that receives the converted file.
        name: File name without extension; ``.srt`` is appended.
    """
    browser_headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:70.0) Gecko/20100101 Firefox/70.0'
    }
    response = requests.get(link, headers=browser_headers)

    # DFXP text in, SRT text out.
    parsed = pycaption.DFXPReader().read(response.text)
    srt_text = pycaption.SRTWriter().write(parsed)

    target = os.path.join(output_folder, name + '.srt')
    with io.open(target, 'w', encoding='utf-8') as out:
        out.write(srt_text)
def write_captions(content, options):
    """Print ``content`` in every caption format enabled on ``options``.

    Args:
        content: Caption data passed unchanged to each writer's ``write``.
        options: Object exposing the flags ``scc``, ``sami``, ``dfxp``,
            ``srt``, ``vtt`` and ``transcript``.
    """
    # (flag on options, writer class name on pycaption), in output order.
    writers_by_flag = (
        ('scc', 'SCCWriter'),
        ('sami', 'SAMIWriter'),
        ('dfxp', 'DFXPWriter'),
        ('srt', 'SRTWriter'),
        ('vtt', 'WebVTTWriter'),
    )
    for flag, writer_name in writers_by_flag:
        if getattr(options, flag):
            writer = getattr(pycaption, writer_name)()
            print(writer.write(content))

    if not options.transcript:
        return
    # TranscriptWriter is imported only when requested, since it requires
    # additional dependencies.
    from pycaption.transcript import TranscriptWriter
    print(TranscriptWriter().write(content))
def write_captions(content, options, lang='', filename=''):
    """Emit ``content`` in every caption format enabled on ``options``.

    SAMI, DFXP, SRT and transcript output is printed. WebVTT output is
    written to ``<cwd>/captions/<filename>_<lang>.vtt`` instead.

    Args:
        content: Caption data passed unchanged to each writer's ``write``.
        options: Object exposing the flags ``sami``, ``dfxp``, ``webvtt``,
            ``srt`` and ``transcript``.
        lang: Language passed to the WebVTT writer and used in the file name.
        filename: Base name of the WebVTT output file.
    """
    # print(expr) with one parenthesised argument behaves exactly like the
    # Python 2 statement ``print expr`` and also parses on Python 3.
    # NOTE(review): the encoded bytes are still printed and written to a
    # text-mode file, which only works as intended on Python 2.
    if options.sami:
        print(pycaption.SAMIWriter().write(content).encode("utf-8"))
    if options.dfxp:
        print(pycaption.DFXPWriter().write(content).encode("utf-8"))
    if options.webvtt:
        # Save vtt files into the captions folder of the working directory.
        location = os.getcwd()
        vtt_path = location + '/captions/' + filename + '_' + lang + '.vtt'
        # The context manager closes (and flushes) the file even when the
        # writer raises.
        with open(vtt_path, 'w') as f:
            f.write(
                pycaption.WebVTTWriter().write(content, lang).encode("utf-8"))
    if options.srt:
        print(pycaption.SRTWriter().write(content).encode("utf-8"))
    if options.transcript:
        print(pycaption.TranscriptWriter().write(content).encode("utf-8"))
def convert_subs(subtitle_path: Path):
    """Convert a subtitle file to SRT with pycaption, then process it.

    The SRT text is written next to the source file, using the same name
    with an ``.srt`` suffix, and that file is handed to
    ``srt_to_timestamps``.

    Args:
        subtitle_path -- path of subtitles to convert

    Return True if successful, False when pycaption cannot detect the
    subtitle format.
    """
    with open(subtitle_path, encoding='utf-8') as source:
        raw_text = source.read()

    reader_cls = pycaption.detect_format(raw_text)
    if not reader_cls:
        return False

    reader = reader_cls()
    writer = pycaption.SRTWriter()
    srt_text = writer.write(reader.read(raw_text))

    srt_path = subtitle_path.with_suffix('.srt')
    with open(srt_path, 'w', encoding='utf-8') as target:
        target.write(srt_text)

    srt_to_timestamps(subtitle_path.with_suffix('.srt'))
    return True
def run(self):
    """Convert every file in ``self.input_files`` to ``self.output_type``.

    Each result is written to ``self.output_folder`` under the input's base
    name with the output type as extension. Existing outputs are skipped
    unless ``self.overwrite_on`` is set. Progress messages are sent through
    ``self.log_signal.emit``.

    The input is read and converted before the output file is opened, so a
    failure while reading or converting leaves an existing output intact.
    """
    for input_file in self.input_files:
        input_file_name = os.path.basename(input_file)
        # Extension without the leading dot, lower-cased.
        input_type = os.path.splitext(input_file)[1].lower()[1:]
        output_file = os.path.join(
            self.output_folder,
            os.path.splitext(input_file_name)[0] + '.' + self.output_type)
        output_file_name = os.path.basename(output_file)

        if os.path.exists(output_file) and not self.overwrite_on:
            self.log_signal.emit("{}을 건너뜁니다...".format(input_file_name))
            continue

        self.log_signal.emit("{}을 읽습니다...".format(input_file_name))

        # Detect the encoding from the raw bytes, then re-read as text.
        with open(input_file, 'rb') as file_in:
            encoding = chardet.detect(file_in.read())['encoding']
        with open(input_file, 'r', encoding=encoding) as file_in:
            content = file_in.read()

        # Stays None for any input type other than "smi" and "srt".
        caption_set = None
        if input_type == "smi":
            caption_set = pycaption.SAMIReader().read(content)
        elif input_type == "srt":
            caption_set = pycaption.SRTReader().read(content)

        if self.output_type == "ats":
            # AtsWriter output is written in binary mode; the handle is
            # opened once and closed by the context manager.
            payload = AtsWriter().write(caption_set)
            with open(output_file, 'wb') as file_out:
                file_out.write(payload)
        else:
            text = None
            if self.output_type == "smi":
                text = pycaption.SAMIWriter().write(caption_set)
            elif self.output_type == "srt":
                text = pycaption.SRTWriter().write(caption_set)
            elif self.output_type == "txt":
                text = TextWriter().write(caption_set)
            # An unrecognised output type still produces an empty file.
            with open(output_file, 'w',
                      encoding=self.output_encoding) as file_out:
                if text is not None:
                    file_out.write(text)

        self.log_signal.emit("{}으로 변환했습니다".format(output_file_name))
def download_from_ism(self, url, output_name, output_format):
    """Download the TTML text tracks listed in an ISM manifest.

    The manifest is fetched from ``{url}/manifest``. For every stream of
    type ``text`` the segments are downloaded, merged into one TTML
    document and written to ``self.output_dir``.

    Args:
        url: Base URL of the stream (without ``/manifest``).
        output_name: Base name for the output files; spaces become dots.
        output_format: ``'ttml'`` writes the merged XML, ``'srt'`` converts
            it with pycaption first. Any other value leaves an empty file.

    Exits the process with status 1 when a text stream is not TTML, and
    returns early (without writing that track) when a cue ends before it
    begins.
    """
    r = self.session.get(f'{url}/manifest')
    manifest = xmltodict.parse(r.content, force_list={'StreamIndex', 'c'})
    self.logger.debug(json.dumps(manifest, indent=4))

    for (index, stream) in enumerate(
            manifest['SmoothStreamingMedia']['StreamIndex']):
        if stream['@Type'] != 'text':
            continue

        lang = stream['@Language'].lower()
        fmt = stream['QualityLevel']['@FourCC'].upper()
        if fmt != 'TTML':
            self.logger.error(
                f'Stream has unsupported subtitle format: {fmt!r}')
            sys.exit(1)

        # NOTE(review): presumably two non-text streams precede the text
        # tracks in the manifest -- confirm.
        index -= 2
        output = f'{output_name.replace(" ", ".")}.{lang}.{index}.srt'
        output = pathvalidate.sanitize_filename(output)
        output = os.path.join(self.output_dir, output)
        self.logger.info(f'Saving subtitle track #{index} to {output}')

        path = stream['@Url'].replace('{bitrate}',
                                      stream['QualityLevel']['@Bitrate'])

        # Build the list of segment start times from the <c> entries.
        t = 0
        ts = []
        for c in stream['c']:
            if c.get('@t'):
                t = int(c['@t'])
                ts.append(t)

            if not c.get('@d'):
                # Stream only has a single segment
                break

            # xmltodict yields attribute values as strings, so the repeat
            # count must be converted before it is given to range().
            for _ in range(int(c.get('@r', 1))):
                t += int(c['@d'])
                ts.append(t)

        ts = ts[:-1]  # Remove nonexistent last segment

        xml = None
        for (i, t) in enumerate(ts):
            self.logger.debug(f'Downloading segment {i + 1} of {len(ts)}')
            seg_url = f'{url}/{path.replace("{start time}", str(t))}'
            seg = self.session.get(seg_url).content

            if not seg:
                # Empty segment
                continue

            data = self.ismt_to_ttml(seg).decode('utf-8')

            # <br/> tags are swapped for a placeholder before parsing and
            # restored after the merged document is serialised.
            assert '{{BR}}' not in data, 'input data contains br placeholder'
            data = re.sub(r'<br ?/>', '{{BR}}', data)

            xml_seg = xmltodict.parse(
                data,
                force_list={'p'},
                process_namespaces=True,
                namespaces={
                    'http://www.w3.org/XML/1998/namespace': None,
                    'http://www.w3.org/2006/10/ttaf1': None,
                    'http://www.w3.org/2006/10/ttaf1#metadata': None,
                    'http://www.w3.org/2006/10/ttaf1#styling': None,
                },
            )

            if i == 0:
                # The first segment becomes the document that later
                # segments are merged into.
                # NOTE(review): if segment 0 is empty, ``xml`` and ``fps``
                # are never initialised for the following segments.
                xml = xml_seg

                fps_base = xml['tt'].get('@ttp:frameRate')
                fps_mult = xml['tt'].get('@ttp:frameRateMultiplier')

                if xml['tt']['body']['div'] is None:
                    xml['tt']['body']['div'] = {'p': []}

                if fps_base:
                    if fps_mult:
                        mult = [int(x) for x in fps_mult.split(' ')]
                        mult = truediv(*mult)
                    else:
                        mult = 1

                    # Both attributes arrive as strings: convert the base
                    # rate and scale it by the parsed multiplier.
                    fps = int(fps_base) * mult
                else:
                    fps = 30  # Per TTML spec
            else:
                div = xml_seg['tt']['body']['div']

                if div is None:
                    # Empty subtitle file
                    continue

                subs = div['p']

                scale = int(stream['@TimeScale'])
                offset = t / scale

                # Rewrite h:m:s:frames timecodes as absolute seconds,
                # shifted by the segment's start time.
                for p in subs:
                    for a in ('@begin', '@end'):
                        tc = p[a]
                        (h, m, s, f) = [int(x) for x in tc.split(':')]
                        total = round(
                            h * 3600 + m * 60 + s + f / fps + offset, 3)
                        p[a] = f'{total}s'

                    begin = float(p['@begin'][:-1])
                    end = float(p['@end'][:-1])

                    if end < begin:
                        self.logger.error(
                            f'End time is earlier than start time ({end} < {begin})',
                        )
                        return

                xml['tt']['body']['div']['p'].extend(subs)

        xml_data = xmltodict.unparse(xml)
        xml_data = xml_data.replace('{{BR}}', '<br />')

        os.makedirs(self.output_dir, exist_ok=True)

        with open(output, 'wb') as fd:
            if output_format == 'ttml':
                fd.write(xml_data.encode('utf-8-sig'))
            elif output_format == 'srt':
                self.logger.debug('Converting to SRT')
                r = pycaption.DFXPReader().read(xml_data)
                w = pycaption.SRTWriter().write(r)
                fd.write(w.encode('utf-8-sig'))