def main():
    """Write the closed captions found in one PID of an MPEG TS file to an .ass file.

    Command-line arguments (read from ``sys.argv`` by argparse):
        infile: path of the MPEG-2 Transport Stream file to read.
        pid:    PID of the closed caption elementary stream to extract.

    Captions are handed to an ``ASSFormatter`` bound to ``<infile>_ENG.ass``.
    If ``infile`` does not exist a message is printed and ``sys.exit(-1)`` is
    called.

    NOTE(review): the parser description says "Auto translate", but no
    translation step is visible in this function -- presumably it happens
    inside ASSFormatter/ASSFile; confirm.
    """
    # Local import: sys.exit() is needed below and this block cannot rely on
    # the file's import section providing ``sys``.
    import sys

    parser = argparse.ArgumentParser(description='Auto translate jp CCs in MPEG TS file.')
    parser.add_argument('infile', help='Input filename (MPEG2 Transport Stream File)', type=str)
    parser.add_argument('pid', help='Pid of closed caption ES to extract from stream.', type=int)
    #parser.add_argument('-k', '--secret_key', help='Windows secret key for bing translate API.', type=str, default='')
    args = parser.parse_args()

    pid = args.pid
    infilename = args.infile
    if not os.path.exists(infilename):
        # print() with a single argument behaves the same on Python 2 and 3.
        print('Please provide input Transport Stream file.')
        # The os module has no exit(); the previous os.exit(-1) raised
        # AttributeError instead of terminating cleanly.
        sys.exit(-1)

    #open an Ass file and formatter
    ass_file = ASSFile(infilename + '_ENG.ass')
    ass = ASSFormatter(ass_file)

    #CC data is not, in itself timestamped, so we've got to use packet info
    #to reconstruct the timing of the closed captions (i.e. how many seconds into
    #the file are they shown?)
    initial_timestamp = 0
    pes = []
    elapsed_time_s = 0

    for packet in next_ts_packet(infilename):
        #always process timestamp info, regardless of PID
        adaptation_field = packet.adapatation_field()
        pcr = adaptation_field.PCR() if adaptation_field else None
        if pcr:
            # The first PCR seen becomes time zero for the whole file.
            initial_timestamp = initial_timestamp or pcr
            # The PCR delta is divided by 90000.0 to obtain seconds.
            elapsed_time_s = float(pcr - initial_timestamp) / 90000.0

        #if this is the stream PID we're interested in, reconstruct the ES
        if packet.pid() != pid:
            continue

        # A payload-start packet begins a fresh PES buffer; any other packet
        # is appended to the buffer being assembled.
        if packet.payload_start():
            pes = copy.deepcopy(packet.payload())
        else:
            pes.extend(packet.payload())
        pes_packet = PESPacket(pes)

        #if our packet is fully formed (payload all present) we can parse its contents
        if pes_packet.length() != (pes_packet.header_size() + pes_packet.payload_size()):
            continue

        data_group = DataGroup(pes_packet.payload())
        if data_group.is_management_data():
            continue

        #We now have a Data Group that contains caption data.
        #Its payload is further divided into 'Data Unit' structures.
        for data_unit in next_data_unit(data_group.payload()):
            #we're only interested in those Data Units which are "statement body" to get CC data.
            unit_payload = data_unit.payload()
            if not isinstance(unit_payload, StatementBody):
                continue
            ass.format(unit_payload.payload(), elapsed_time_s)
def OnESPacket(current_pid, packet, header_size):
    """
    Callback invoked on the successful extraction of an Elementary Stream packet from the
    Transport Stream file packets.

    Caption text produced by ``formatter`` is printed (UTF-8 encoded) when VERBOSE is set.
    While the module-level ``pid`` is negative every PID is examined; the first PID that
    yields a statement-body data unit is stored in ``pid`` and all others are ignored
    from then on.

    :param current_pid: The TS Program ID for the TS packets this info originated from
    :param packet: The ENTIRE ES packet, header and payload-- which may have been assembled
      from multiple TS packet payloads.
    :param header_size: Size of the header in bytes (characters in the string). Provided to
      more easily separate the packet into header and payload. Not used by this callback.
    :return: None
    """
    # Only ``pid`` is rebound here; VERBOSE, SILENT and elapsed_time_s are
    # module-level values that are merely read.
    global pid

    if pid >= 0 and current_pid != pid:
        return

    try:
        payload = ES.get_pes_payload(packet)
        data_group = DataGroup(list(payload))
        if data_group.is_management_data():
            return

        #We now have a Data Group that contains caption data.
        #Its payload is further divided into 'Data Unit' structures.
        for data_unit in next_data_unit(data_group.payload()):
            #we're only interested in those Data Units which are "statement body" to get CC data.
            unit_payload = data_unit.payload()
            if not isinstance(unit_payload, StatementBody):
                continue

            if pid < 0:
                # Lock onto this PID no matter what the logging flags are.
                # Previously the assignment sat behind VERBOSE/SILENT, so a
                # non-verbose run never stopped scanning the other PIDs.
                pid = current_pid
                if VERBOSE and not SILENT:
                    print("Found Closed Caption data in PID: " + str(pid))
                    print("Will now only process this PID to improve performance.")

            #Finally we've got a data unit with CC data. Feed its payload to the formatter.
            cc = formatter(unit_payload.payload(), elapsed_time_s)
            if cc and VERBOSE:
                #according to best practice, always deal internally with UNICODE, and encode to
                #your encoding of choice as late as possible. Here, i'm encoding as UTF-8 for
                #my command line.
                #DECODE EARLY, ENCODE LATE
                print(cc.encode('utf-8'))
    except EOFError:
        # Deliberately ignored.
        # NOTE(review): presumably raised by the parsers on truncated data -- confirm.
        pass
    except Exception:
        # Reported only once a caption PID is known (pid >= 0).
        if VERBOSE and not SILENT and pid >= 0:
            print("Exception thrown while handling DataGroup in ES. This may be due to many factors "
                  + "such as file corruption or the .ts file using as yet unsupported features.")
            traceback.print_exc(file=sys.stdout)
def OnESPacket(current_pid, packet, header_size):
    """
    Callback invoked on the successful extraction of an Elementary Stream packet from the
    Transport Stream file packets.

    Caption data groups are fed to the module-level ``ass`` formatter, which is created on
    the first statement-body data unit. Management data groups are used to pick the caption
    PID: while the module-level ``pid`` is negative, the first PID whose management data
    reports at least one language is stored in ``pid`` and all other PIDs are ignored
    from then on.

    :param current_pid: The TS Program ID for the TS packets this info originated from
    :param packet: The ENTIRE ES packet, header and payload-- which may have been assembled
      from multiple TS packet payloads.
    :param header_size: Size of the header in bytes (characters in the string). Provided to
      more easily separate the packet into header and payload. Not used by this callback.
    :return: None
    :raises FileOpenError: propagated unchanged so that it can terminate the application.
    """
    # ``pid`` and ``ass`` are rebound below. SILENT, elapsed_time_s, tmax and
    # outfilename are module-level values that are only read.
    global pid
    global ass

    if pid >= 0 and current_pid != pid:
        return

    try:
        payload = ES.get_pes_payload(packet)
        data_group = DataGroup(list(payload))

        if not data_group.is_management_data():
            #We now have a Data Group that contains caption data.
            #Its payload is further divided into 'Data Unit' structures.
            for data_unit in next_data_unit(data_group.payload()):
                #we're only interested in those Data Units which are "statement body" to get CC data.
                unit_payload = data_unit.payload()
                if not isinstance(unit_payload, StatementBody):
                    continue

                # Create the formatter lazily, on the first real caption.
                if not ass:
                    ass = ASSFormatter(tmax=tmax, video_filename=outfilename, verbose=not SILENT)

                ass.format(unit_payload.payload(), elapsed_time_s)

                # This code used to set the PID being scanned on the first
                # successful ARIB decode; the PID is now taken from the ARIB
                # management data in the branch below.
        else:
            # management data
            management_data = data_group.payload()
            numlang = management_data.num_languages()
            if pid < 0 and numlang > 0:
                if not SILENT:
                    for language in range(numlang):
                        print("Closed caption management data for language: "
                              + management_data.language_code(language)
                              + " available in PID: " + str(current_pid))
                        print("Will now only process this PID to improve performance.")
                # The lock-in is independent of SILENT: only the messages are optional.
                pid = current_pid
    except EOFError:
        # Deliberately ignored.
        # NOTE(review): presumably raised by the parsers on truncated data -- confirm.
        pass
    except FileOpenError:
        # allow file-open failures to kill the application; a bare ``raise``
        # keeps the original traceback intact.
        raise
    except Exception:
        # Reported only once a caption PID is known (pid >= 0).
        if not SILENT and pid >= 0:
            print("Exception thrown while handling DataGroup in ES. This may be due to many factors "
                  + "such as file corruption or the .ts file using as yet unsupported features.")
            traceback.print_exc(file=sys.stdout)
def main():
    """Write the closed captions found in one PID of an MPEG TS file to ``<infile>.ass``.

    Command-line arguments (read from ``sys.argv`` by argparse):
        infile: path of the MPEG-2 Transport Stream file to read.
        pid:    PID of the closed caption elementary stream to extract.

    Errors raised while reassembling or decoding a packet are ignored so that
    one bad packet does not abort the run. If ``infile`` does not exist a
    message is printed and ``sys.exit(-1)`` is called.
    """
    # Local import: sys.exit() is needed below and this block cannot rely on
    # the file's import section providing ``sys``.
    import sys

    parser = argparse.ArgumentParser(description='Draw CC Packets from MPG2 Transport Stream file.')
    parser.add_argument('infile', help='Input filename (MPEG2 Transport Stream File)', type=str)
    parser.add_argument('pid', help='Pid of closed caption ES to extract from stream.', type=int)
    args = parser.parse_args()

    pid = args.pid
    infilename = args.infile
    if not os.path.exists(infilename):
        # print() with a single argument behaves the same on Python 2 and 3.
        print('Please provide input Transport Stream file.')
        # The os module has no exit(); the previous os.exit(-1) raised
        # AttributeError instead of terminating cleanly.
        sys.exit(-1)

    #open an Ass file and formatter
    ass_file = ASSFile(infilename + '.ass')
    ass = ASSFormatter(ass_file)

    #CC data is not, in itself timestamped, so we've got to use packet info
    #to reconstruct the timing of the closed captions (i.e. how many seconds into
    #the file are they shown?)
    initial_timestamp = 0
    pes = []
    elapsed_time_s = 0

    for packet in next_ts_packet(infilename):
        #always process timestamp info, regardless of PID
        adaptation_field = packet.adapatation_field()
        pcr = adaptation_field.PCR() if adaptation_field else None
        if pcr:
            # The first PCR seen becomes time zero for the whole file.
            initial_timestamp = initial_timestamp or pcr
            # The PCR delta is divided by 90000.0 to obtain seconds.
            elapsed_time_s = float(pcr - initial_timestamp) / 90000.0

        #if this is the stream PID we're interested in, reconstruct the ES
        if packet.pid() != pid:
            continue

        try:
            # A payload-start packet begins a fresh PES buffer; any other
            # packet is appended to the buffer being assembled.
            if packet.payload_start():
                pes = copy.deepcopy(packet.payload())
            else:
                pes.extend(packet.payload())
            pes_packet = PESPacket(pes)

            #if our packet is fully formed (payload all present) we can parse its contents
            if pes_packet.length() != (pes_packet.header_size() + pes_packet.payload_size()):
                continue

            data_group = DataGroup(pes_packet.payload())
            if data_group.is_management_data():
                continue

            #We now have a Data Group that contains caption data.
            #Its payload is further divided into 'Data Unit' structures.
            for data_unit in next_data_unit(data_group.payload()):
                #we're only interested in those Data Units which are "statement body" to get CC data.
                unit_payload = data_unit.payload()
                if not isinstance(unit_payload, StatementBody):
                    continue
                ass.format(unit_payload.payload(), elapsed_time_s)
        except Exception:
            # Best effort: skip packets that cannot be decoded. Catching
            # Exception rather than using a bare ``except`` lets
            # KeyboardInterrupt and SystemExit still stop the program.
            pass
def main():
    """Extract ARIB closed captions from an MPEG TS file into ``<infile>.ass``.

    Command-line arguments (read from ``sys.argv`` by argparse):
        infile:           path of the MPEG-2 Transport Stream file to read.
        -p/--pid:         PID known to carry closed captions; when omitted (-1)
                          every PID is examined until one yields caption data.
        -v/--verbose:     verbose output (also disables the progress meter).
        -q/--quiet:       suppress informational output on stdout.
        -t/--tmax:        subtitle display time limit in seconds.
        -o/--timeoffset:  offset in seconds added to every caption timestamp.

    The .ass file and its formatter are only created once a caption has
    actually been found. Errors raised while reassembling or decoding a packet
    are ignored so that non-caption PIDs and damaged packets do not abort the
    run. If ``infile`` does not exist a message is printed and
    ``sys.exit(-1)`` is called.
    """
    parser = argparse.ArgumentParser(
        description='Remove ARIB formatted Closed Caption information from an MPEG TS file and format the results as a standard .ass subtitle file.'
    )
    parser.add_argument('infile', help='Input filename (MPEG2 Transport Stream File)', type=str)
    parser.add_argument(
        '-p', '--pid',
        help='Specify a PID of a PES known to contain closed caption info (tool will attempt to find the proper PID if not specified.).',
        type=int,
        default=-1)
    parser.add_argument('-v', '--verbose', help='Verbose output.', action='store_true')
    parser.add_argument('-q', '--quiet', help='Does not write to stdout.', action='store_true')
    parser.add_argument('-t', '--tmax', help='Subtitle display time limit (seconds).', type=int, default=5)
    parser.add_argument(
        '-o', '--timeoffset',
        help='Shift all time values in generated .ass file by indicated floating point offset in seconds.',
        type=float,
        default=0.0)
    args = parser.parse_args()

    pid = args.pid
    infilename = args.infile
    quiet = args.quiet
    verbose = args.verbose
    tmax = args.tmax
    time_offset = args.timeoffset

    if not os.path.exists(infilename):
        # print() with a single argument behaves the same on Python 2 and 3.
        print('Input filename :' + infilename + " does not exist.")
        # The os module has no exit(); the previous os.exit(-1) raised
        # AttributeError instead of terminating cleanly.
        sys.exit(-1)

    # Created lazily: only write a file if Closed Captions are actually found.
    ass_file = None
    ass = None

    #CC data is not, in itself timestamped, so we've got to use packet info
    #to reconstruct the timing of the closed captions (i.e. how many seconds into
    #the file are they shown?)
    initial_timestamp = 0
    elapsed_time_s = 0

    # One reassembly buffer per PID. While the caption PID is still unknown
    # packets from every PID pass through here; a single shared buffer would
    # splice together payloads of unrelated, interleaved streams.
    pes_buffers = {}

    # get filesize for progress meter
    total_filesize = os.path.getsize(infilename)
    read_size = 0
    percent_read = 0
    prev_percent_read = percent_read
    show_progress = not quiet and not verbose

    if show_progress:
        #show initial progress information
        sys.stdout.write("progress: %d%% \r" % (percent_read))
        sys.stdout.flush()

    for packet in next_ts_packet(infilename):
        read_size += TSPacket.PACKET_SIZE_BYTES
        percent_read = (read_size / float(total_filesize)) * 100
        # Compared in hundredths of a percent so the meter is only redrawn
        # when the two displayed decimals can change.
        new_percent_read = int(percent_read * 100)
        if show_progress and new_percent_read != prev_percent_read:
            prev_percent_read = new_percent_read
            sys.stdout.write("progress: %.2f%% \r" % (percent_read))
            sys.stdout.flush()

        #always process timestamp info, regardless of PID
        adaptation_field = packet.adapatation_field()
        pcr = adaptation_field.PCR() if adaptation_field else None
        if pcr:
            # The first PCR seen becomes time zero for the whole file.
            initial_timestamp = initial_timestamp or pcr
            # The PCR delta is divided by 90000.0 to obtain seconds, then
            # shifted by the user-supplied offset.
            elapsed_time_s = float(pcr - initial_timestamp) / 90000.0 + time_offset

        #if this is the stream PID we're interested in (or none is known yet), reconstruct the ES
        current_pid = packet.pid()
        if pid >= 0 and pid != current_pid:
            continue

        try:
            # A payload-start packet begins a fresh PES buffer for its PID;
            # any other packet is appended to that PID's buffer.
            if packet.payload_start():
                pes_buffers[current_pid] = copy.deepcopy(packet.payload())
            else:
                pes_buffers.setdefault(current_pid, []).extend(packet.payload())
            pes_packet = PESPacket(pes_buffers[current_pid])

            #if our packet is fully formed (payload all present) we can parse its contents
            if pes_packet.length() != (pes_packet.header_size() + pes_packet.payload_size()):
                continue

            data_group = DataGroup(pes_packet.payload())
            if data_group.is_management_data():
                continue

            #We now have a Data Group that contains caption data.
            #Its payload is further divided into 'Data Unit' structures.
            for data_unit in next_data_unit(data_group.payload()):
                #we're only interested in those Data Units which are "statement body" to get CC data.
                unit_payload = data_unit.payload()
                if not isinstance(unit_payload, StatementBody):
                    continue

                # only write the file if we've actually found some Closed Captions
                if ass_file is None:
                    ass_file = ASSFile(infilename + '.ass')
                if ass is None:
                    ass = ASSFormatter(ass_file, tmax=tmax, video_filename=infilename)

                ass.format(unit_payload.payload(), elapsed_time_s)

                if pid < 0:
                    # First PID that produced captions: lock onto it and
                    # drop the buffers collected for every other PID.
                    pid = current_pid
                    pes_buffers = {pid: pes_buffers[pid]}
                    if not quiet:
                        print("Found Closed Caption data in PID: " + str(pid))
                        print("Will now only process this PID to improve performance.")
        except Exception:
            # Best effort: non-caption PIDs and damaged packets routinely fail
            # to parse, so they are skipped silently. Catching Exception rather
            # than using a bare ``except`` lets KeyboardInterrupt and
            # SystemExit still stop the program.
            pass

    if (pid < 0 or ass is None) and not quiet:
        print("Did not find any Closed Caption data in the file " + infilename)
def main():
    """Write the closed captions found in one PID of an MPEG TS file to an .ass file.

    Command-line arguments (read from ``sys.argv`` by argparse):
        infile: path of the MPEG-2 Transport Stream file to read.
        pid:    PID of the closed caption elementary stream to extract.

    Captions are handed to an ``ASSFormatter`` bound to ``<infile>_ENG.ass``.
    If ``infile`` does not exist a message is printed and ``sys.exit(-1)`` is
    called.

    NOTE(review): the parser description says "Auto translate", but no
    translation step is visible in this function -- presumably it happens
    inside ASSFormatter/ASSFile; confirm.
    """
    # Local import: sys.exit() is needed below and this block cannot rely on
    # the file's import section providing ``sys``.
    import sys

    parser = argparse.ArgumentParser(
        description='Auto translate jp CCs in MPEG TS file.')
    parser.add_argument('infile',
                        help='Input filename (MPEG2 Transport Stream File)',
                        type=str)
    parser.add_argument(
        'pid',
        help='Pid of closed caption ES to extract from stream.',
        type=int)
    #parser.add_argument('-k', '--secret_key', help='Windows secret key for bing translate API.', type=str, default='')
    args = parser.parse_args()

    pid = args.pid
    infilename = args.infile
    if not os.path.exists(infilename):
        # print() with a single argument behaves the same on Python 2 and 3.
        print('Please provide input Transport Stream file.')
        # The os module has no exit(); the previous os.exit(-1) raised
        # AttributeError instead of terminating cleanly.
        sys.exit(-1)

    #open an Ass file and formatter
    ass_file = ASSFile(infilename + '_ENG.ass')
    ass = ASSFormatter(ass_file)

    #CC data is not, in itself timestamped, so we've got to use packet info
    #to reconstruct the timing of the closed captions (i.e. how many seconds into
    #the file are they shown?)
    initial_timestamp = 0
    pes = []
    elapsed_time_s = 0

    for packet in next_ts_packet(infilename):
        #always process timestamp info, regardless of PID
        adaptation_field = packet.adapatation_field()
        pcr = adaptation_field.PCR() if adaptation_field else None
        if pcr:
            # The first PCR seen becomes time zero for the whole file.
            initial_timestamp = initial_timestamp or pcr
            # The PCR delta is divided by 90000.0 to obtain seconds.
            elapsed_time_s = float(pcr - initial_timestamp) / 90000.0

        #packets belonging to any other PID carry nothing further of interest
        if packet.pid() != pid:
            continue

        # A payload-start packet begins a fresh PES buffer; any other packet
        # is appended to the buffer being assembled.
        if packet.payload_start():
            pes = copy.deepcopy(packet.payload())
        else:
            pes.extend(packet.payload())
        pes_packet = PESPacket(pes)

        #if our packet is fully formed (payload all present) we can parse its contents
        expected_length = pes_packet.header_size() + pes_packet.payload_size()
        if pes_packet.length() != expected_length:
            continue

        data_group = DataGroup(pes_packet.payload())
        if data_group.is_management_data():
            continue

        #We now have a Data Group that contains caption data.
        #Its payload is further divided into 'Data Unit' structures.
        for data_unit in next_data_unit(data_group.payload()):
            #we're only interested in those Data Units which are "statement body" to get CC data.
            unit_payload = data_unit.payload()
            if not isinstance(unit_payload, StatementBody):
                continue
            ass.format(unit_payload.payload(), elapsed_time_s)