def generate_srt_from_sjson(sjson_subs):
    """
    Convert an `sjson` transcript into SubRip (*.srt) text.

    Arguments:
        sjson_subs (dict): `sjson` subs with parallel 'start', 'end' and
            'text' sequences; start/end values are handed to SubRipTime as
            milliseconds.

    Returns:
        Subtitles in SRT format, or an empty string when the three
        sequences differ in length.
    """
    starts = sjson_subs['start']
    ends = sjson_subs['end']
    texts = sjson_subs['text']

    # Columns of different length cannot be paired into cues.
    if not (len(starts) == len(ends) == len(texts)):
        return ''

    chunks = []
    for position, (begin, finish, caption) in enumerate(zip(starts, ends, texts)):
        cue = SubRipItem(
            index=position,
            start=SubRipTime(milliseconds=begin),
            end=SubRipTime(milliseconds=finish),
            text=caption,
        )
        # Each serialized cue is followed by one extra newline.
        chunks.append(str(cue) + '\n')
    return ''.join(chunks)
def generate_srt_from_sjson(sjson_subs, speed):
    """Generate SubRip (*.srt) transcripts with speed = 1.0 from sjson.

    :param sjson_subs: "sjson" subs.
    :param speed: speed of `sjson_subs`.
    :returns: "srt" subs, or an empty string when the 'start', 'end' and
        'text' lists of `sjson_subs` differ in length.
    """
    lengths_match = (
        len(sjson_subs['start'])
        == len(sjson_subs['end'])
        == len(sjson_subs['text'])
    )
    if not lengths_match:
        return ''

    # Re-time the subs from `speed` to speed 1 before serializing.
    normalized = generate_subs(speed, 1, sjson_subs)

    pieces = []
    for position, begin in enumerate(normalized['start']):
        cue = SubRipItem(
            index=position,
            start=SubRipTime(milliseconds=begin),
            end=SubRipTime(milliseconds=normalized['end'][position]),
            text=normalized['text'][position])
        pieces.append(unicode(cue))
        pieces.append('\n')
    return ''.join(pieces)
def tick():
    """Fire the light event for the subtitle in the current playback window.

    Reads the player's time once, looks in `subs` for a subtitle between that
    time and one TICK_TIME later (the search starts at `last_played`), and when
    one is found calls `trigger_light` with it and stores its index in the
    module-level `last_played`.
    """
    # Only `last_played` is rebound here; the other module globals are read-only.
    global last_played

    wall_clock = perf_counter()
    position = player.get_position()

    # Sample the player clock a single time so that both window bounds (and
    # the debug output) are derived from the same instant.
    play_seconds = player.get_time() / 1000.0
    pt = SubRipTime(seconds=play_seconds)
    ptd = SubRipTime(seconds=play_seconds + 1 * TICK_TIME)

    if DEBUG:
        # The wall-clock conversions are only needed for this trace line.
        ts = SubRipTime(seconds=wall_clock)
        tsd = SubRipTime(seconds=wall_clock + 1 * TICK_TIME)
        print('Time: %s | %s | %s - %s | %s - %s | %s | %s' % (
            datetime.now(), wall_clock, ts, tsd, pt, ptd, position, play_seconds))
        print('Finding subtitle starting at %s and ending at %s' % (pt, ptd))

    sub, i = find_subtitle(subs, pt, ptd, lo=last_played)
    if DEBUG:
        print('Result of find_subtitle: ', i)

    # find_subtitle signals "nothing found" with an empty string.
    if sub != "":
        print("Light event:", i, sub)
        trigger_light(sub)
        last_played = i
def triggerPreviousEvent(self, pos):
    """Find the lighting event for position `pos` by searching backwards and trigger it.

    Builds a window from `pos` to `pos + TICK_TIME`, asks `find_subtitle` to
    search `self.subs` with `backwards=True`, and when a subtitle is returned
    calls `trigger_light` with it and records its index in `self.last_played`.

    :param pos: playback position, passed to SubRipTime as seconds.
    """
    if LIGHTING_MSGS:
        print("Finding last lighting command from pos: ", pos)

    pt = SubRipTime(seconds=pos)
    ptd = SubRipTime(seconds=(pos + 1 * TICK_TIME))
    if VERBOSE and DEBUG:
        print("Finding last light event, starting from: ")
        # Print the window start here; this line used to print `ptd` twice.
        print("pt: ", pt)
        print("ptd: ", ptd)

    sub, i = self.find_subtitle(self.subs, pt, ptd, backwards=True)
    if LIGHTING_MSGS:
        print("Seeking, found sub:", sub, " at pos: ", i)

    # find_subtitle signals "nothing found" with an empty string.
    if sub != "":
        if LIGHTING_MSGS and DEBUG:
            print(i, "Found last lighting event!:", sub)
        self.trigger_light(sub)
        self.last_played = i
        if DEBUG:
            print('last_played: ', i)
def tick():
    """Fire the hue light event for the subtitle in the current playback window.

    Reads the player's time once, prints a timing trace, looks in `subs` for a
    subtitle between that time and one TICK_TIME later, and when the result has
    an index greater than `last_played` calls `trigger_light_hue` with it and
    stores the index in the module-level `last_played`.
    """
    # Only `last_played` is rebound here; the other module globals are read-only.
    global last_played

    wall_clock = perf_counter()
    ts = SubRipTime(seconds=wall_clock)
    tsd = SubRipTime(seconds=wall_clock + 1 * TICK_TIME)

    position = player.get_position()
    # Sample the player clock a single time so that both window bounds (and
    # the trace line) are derived from the same instant.
    play_seconds = player.get_time() / 1000.0
    pt = SubRipTime(seconds=play_seconds)
    ptd = SubRipTime(seconds=play_seconds + 1 * TICK_TIME)

    print('Time: %s | %s | %s - %s | %s - %s | %s | %s' % (
        datetime.now(), wall_clock, ts, tsd, pt, ptd, position, play_seconds))

    sub, i = find_subtitle(subs, pt, ptd)
    print("Subtitle:", sub, i)

    # Skip empty results and anything at or before the last triggered index.
    if sub != "" and i > last_played:
        trigger_light_hue(sub)
        last_played = i
def merge_srt(chn_file, eng_file, output_file):
    """Merge two SRT files and write the result to `output_file`.

    Both inputs are opened with SubRipFile and combined by `merge_subtitle`
    using a 500 ms SubRipTime delta. An existing file at `output_file` is
    removed before the merged subtitles are saved as UTF-8.

    :param chn_file: path of the first subtitle file.
    :param eng_file: path of the second subtitle file.
    :param output_file: destination path for the merged subtitles.
    """
    tolerance = SubRipTime(milliseconds=500)
    merged = merge_subtitle(
        SubRipFile.open(chn_file),
        SubRipFile.open(eng_file),
        tolerance,
    )

    # Drop any previous output before writing the new one.
    if os.path.isfile(output_file):
        os.remove(output_file)
    merged.save(output_file, encoding='utf8')
def offset(self):
    """Return `self.media.offset` converted to a SubRipTime."""
    d = self.media.offset
    # NOTE(review): assumes `d` is a datetime.timedelta (it is read through
    # .seconds/.microseconds) - confirm. A timedelta keeps whole days in
    # `.days` and normalises negative values to a negative day count plus a
    # positive seconds part, so the days must be folded in or negative and
    # multi-day offsets come out wrong.
    whole_seconds = d.days * 86400 + d.seconds
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    # Floor division keeps the millisecond component an integer.
    return SubRipTime(hours=hours, minutes=minutes, seconds=seconds,
                      milliseconds=d.microseconds // 1000)
def test_from_time(self):
    """SubRipTime, directly and via to_time(), compares correctly with a `time` object."""
    reference = time(1, 2, 3, 4000)

    # First pass compares the SubRipTime itself, second pass its to_time() value.
    for convert in (lambda value: value, lambda value: value.to_time()):
        self.assertEqual(convert(SubRipTime(1, 2, 3, 4)), reference)
        self.assertTrue(convert(SubRipTime(1, 2, 3, 5)) >= reference)
        self.assertTrue(convert(SubRipTime(1, 2, 3, 3)) <= reference)
        self.assertTrue(convert(SubRipTime(1, 2, 3, 0)) != reference)
def test_from_tuple(self):
    """A 4-tuple compares equal to the SubRipTime built from the same components."""
    cases = [
        ((0, 0, 0, 0), SubRipTime()),
        ((0, 0, 0, 1), SubRipTime(milliseconds=1)),
        ((0, 0, 2, 0), SubRipTime(seconds=2)),
        ((0, 3, 0, 0), SubRipTime(minutes=3)),
        ((4, 0, 0, 0), SubRipTime(hours=4)),
        ((1, 2, 3, 4), SubRipTime(1, 2, 3, 4)),
    ]
    for as_tuple, as_time in cases:
        self.assertEqual(as_tuple, as_time)
def test_from_dict(self):
    """A dict of time components compares equal to the matching SubRipTime."""
    cases = [
        (dict(), SubRipTime()),
        (dict(milliseconds=1), SubRipTime(milliseconds=1)),
        (dict(seconds=2), SubRipTime(seconds=2)),
        (dict(minutes=3), SubRipTime(minutes=3)),
        (dict(hours=4), SubRipTime(hours=4)),
        (dict(hours=1, minutes=2, seconds=3, milliseconds=4),
         SubRipTime(1, 2, 3, 4)),
    ]
    for as_dict, as_time in cases:
        self.assertEqual(as_dict, as_time)
def generate_srt(self, text: str):
    """
    Build the .srt file at `self.srt_path` from the given text.

    The text is split on single spaces and handed to `prepare_text`; the
    result is paired with the `(start, end)` stamps from `prepare_timestamps`,
    one subtitle per pair. Each stamp is a colon-separated string whose four
    integer fields are used as hours, minutes, seconds and milliseconds.

    :param text: String with all retrieved text.
    """

    def split_stamp(stamp):
        # "H:M:S:ms" -> [H, M, S, ms] as integers.
        return [int(piece) for piece in stamp.split(':')]

    def as_srt_time(fields):
        return SubRipTime(hours=fields[0],
                          minutes=fields[1],
                          seconds=fields[2],
                          milliseconds=fields[3])

    self.create_subs_path()
    subs = open_srt(self.srt_path)

    texts = self.prepare_text(text.split(" "))
    timestamps = self.prepare_timestamps(texts)

    for position, (sentence, (begin, finish)) in enumerate(zip(texts, timestamps)):
        begin_fields = split_stamp(begin)
        finish_fields = split_stamp(finish)

        entry = SubRipItem(index=position)
        entry.text = sentence
        entry.start = as_srt_time(begin_fields)
        entry.end = as_srt_time(finish_fields)
        subs.append(entry)

    # Persist the collected subtitles.
    subs.save(self.srt_path, encoding='utf-8')
    logging.info(f"Generated subtitles are saved in {self.srt_path}")
def get_captions(client_name, clip_id): h = httplib2.Http() g_url = 'http://%s/JSON.php?clip_id=%s' % ( client_name, clip_id) print "Fetching URL: %s" % g_url response, j = h.request(g_url) dirname = os.getcwd() + "/data/granicus/srt/%s/" % client_name filename = dirname + "%s.srt" % clip_id subs = SubRipFile() if response.get('status') == '200': captions = [] try: j = json.loads(j, strict=False)[0] except ValueError: ts = re.sub('([{,]\s+)([a-z]+)(: ")', lambda s: '%s"%s"%s' % (s.groups()[0], s.groups()[1], s.groups()[2]), j).replace("\\", "") try: j = json.loads(ts, strict=False)[0] except UnicodeDecodeError: ts = unicode(ts, errors='ignore') j = json.loads(ts, strict=False)[0] except: j = False sub_count = 0 for item in j: if item["type"] == "text": cap = item["text"] offset = round(float(item["time"]), 3) captions.append({'time': offset, 'text': cap}) end = get_cap_end(j, sub_count) if end: subtitle = SubRipItem(index=sub_count, start=SubRipTime(seconds=offset), end=SubRipTime(seconds=end), text=cap) subs.append(subtitle) sub_count = sub_count + 1 try: subs.save(path=filename, encoding="utf-8") except IOError: p = subprocess.Popen('mkdir -p %s' % dirname, shell=True, stdout=subprocess.PIPE) t = p.wait() subs.save(path=filename, encoding="utf-8") s3_url = push_to_s3(filename, '%s/%s.srt' % (client_name, clip_id)) return (captions, s3_url) else: return ([], '')
srt = SubRipFile()

# Materialise every DisplaySet up front; tqdm reports progress while loading.
print("Loading DisplaySets...")
allsets = list(tqdm(pgs.iter_displaysets()))
print(f"Running OCR on {len(allsets)} DisplaySets and building SRT file...")

subText = ""
subStart = 0
subIndex = 0
for ds in tqdm(allsets):
    if not ds.has_image:
        # A set without an image closes the subtitle opened by the last image set.
        startTime = SubRipTime(milliseconds=int(subStart))
        endTime = SubRipTime(
            milliseconds=int(ds.end[0].presentation_timestamp))
        # NOTE(review): the OCR step is disabled, so every entry carries the
        # literal placeholder text below rather than the `subText` variable.
        srt.append(SubRipItem(subIndex, startTime, endTime, "subText"))
        subIndex += 1
        continue

    # Palette Display Segment (only needed by the disabled image/OCR step).
    pds = ds.pds[0]
    # Object Display Segment: supplies the subtitle's start timestamp.
    ods = ds.ods[0]
    # img = make_image(ods, pds)
    # subText = pytesseract.image_to_string(img)
    subStart = ods.presentation_timestamp

# NOTE(review): this message is printed before the file is actually written.
print(f"Done. SRT file saved as {srtFile}")
srt.save(srtFile, encoding='utf-8')
import sys import os from pysrt import SubRipFile # https://github.com/byroot/pysrt from pysrt import SubRipItem from pysrt import SubRipTime from textAnalyse import analyzeSubLevel from fixEncoding import makeFileUtf8Bom from syncSrts import syncSrts delta = SubRipTime(milliseconds=500) encoding = "utf_8" this = sys.modules[__name__] this.L1_sub_template = "{}" this.L2_sub_template = "{}" level_criterias = { '1': { 'max_CEFR_level': 'A1', # lines with CEFR level > this will not be hidden 'max_flesh_kincade_grade': 4, # lines with fk grade > this will not be hidden 'max_characters': 30, # lines with more characters than this will never be hidden 'max_words': 8, # lines with more words than this will never be hidden }, '2': {
def merge_video_subtitle(video_id):
    """
    Convert the Chinese and English VTT subtitles of a video to SRT, then
    merge them into a single SRT subtitle file.

    :param video_id: primary key of the Video to process.
    :return: path of the merged subtitle file, or False when either the
        Chinese or the English subtitle is missing.
    """
    video = Video.objects.get(pk=video_id)

    # Both source subtitles are required.
    if not ((video.subtitle_cn != '') and (video.subtitle_en != '')):
        return False

    # Default settings.
    delta = SubRipTime(milliseconds=500)
    encoding = "utf_8"

    # Some YouTube titles contain non-ASCII characters, or characters such as
    # "/" that cannot appear in a file name, so the title is passed through
    # Django's get_valid_filename(). Note that this differs from the names
    # produced by youtube-dl's restrictfilenames option, so the merged file
    # name may not match the names stored in subtitle_cn / subtitle_en.
    # A "." in the title is still kept.
    safe_title = get_valid_filename(video.title)

    # Convert the VTT subtitles to SRT.
    # (Converting with convert_file() did not work, hence the helper below.)
    subs_cn_srt_path = os.path.join(
        YOUTUBE_DOWNLOAD_DIR, '%s-%s.cn.srt' % (safe_title, video.video_id))
    convert_subtilte_format(srt_file=video.subtitle_cn.path,
                            ass_file=subs_cn_srt_path)

    subs_en_srt_path = os.path.join(
        YOUTUBE_DOWNLOAD_DIR, '%s-%s.en.srt' % (safe_title, video.video_id))
    # NOTE(review): unlike the Chinese branch, the English path is replaced by
    # the converter's return value - confirm it returns the output path.
    subs_en_srt_path = convert_subtilte_format(
        srt_file=video.subtitle_en.path, ass_file=subs_en_srt_path)

    merge_subs = merge_subtitle(
        SubRipFile.open(subs_cn_srt_path, encoding=encoding),
        SubRipFile.open(subs_en_srt_path, encoding=encoding),
        delta)

    merge_subs_path = os.path.join(
        YOUTUBE_DOWNLOAD_DIR,
        '%s-%s.zh-Hans.en.srt' % (safe_title, video.video_id))
    merge_subs.save(merge_subs_path, encoding=encoding)

    video.subtitle_merge = merge_subs_path
    video.save(update_fields=['subtitle_merge'])
    return merge_subs_path
def test_negative_serialization(self):
    """A time built with a negative hour component serializes as all zeros."""
    negative = SubRipTime(-1, 2, 3, 4)
    self.assertEqual('00:00:00,000', str(negative))
def tick(self):
    """Run one lighting tick: trigger any due light event and feed the DMX output.

    Reads the player position, looks in `self.subs` for a subtitle between
    that position and one TICK_TIME later (the search starts at
    `self.last_played`) and, when one is found, calls `trigger_light` with it
    and records its index. Afterwards, while the DMX interpolator is running
    and MENU_DMX_VAL is None, the interpolated frame for the current position
    is written to `self.dmx`.

    If the player position cannot be read, the error is logged, `__del__` is
    invoked to shut the lighting down, and the tick ends.
    """
    # Leaving the commented-out lines below in for Francesco, they could be
    # part of a mysterious but useful debug strategy. The wall-clock values
    # are only needed by the commented-out trace, so they are disabled with it.
    # print(subs[0])
    # t = perf_counter()
    # ts = SubRipTime(seconds=t)
    # tsd = SubRipTime(seconds=t + (1 * TICK_TIME))
    try:
        pp = self.player.getPosition()
    except Exception as e:
        print(
            "Could not get the current position of the player, shutting down lighting gracefully..."
        )
        logging.error(e)
        self.__del__()
        # Without a position there is nothing to look up. Execution used to
        # continue here and fail on the unbound `pp`.
        return

    pt = SubRipTime(seconds=pp)
    ptd = SubRipTime(seconds=(pp + 1 * TICK_TIME))
    # print('Time: %s | %s | %s | %s | %s | %s | %s ' % (datetime.now(),t,ts,tsd,pp,pt,ptd))

    sub, i = self.find_subtitle(self.subs, pt, ptd, lo=self.last_played)
    if DEBUG:
        print(i, "Found Subtitle for light event:", sub, i)

    # find_subtitle signals "nothing found" with an empty string.
    if sub != "":
        if LIGHTING_MSGS and DEBUG:
            print(i, "Light event:", sub)
        self.trigger_light(sub)
        self.last_played = i
        if DEBUG:
            print('last_played: ', i)

    pod_mode = MENU_DMX_VAL is not None
    if self.dmx_interpolator.isRunning() and not pod_mode:
        if self.PLAY_DMX and self.dmx is not None:
            iFrame = self.dmx_interpolator.getInterpolatedFrame(pt)
            self.dmx.write_frame(iFrame)
def setUp(self):
    """Give each test a SubRipTime built with no arguments."""
    default_time = SubRipTime()
    self.time = default_time
def test_mul(self):
    """Multiplying the fixture time by an integer or a fractional factor scales it."""
    doubled = self.time * 2
    self.assertEqual(doubled, SubRipTime(2, 4, 6, 8))

    halved = self.time * 0.5
    self.assertEqual(halved, (0, 31, 1, 502))
def setUp(self):
    """Give each test a SubRipTime fixture built from the components (1, 2, 3, 4)."""
    components = (1, 2, 3, 4)
    self.time = SubRipTime(*components)
def test_from_ordinal(self):
    """The ordinal 3600000 and a one-hour SubRipTime compare equal in both directions."""
    one_hour_ordinal = 3600000
    self.assertEqual(SubRipTime.from_ordinal(one_hour_ordinal), {'hours': 1})
    self.assertEqual(SubRipTime(1), one_hour_ordinal)
def test_parsing(self):
    """Each known time string compares equal to the SubRipTime built from its components."""
    for expected_text, components in self.KNOWN_VALUES:
        built = SubRipTime(*components)
        self.assertEqual(expected_text, built)
def test_negative_serialization(self):
    """A time built with a negative hour component serializes to an all-zero unicode string."""
    negative = SubRipTime(-1, 2, 3, 4)
    serialized = unicode(negative)
    self.assertEqual(u'00:00:00,000', serialized)
def test_serialization(self):
    """str() of a SubRipTime built from known components yields the known time string."""
    for expected_text, components in self.KNOWN_VALUES:
        serialized = str(SubRipTime(*components))
        self.assertEqual(expected_text, serialized)
print " --delta=<milliseconds> default: 500" print " -e <encoding> Encoding of input and output files." print " --encoding=<encoding> default: utf_8" def main(): try: opts, args = getopt.getopt(sys.argv[1:], 'hd:e:', ["help", "encoding=", "delta="]) except getopt.GetoptError, err: print str(err) usage() sys.exit(2) #Settings default values delta = SubRipTime(milliseconds=500) encoding = "utf_8" #- if len(args) <> 3: usage() sys.exit(2) for o, a in opts: if o in ("-d", "--delta"): delta = SubRipTime(milliseconds=int(a)) elif o in ("-e", "--encoding"): encoding = a elif o in ("-h", "--help"): usage() sys.exit()